diff --git a/.circleci/config.yml b/.circleci/config.yml index 7b839f250dc1..4e6718005ab7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,7 +12,7 @@ jobs: # Ensure running with CircleCI/huggingface check_circleci_user: docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 parallelism: 1 steps: - run: echo $CIRCLE_PROJECT_USERNAME @@ -26,13 +26,13 @@ jobs: fetch_tests: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 parallelism: 1 steps: - checkout - - run: pip install --upgrade pip - - run: pip install GitPython - - run: pip install . + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager GitPython + - run: pip install -U --upgrade-strategy eager . - run: mkdir -p test_preparation - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt - store_artifacts: @@ -43,6 +43,24 @@ jobs: else touch test_preparation/test_list.txt fi + - run: | + if [ -f examples_test_list.txt ]; then + mv examples_test_list.txt test_preparation/examples_test_list.txt + else + touch test_preparation/examples_test_list.txt + fi + - run: | + if [ -f filtered_test_list_cross_tests.txt ]; then + mv filtered_test_list_cross_tests.txt test_preparation/filtered_test_list_cross_tests.txt + else + touch test_preparation/filtered_test_list_cross_tests.txt + fi + - run: | + if [ -f doctest_list.txt ]; then + cp doctest_list.txt test_preparation/doctest_list.txt + else + touch test_preparation/doctest_list.txt + fi - run: | if [ -f test_repo_utils.txt ]; then mv test_repo_utils.txt test_preparation/test_repo_utils.txt @@ -56,15 +74,10 @@ jobs: else touch test_preparation/filtered_test_list.txt fi - - run: python utils/tests_fetcher.py --filters tests examples | tee examples_tests_fetched_summary.txt - - run: | - if [ -f test_list.txt ]; then - mv test_list.txt test_preparation/examples_test_list.txt - else - touch test_preparation/examples_test_list.txt - fi - store_artifacts: path: test_preparation/test_list.txt + - store_artifacts: + path: test_preparation/doctest_list.txt - store_artifacts: path: ~/transformers/test_preparation/filtered_test_list.txt - store_artifacts: @@ -78,6 +91,8 @@ jobs: - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt - store_artifacts: path: test_preparation/generated_config.txt + - store_artifacts: + path: test_preparation/filtered_test_list_cross_tests.txt - continuation/continue: configuration_path: test_preparation/generated_config.yml @@ -85,17 +100,17 @@ jobs: fetch_all_tests: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 parallelism: 1 steps: - checkout - - run: pip install --upgrade pip - - run: pip install GitPython - - run: pip install . + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager GitPython + - run: pip install -U --upgrade-strategy eager . 
- run: | mkdir test_preparation echo -n "tests" > test_preparation/test_list.txt - echo -n "tests" > test_preparation/examples_test_list.txt + echo -n "all" > test_preparation/examples_test_list.txt echo -n "tests/repo_utils" > test_preparation/test_repo_utils.txt - run: | echo -n "tests" > test_list.txt @@ -111,7 +126,7 @@ jobs: check_code_quality: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 resource_class: large environment: TRANSFORMERS_IS_CI: yes @@ -121,31 +136,38 @@ jobs: - checkout - restore_cache: keys: - - v0.5-code_quality-{{ checksum "setup.py" }} - - v0.5-code-quality - - run: pip install --upgrade pip - - run: pip install .[all,quality] + - v0.7-code_quality-pip-{{ checksum "setup.py" }} + - v0.7-code-quality-pip + - restore_cache: + keys: + - v0.7-code_quality-site-packages-{{ checksum "setup.py" }} + - v0.7-code-quality-site-packages + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager .[all,quality] - save_cache: - key: v0.5-code_quality-{{ checksum "setup.py" }} + key: v0.7-code_quality-pip-{{ checksum "setup.py" }} paths: - '~/.cache/pip' + - save_cache: + key: v0.7-code_quality-site-packages-{{ checksum "setup.py" }} + paths: + - '~/.pyenv/versions/' - run: name: Show installed libraries and their versions command: pip freeze | tee installed.txt - store_artifacts: path: ~/transformers/installed.txt - - run: black --check --preview examples tests src utils - - run: isort --check-only examples tests src utils + - run: black --check examples tests src utils + - run: ruff examples tests src utils - run: python utils/custom_init_isort.py --check_only - run: python utils/sort_auto_mappings.py --check_only - - run: flake8 examples tests src utils - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source - run: python utils/check_doc_toc.py check_repository_consistency: working_directory: ~/transformers docker: - - image: cimg/python:3.7.12 + - image: cimg/python:3.8.12 resource_class: large environment: TRANSFORMERS_IS_CI: yes @@ -155,14 +177,22 @@ jobs: - checkout - restore_cache: keys: - - v0.5-repository_consistency-{{ checksum "setup.py" }} - - v0.5-repository_consistency - - run: pip install --upgrade pip - - run: pip install .[all,quality] + - v0.7-repository_consistency-pip-{{ checksum "setup.py" }} + - v0.7-repository_consistency-pip + - restore_cache: + keys: + - v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }} + - v0.7-repository_consistency-site-packages + - run: pip install --upgrade --upgrade-strategy eager pip + - run: pip install -U --upgrade-strategy eager .[all,quality] - save_cache: - key: v0.5-repository_consistency-{{ checksum "setup.py" }} + key: v0.7-repository_consistency-pip-{{ checksum "setup.py" }} paths: - '~/.cache/pip' + - save_cache: + key: v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }} + paths: + - '~/.pyenv/versions/' - run: name: Show installed libraries and their versions command: pip freeze | tee installed.txt @@ -174,9 +204,12 @@ jobs: - run: python utils/check_repo.py - run: python utils/check_inits.py - run: python utils/check_config_docstrings.py + - run: python utils/check_config_attributes.py + - run: python utils/check_doctest_list.py - run: make deps_table_check_updated - - run: python utils/tests_fetcher.py --sanity_check - run: python utils/update_metadata.py --check-only + - run: python utils/check_task_guides.py + - run: python 
utils/check_docstrings.py workflows: version: 2 diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 599691bf1006..45ba5cd10cc5 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -15,16 +15,37 @@ import argparse import copy +import glob import os +import random from dataclasses import dataclass from typing import Any, Dict, List, Optional import yaml -COMMON_ENV_VARIABLES = {"OMP_NUM_THREADS": 1, "TRANSFORMERS_IS_CI": True, "PYTEST_TIMEOUT": 120} -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "s": None} -DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.7.12"}] +COMMON_ENV_VARIABLES = { + "OMP_NUM_THREADS": 1, + "TRANSFORMERS_IS_CI": True, + "PYTEST_TIMEOUT": 120, + "RUN_PIPELINE_TESTS": False, + "RUN_PT_TF_CROSS_TESTS": False, + "RUN_PT_FLAX_CROSS_TESTS": False, +} +# Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile"} +DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] + + +class EmptyJob: + job_name = "empty" + + def to_dict(self): + return { + "working_directory": "~/transformers", + "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), + "steps":["checkout"], + } @dataclass @@ -32,7 +53,7 @@ class CircleCIJob: name: str additional_env: Dict[str, Any] = None cache_name: str = None - cache_version: str = "0.5" + cache_version: str = "0.7" docker_image: List[Dict[str, str]] = None install_steps: List[str] = None marker: Optional[str] = None @@ -42,6 +63,8 @@ class CircleCIJob: resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None working_directory: str = "~/transformers" + # This should be only used for doctest job! + command_timeout: Optional[int] = None def __post_init__(self): # Deal with defaults for mutable attributes. 
@@ -58,12 +81,21 @@ def __post_init__(self): self.pytest_options = {} if isinstance(self.tests_to_run, str): self.tests_to_run = [self.tests_to_run] + if self.parallelism is None: + self.parallelism = 1 def to_dict(self): + env = COMMON_ENV_VARIABLES.copy() + env.update(self.additional_env) + + cache_branch_prefix = os.environ.get("CIRCLE_BRANCH", "pull") + if cache_branch_prefix != "main": + cache_branch_prefix = "pull" + job = { "working_directory": self.working_directory, "docker": self.docker_image, - "environment": {**COMMON_ENV_VARIABLES, **self.additional_env}, + "environment": env, } if self.resource_class is not None: job["resource_class"] = self.resource_class @@ -75,8 +107,21 @@ def to_dict(self): { "restore_cache": { "keys": [ - f"v{self.cache_version}-{self.cache_name}-" + '{{ checksum "setup.py" }}', - f"v{self.cache_version}-{self.cache_name}-", + # check the fully-matched cache first + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}', + # try the partially-matched cache from `main` + f"v{self.cache_version}-{self.cache_name}-main-pip-", + # try the general partially-matched cache + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-", + ] + } + }, + { + "restore_cache": { + "keys": [ + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}', + f"v{self.cache_version}-{self.cache_name}-main-site-packages-", + f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-", ] } }, @@ -85,28 +130,137 @@ def to_dict(self): steps.append( { "save_cache": { - "key": f"v{self.cache_version}-{self.cache_name}-" + '{{ checksum "setup.py" }}', + "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}', "paths": ["~/.cache/pip"], } } ) + steps.append( + { + "save_cache": { + "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}', + "paths": ["~/.pyenv/versions/"], + } + } + ) steps.append({"run": {"name": "Show installed libraries and their versions", "command": "pip freeze | tee installed.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()] + pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) - if self.tests_to_run is None: - test_command += " << pipeline.parameters.tests_to_run >>" + + steps.append({"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}) + + test_command = "" + if self.command_timeout: + test_command = f"timeout {self.command_timeout} " + test_command += f"python -m pytest --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags) + + if self.parallelism == 1: + if self.tests_to_run is None: + test_command += " << pipeline.parameters.tests_to_run >>" + else: + test_command += " " + " ".join(self.tests_to_run) else: - test_command += " " + " ".join(self.tests_to_run) + # We need explicit list instead of `pipeline.parameters.tests_to_run` (only available at job 
runtime) + tests = self.tests_to_run + if tests is None: + folder = os.environ["test_preparation_dir"] + test_file = os.path.join(folder, "filtered_test_list.txt") + if os.path.exists(test_file): + with open(test_file) as f: + tests = f.read().split(" ") + + # expand the test list + if tests == ["tests"]: + tests = [os.path.join("tests", x) for x in os.listdir("tests")] + expanded_tests = [] + for test in tests: + if test.endswith(".py"): + expanded_tests.append(test) + elif test == "tests/models": + expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)]) + elif test == "tests/pipelines": + expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)]) + else: + expanded_tests.append(test) + # Avoid long tests always being collected together + random.shuffle(expanded_tests) + tests = " ".join(expanded_tests) + + # Each executor to run ~10 tests + n_executors = max(len(tests) // 10, 1) + # Avoid empty test list on some executor(s) or launching too many executors + if n_executors > self.parallelism: + n_executors = self.parallelism + job["parallelism"] = n_executors + + # Need to be newline separated for the command `circleci tests split` below + command = f'echo {tests} | tr " " "\\n" >> tests.txt' + steps.append({"run": {"name": "Get tests", "command": command}}) + + command = 'TESTS=$(circleci tests split tests.txt) && echo $TESTS > splitted_tests.txt' + steps.append({"run": {"name": "Split tests", "command": command}}) + + steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) + steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) + + test_command = "" + if self.timeout: + test_command = f"timeout {self.timeout} " + test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" - test_command += " | tee tests_output.txt" + + if self.name == "pr_documentation_tests": + # can't use ` | tee tee tests_output.txt` as usual + test_command += " > tests_output.txt" + # Save the return code, so we can check if it is timeout in the next step. + test_command += '; touch "$?".txt' + # Never fail the test step for the doctest job. We will check the results in the next step, and fail that + # step instead if the actual test failures are found. This is to avoid the timeout being reported as test + # failure. 
+ test_command = f"({test_command}) || true" + else: + test_command += " || true" steps.append({"run": {"name": "Run tests", "command": test_command}}) + + # Deal with errors + check_test_command = f'if [ -s reports/{self.job_name}/errors.txt ]; ' + check_test_command += 'then echo "Some tests errored out!"; echo ""; ' + check_test_command += f'cat reports/{self.job_name}/errors.txt; ' + check_test_command += 'echo ""; echo ""; ' + + py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("ERROR ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()' + check_test_command += f"$(python3 -c '{py_command}'); " + check_test_command += f'cat summary_short.txt; echo ""; exit -1; ' + + # Deeal with failed tests + check_test_command += f'elif [ -s reports/{self.job_name}/failures_short.txt ]; ' + check_test_command += 'then echo "Some tests failed!"; echo ""; ' + check_test_command += f'cat reports/{self.job_name}/failures_short.txt; ' + check_test_command += 'echo ""; echo ""; ' + + py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("FAILED ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()' + check_test_command += f"$(python3 -c '{py_command}'); " + check_test_command += f'cat summary_short.txt; echo ""; exit -1; ' + + check_test_command += f'elif [ -s reports/{self.job_name}/stats.txt ]; then echo "All tests pass!"; ' + + # return code `124` means the previous (pytest run) step is timeout + if self.name == "pr_documentation_tests": + check_test_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; ' + + check_test_command += 'else echo "other fatal error"; echo ""; exit -1; fi;' + + steps.append({"run": {"name": "Check test results", "command": check_test_command}}) + + steps.append({"store_test_results": {"path": "test-results"}}) + steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) job["steps"] = steps @@ -122,12 +276,12 @@ def job_name(self): "torch_and_tf", additional_env={"RUN_PT_TF_CROSS_TESTS": True}, install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs", + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs cmake", "git lfs install", - "pip install --upgrade pip", - "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", - "pip install tensorflow_probability", - "pip install git+https://github.com/huggingface/accelerate", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", + "pip install -U --upgrade-strategy eager tensorflow_probability", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", ], marker="is_pt_tf_cross_test", pytest_options={"rA": None, "durations": 0}, @@ -139,9 +293,9 @@ def job_name(self): additional_env={"RUN_PT_FLAX_CROSS_TESTS": True}, install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]", - "pip install git+https://github.com/huggingface/accelerate", + "pip install -U 
--upgrade-strategy eager --upgrade pip", + "pip install -U --upgrade-strategy eager .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", ], marker="is_pt_flax_cross_test", pytest_options={"rA": None, "durations": 0}, @@ -152,23 +306,24 @@ def job_name(self): "torch", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", - "pip install --upgrade pip", - "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", - "pip install git+https://github.com/huggingface/accelerate", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", ], - pytest_num_workers=3, + parallelism=1, + pytest_num_workers=6, ) tf_job = CircleCIJob( "tf", install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", - "pip install tensorflow_probability", + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", + "pip install -U --upgrade-strategy eager tensorflow_probability", ], - pytest_options={"rA": None}, + parallelism=1, ) @@ -176,34 +331,36 @@ def job_name(self): "flax", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[flax,testing,sentencepiece,flax-speech,vision]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece,flax-speech,vision]", ], - pytest_options={"rA": None}, + parallelism=1, ) pipelines_torch_job = CircleCIJob( "pipelines_torch", + additional_env={"RUN_PIPELINE_TESTS": True}, install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]", ], - pytest_options={"rA": None}, - tests_to_run="tests/pipelines/" + marker="is_pipeline_test", + pytest_num_workers=6, ) pipelines_tf_job = CircleCIJob( "pipelines_tf", + additional_env={"RUN_PIPELINE_TESTS": True}, install_steps=[ - "pip install --upgrade pip", - "pip install .[sklearn,tf-cpu,testing,sentencepiece,vision]", - "pip install tensorflow_probability", + "sudo apt-get -y update && sudo apt-get install -y cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,vision]", + "pip install -U --upgrade-strategy eager tensorflow_probability", ], - pytest_options={"rA": None}, - tests_to_run="tests/pipelines/" + marker="is_pipeline_test", ) @@ -222,8 +379,8 @@ def job_name(self): "sudo cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local\n" "sudo make install\n", }, - "pip install --upgrade pip", - "pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]", "python -m unidic download", ], parallelism=None, @@ -241,11 +398,10 @@ def job_name(self): cache_name="torch_examples", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade pip", - "pip install .[sklearn,torch,sentencepiece,testing,torch-speech]", - "pip install -r examples/pytorch/_tests_requirements.txt", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,torch,sentencepiece,testing,torch-speech]", + "pip install -U --upgrade-strategy eager -r examples/pytorch/_tests_requirements.txt", ], - tests_to_run="./examples/pytorch/", ) @@ -253,11 +409,11 @@ def job_name(self): "examples_tensorflow", cache_name="tensorflow_examples", install_steps=[ - "pip install --upgrade pip", - "pip install .[sklearn,tensorflow,sentencepiece,testing]", - "pip install -r examples/tensorflow/_tests_requirements.txt", + "sudo apt-get -y update && sudo apt-get install -y cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[sklearn,tensorflow,sentencepiece,testing]", + "pip install -U --upgrade-strategy eager -r examples/tensorflow/_tests_requirements.txt", ], - tests_to_run="./examples/tensorflow/", ) @@ -265,22 +421,22 @@ def job_name(self): "examples_flax", cache_name="flax_examples", install_steps=[ - "pip install --upgrade pip", - "pip install .[flax,testing,sentencepiece]", - "pip install -r examples/flax/_tests_requirements.txt", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece]", + "pip install -U --upgrade-strategy eager -r examples/flax/_tests_requirements.txt", ], - tests_to_run="./examples/flax/", ) hub_job = CircleCIJob( "hub", + additional_env={"HUGGINGFACE_CO_STAGING": True}, install_steps=[ "sudo apt-get -y update && sudo apt-get install git-lfs", 'git config --global user.email "ci@dummy.com"', 'git config --global user.name "ci"', - "pip install --upgrade pip", - "pip install .[torch,sentencepiece,testing]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[torch,sentencepiece,testing,vision]", ], marker="is_staging_test", pytest_num_workers=1, @@ -290,8 +446,9 @@ def job_name(self): onnx_job = CircleCIJob( "onnx", install_steps=[ - "pip install --upgrade pip", - "pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", + "sudo apt-get -y update && sudo apt-get install -y cmake", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", ], pytest_options={"k onnx": None}, pytest_num_workers=1, @@ -302,17 +459,23 @@ def job_name(self): "exotic_models", install_steps=[ "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev", - "pip install --upgrade pip", - "pip install .[torch,testing,vision]", - "pip install torchvision", - "pip install 'git+https://github.com/facebookresearch/detectron2.git'", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[torch,testing,vision]", + "pip 
install -U --upgrade-strategy eager torchvision", + "pip install -U --upgrade-strategy eager scipy", + "pip install -U --upgrade-strategy eager 'git+https://github.com/facebookresearch/detectron2.git'", "sudo apt install tesseract-ocr", - "pip install pytesseract", - "pip install natten", + "pip install -U --upgrade-strategy eager pytesseract", + "pip install -U --upgrade-strategy eager natten", + "pip install -U --upgrade-strategy eager python-Levenshtein", + "pip install -U --upgrade-strategy eager opencv-python", + "pip install -U --upgrade-strategy eager nltk", ], tests_to_run=[ "tests/models/*layoutlmv*", "tests/models/*nat", + "tests/models/deta", + "tests/models/nougat", ], pytest_num_workers=1, pytest_options={"durations": 100}, @@ -322,15 +485,57 @@ def job_name(self): repo_utils_job = CircleCIJob( "repo_utils", install_steps=[ - "pip install --upgrade pip", - "pip install .[quality,testing]", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager .[quality,testing,torch]", ], parallelism=None, pytest_num_workers=1, - resource_class=None, + resource_class="large", tests_to_run="tests/repo_utils", ) + +# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest +# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove +# the bash output redirection.) +py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' +py_command = f"$(python3 -c '{py_command}')" +command = f'echo "{py_command}" > pr_documentation_tests_temp.txt' +doc_test_job = CircleCIJob( + "pr_documentation_tests", + additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, + install_steps=[ + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time ffmpeg", + "pip install --upgrade --upgrade-strategy eager pip", + "pip install -U --upgrade-strategy eager -e .[dev]", + "pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate", + "pip install --upgrade --upgrade-strategy eager pytest pytest-sugar", + "pip install -U --upgrade-strategy eager natten", + "find -name __pycache__ -delete", + "find . -name \*.pyc -delete", + # Add an empty file to keep the test step running correctly even no file is selected to be tested. 
+ "touch dummy.py", + { + "name": "Get files to test", + "command": command, + }, + { + "name": "Show information in `Get files to test`", + "command": + "cat pr_documentation_tests_temp.txt" + }, + { + "name": "Get the last line in `pr_documentation_tests.txt`", + "command": + "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt" + }, + ], + tests_to_run="$(cat pr_documentation_tests.txt)", # noqa + pytest_options={"-doctest-modules": None, "doctest-glob": "*.md", "dist": "loadfile", "rvsA": None}, + command_timeout=1200, # test cannot run longer than 1200 seconds + pytest_num_workers=1, +) + REGULAR_TESTS = [ torch_and_tf_job, torch_and_flax_job, @@ -352,10 +557,14 @@ def job_name(self): pipelines_tf_job, ] REPO_UTIL_TESTS = [repo_utils_job] +DOC_TESTS = [doc_test_job] + def create_circleci_config(folder=None): if folder is None: folder = os.getcwd() + # Used in CircleCIJob.to_dict() to expand the test list (for using parallelism) + os.environ["test_preparation_dir"] = folder jobs = [] all_test_file = os.path.join(folder, "test_list.txt") if os.path.exists(all_test_file): @@ -375,25 +584,73 @@ def create_circleci_config(folder=None): if len(test_list) > 0: jobs.extend(REGULAR_TESTS) + extended_tests_to_run = set(test_list.split()) + # Extend the test files for cross test jobs + for job in jobs: + if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]: + for test_path in copy.copy(extended_tests_to_run): + dir_path, fn = os.path.split(test_path) + if fn.startswith("test_modeling_tf_"): + fn = fn.replace("test_modeling_tf_", "test_modeling_") + elif fn.startswith("test_modeling_flax_"): + fn = fn.replace("test_modeling_flax_", "test_modeling_") + else: + if job.job_name == "test_torch_and_tf": + fn = fn.replace("test_modeling_", "test_modeling_tf_") + elif job.job_name == "test_torch_and_flax": + fn = fn.replace("test_modeling_", "test_modeling_flax_") + new_test_file = str(os.path.join(dir_path, fn)) + if os.path.isfile(new_test_file): + if new_test_file not in extended_tests_to_run: + extended_tests_to_run.add(new_test_file) + extended_tests_to_run = sorted(extended_tests_to_run) + for job in jobs: + if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]: + job.tests_to_run = extended_tests_to_run + fn = "filtered_test_list_cross_tests.txt" + f_path = os.path.join(folder, fn) + with open(f_path, "w") as fp: + fp.write(" ".join(extended_tests_to_run)) + example_file = os.path.join(folder, "examples_test_list.txt") if os.path.exists(example_file) and os.path.getsize(example_file) > 0: - jobs.extend(EXAMPLES_TESTS) - + with open(example_file, "r", encoding="utf-8") as f: + example_tests = f.read() + for job in EXAMPLES_TESTS: + framework = job.name.replace("examples_", "").replace("torch", "pytorch") + if example_tests == "all": + job.tests_to_run = [f"examples/{framework}"] + else: + job.tests_to_run = [f for f in example_tests.split(" ") if f.startswith(f"examples/{framework}")] + + if len(job.tests_to_run) > 0: + jobs.append(job) + + doctest_file = os.path.join(folder, "doctest_list.txt") + if os.path.exists(doctest_file): + with open(doctest_file) as f: + doctest_list = f.read() + else: + doctest_list = [] + if len(doctest_list) > 0: + jobs.extend(DOC_TESTS) + repo_util_file = os.path.join(folder, "test_repo_utils.txt") if os.path.exists(repo_util_file) and os.path.getsize(repo_util_file) > 0: jobs.extend(REPO_UTIL_TESTS) - if len(jobs) > 0: - config = {"version": "2.1"} - config["parameters"] = { - # Only used to accept the parameters from 
the trigger - "nightly": {"type": "boolean", "default": False}, - "tests_to_run": {"type": "string", "default": test_list}, - } - config["jobs"] = {j.job_name: j.to_dict() for j in jobs} - config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} - with open(os.path.join(folder, "generated_config.yml"), "w") as f: - f.write(yaml.dump(config, indent=2, width=1000000, sort_keys=False)) + if len(jobs) == 0: + jobs = [EmptyJob()] + config = {"version": "2.1"} + config["parameters"] = { + # Only used to accept the parameters from the trigger + "nightly": {"type": "boolean", "default": False}, + "tests_to_run": {"type": "string", "default": test_list}, + } + config["jobs"] = {j.job_name: j.to_dict() for j in jobs} + config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + with open(os.path.join(folder, "generated_config.yml"), "w") as f: + f.write(yaml.dump(config, indent=2, width=1000000, sort_keys=False)) if __name__ == "__main__": diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index e4e78eceecea..1ec76462acfd 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -26,8 +26,9 @@ body: Models: - text models: @ArthurZucker and @younesbelkada - - vision models: @amyeroberts and @NielsRogge + - vision models: @amyeroberts - speech models: @sanchit-gandhi + - graph models: @clefourrier Library: @@ -36,14 +37,16 @@ body: - pipelines: @Narsil - tensorflow: @gante and @Rocketknight1 - tokenizers: @ArthurZucker - - trainer: @sgugger + - trainer: @muellerzr and @pacman100 Integrations: - - deepspeed: @stas00 + - deepspeed: HF Trainer/Accelerate: @pacman100 - ray/raytune: @richardliaw, @amogkam + - Big Model Inference: @SunMarc + - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada - Documentation: @sgugger, @stevhliu and @MKhalusova + Documentation: @stevhliu and @MKhalusova Model hub: @@ -59,7 +62,7 @@ body: Maintained examples (not research project or legacy): - Flax: @sanchit-gandhi - - PyTorch: @sgugger + - PyTorch: See Models above and tag the person corresponding to the modality of the example. - TensorFlow: @Rocketknight1 Research projects are not maintained and should be taken as is. diff --git a/.github/ISSUE_TEMPLATE/i18n.md b/.github/ISSUE_TEMPLATE/i18n.md index 39d369a25324..52667f930508 100644 --- a/.github/ISSUE_TEMPLATE/i18n.md +++ b/.github/ISSUE_TEMPLATE/i18n.md @@ -23,23 +23,23 @@ Some notes: * Please translate in a gender-neutral way. * Add your translations to the folder called `` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source). * Register your translation in `/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml). -* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @ArthurZucker, @sgugger for review. +* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review. * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/). 
## Get Started section -- [ ] [index.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/index.mdx) https://github.com/huggingface/transformers/pull/20180 -- [ ] [quicktour.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/quicktour.mdx) (waiting for initial PR to go through) -- [ ] [installation.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/installation.mdx). +- [ ] [index.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/index.md) https://github.com/huggingface/transformers/pull/20180 +- [ ] [quicktour.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/quicktour.md) (waiting for initial PR to go through) +- [ ] [installation.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/installation.md). ## Tutorial section -- [ ] [pipeline_tutorial.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/pipeline_tutorial.mdx) -- [ ] [autoclass_tutorial.mdx](https://github.com/huggingface/transformers/blob/master/docs/source/autoclass_tutorial.mdx) -- [ ] [preprocessing.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/preprocessing.mdx) -- [ ] [training.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/training.mdx) -- [ ] [accelerate.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/accelerate.mdx) -- [ ] [model_sharing.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/model_sharing.mdx) -- [ ] [multilingual.mdx](https://github.com/huggingface/transformers/blob/main/docs/source/en/multilingual.mdx) +- [ ] [pipeline_tutorial.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/pipeline_tutorial.md) +- [ ] [autoclass_tutorial.md](https://github.com/huggingface/transformers/blob/master/docs/source/autoclass_tutorial.md) +- [ ] [preprocessing.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/preprocessing.md) +- [ ] [training.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/training.md) +- [ ] [accelerate.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/accelerate.md) +- [ ] [model_sharing.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/model_sharing.md) +- [ ] [multilingual.md](https://github.com/huggingface/transformers/blob/main/docs/source/en/multilingual.md) diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml index 6f6b680d1e6b..6bf33f842fbf 100644 --- a/.github/conda/meta.yaml +++ b/.github/conda/meta.yaml @@ -16,7 +16,6 @@ requirements: - pip - numpy >=1.17 - dataclasses - - importlib_metadata - huggingface_hub - packaging - filelock @@ -31,7 +30,6 @@ requirements: - python - numpy >=1.17 - dataclasses - - importlib_metadata - huggingface_hub - packaging - filelock diff --git a/.github/workflows/add-model-like.yml b/.github/workflows/add-model-like.yml index 3ea3c89249fe..68133a7e2243 100644 --- a/.github/workflows/add-model-like.yml +++ b/.github/workflows/add-model-like.yml @@ -3,13 +3,13 @@ name: Add model like runner on: push: branches: - - main - pull_request: - paths: - - "src/**" - - "tests/**" - - ".github/**" - types: [opened, synchronize, reopened] + - none # put main here when this is fixed + #pull_request: + # paths: + # - "src/**" + # - "tests/**" + # - ".github/**" + # types: [opened, synchronize, reopened] jobs: run_tests_templates_like: diff --git a/.github/workflows/build-docker-images.yml 
b/.github/workflows/build-docker-images.yml index 03ecf450264d..710ad7afe77a 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (scheduled) on: push: branches: - - docker-image* + - build_ci_docker_image* repository_dispatch: workflow_call: inputs: @@ -11,7 +11,7 @@ on: required: true type: string schedule: - - cron: "0 1 * * *" + - cron: "17 0 * * *" concurrency: group: docker-images-builds @@ -22,21 +22,31 @@ jobs: name: "Latest PyTorch + TensorFlow [dev]" runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-all-latest-gpu build-args: | @@ -49,7 +59,7 @@ jobs: # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. # The later case is useful for manual image building for debugging purpose. Use another tag in this case! if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-all-latest-gpu build-args: | @@ -57,54 +67,35 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu-push-ci - latest-with-torch-nightly-docker: - name: "Nightly PyTorch + Stable TensorFlow" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - PYTORCH=pre - push: true - tags: huggingface/transformers-all-latest-torch-nightly-gpu - latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-pytorch-deepspeed-latest-gpu build-args: | @@ -117,15 +108,25 @@ jobs: name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" runs-on: 
ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} @@ -135,7 +136,7 @@ jobs: # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. # The later case is useful for manual image building for debugging purpose. Use another tag in this case! if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-pytorch-deepspeed-latest-gpu build-args: | @@ -143,87 +144,104 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - nightly-torch-deepspeed-docker: - name: "Nightly PyTorch + DeepSpeed" + doc-builder: + name: "Doc builder" # Push CI doesn't need this image if: inputs.image_postfix != '-push-ci' runs-on: ubuntu-latest steps: - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-deepspeed-nightly-gpu - build-args: | - REF=main + context: ./docker/transformers-doc-builder push: true - tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu + tags: huggingface/transformers-doc-builder - doc-builder: - name: "Doc builder" + latest-pytorch: + name: "Latest PyTorch [dev]" # Push CI doesn't need this image if: inputs.image_postfix != '-push-ci' runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: - context: ./docker/transformers-doc-builder + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main push: true - tags: huggingface/transformers-doc-builder + tags: huggingface/transformers-pytorch-gpu - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest + latest-pytorch-amd: + name: "Latest PyTorch (AMD) [dev]" + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check 
out code + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Check out code uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 + - name: Login to DockerHub + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily - - name: Build and push - uses: docker/build-push-action@v3 + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-gpu + context: ./docker/transformers-pytorch-amd-gpu build-args: | REF=main push: true - tags: huggingface/transformers-pytorch-gpu + tags: huggingface/transformers-pytorch-amd-gpu-push-ci latest-tensorflow: name: "Latest TensorFlow [dev]" @@ -233,19 +251,19 @@ jobs: steps: - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v3 - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 with: context: ./docker/transformers-tensorflow-gpu build-args: | diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml new file mode 100644 index 000000000000..1b8cab864d92 --- /dev/null +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -0,0 +1,85 @@ +name: Build docker images (Nightly CI) + +on: + workflow_call: + push: + branches: + - build_nightly_ci_docker_image* + +concurrency: + group: docker-images-builds + cancel-in-progress: false + +jobs: + latest-with-torch-nightly-docker: + name: "Nightly PyTorch + Stable TensorFlow" + runs-on: ubuntu-latest + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + PYTORCH=pre + push: true + tags: huggingface/transformers-all-latest-torch-nightly-gpu + + nightly-torch-deepspeed-docker: + name: "Nightly PyTorch + DeepSpeed" + runs-on: ubuntu-latest + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf 
/usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-pytorch-deepspeed-nightly-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu \ No newline at end of file diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml index 3a0e1612454c..aa47dfd08c2d 100644 --- a/.github/workflows/build-past-ci-docker-images.yml +++ b/.github/workflows/build-past-ci-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (Past CI) on: push: branches: - - past-ci-docker-image* + - build_past_ci_docker_image* concurrency: group: docker-images-builds @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"] + version: ["1.13", "1.12", "1.11", "1.10"] runs-on: ubuntu-latest steps: - @@ -24,6 +24,17 @@ jobs: - name: Check out code uses: actions/checkout@v3 + - + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["pytorch"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT + - + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -37,6 +48,7 @@ jobs: context: ./docker/transformers-past-gpu build-args: | REF=main + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=pytorch VERSION=${{ matrix.version }} push: true @@ -47,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["2.8", "2.7", "2.6", "2.5"] + version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"] runs-on: ubuntu-latest steps: - @@ -57,37 +69,16 @@ jobs: name: Check out code uses: actions/checkout@v3 - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["tensorflow"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-past-gpu - build-args: | - REF=main - FRAMEWORK=tensorflow - VERSION=${{ matrix.version }} - push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu - - past-tensorflow-docker-2-4: - name: "Past TensorFlow Docker" - strategy: - fail-fast: false - matrix: - version: ["2.4"] - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -101,8 +92,8 @@ jobs: 
context: ./docker/transformers-past-gpu build-args: | REF=main - BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=tensorflow VERSION=${{ matrix.version }} push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu \ No newline at end of file + tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 9f29a7d7a7ef..6eecff24eb17 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -15,6 +15,7 @@ jobs: commit_sha: ${{ github.sha }} package: transformers notebook_folder: transformers_doc - languages: de en es it ko pt zh + languages: de en es fr it ko pt zh secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 0c8aa237f36e..640a0cb2f59f 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -14,4 +14,4 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: transformers - languages: de en es it ko pt zh + languages: de en es fr it ko pt zh diff --git a/.github/workflows/check_runner_status.yml b/.github/workflows/check_runner_status.yml index 8912e32c94ee..7d0e3853b5df 100644 --- a/.github/workflows/check_runner_status.yml +++ b/.github/workflows/check_runner_status.yml @@ -57,6 +57,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: runner status check RUNNER_STATUS: ${{ needs.check_runner_status.result }} OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }} diff --git a/.github/workflows/check_tiny_models.yml b/.github/workflows/check_tiny_models.yml new file mode 100644 index 000000000000..5a4cb9622f06 --- /dev/null +++ b/.github/workflows/check_tiny_models.yml @@ -0,0 +1,82 @@ +name: Check Tiny Models + +on: + push: + branches: + - check_tiny_models* + repository_dispatch: + schedule: + - cron: "0 2 * * *" + +env: + TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }} + +jobs: + check_tiny_models: + name: Check tiny models + runs-on: ubuntu-latest + steps: + - name: Checkout transformers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + # Semantic version range syntax or exact version of a Python version + python-version: '3.8' + # Optional - x64 or x86 architecture, defaults to x64 + architecture: 'x64' + + - name: Install + run: | + sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake + pip install --upgrade pip + python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu] + pip install tensorflow_probability + python -m pip install -U natten + + - name: Create all tiny models (locally) + run: | + python utils/create_dummy_models.py tiny_local_models --all --num_workers 2 + + - name: Local tiny model reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: 
tiny_local_model_creation_reports + path: tiny_local_models/reports + + # GitHub-hosted runners have 2-core CPUs + - name: Run pipeline tests against all new (local) tiny models + run: | + OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: tiny_local_model_creation_reports + path: reports/tests_pipelines + + - name: Create + Upload tiny models for new model architecture(s) + run: | + python utils/update_tiny_models.py --num_workers 2 + + - name: Full report + run: cat tiny_models/reports/tiny_model_creation_report.json + + - name: Failure report + run: cat tiny_models/reports/simple_failed_report.txt + + - name: Summary report + run: cat tiny_models/reports/tiny_model_summary.json + + - name: New tiny model creation reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: tiny_model_creation_reports + path: tiny_models/reports diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml index d894f991ca9c..8604019d76eb 100644 --- a/.github/workflows/delete_doc_comment.yml +++ b/.github/workflows/delete_doc_comment.yml @@ -1,13 +1,14 @@ -name: Delete dev documentation +name: Delete doc comment on: - pull_request: - types: [ closed ] + workflow_run: + workflows: ["Delete doc comment trigger"] + types: + - completed jobs: delete: uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main - with: - pr_number: ${{ github.event.number }} - package: transformers + secrets: + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/delete_doc_comment_trigger.yml b/.github/workflows/delete_doc_comment_trigger.yml new file mode 100644 index 000000000000..f87d9bd4dca7 --- /dev/null +++ b/.github/workflows/delete_doc_comment_trigger.yml @@ -0,0 +1,12 @@ +name: Delete doc comment trigger + +on: + pull_request: + types: [ closed ] + + +jobs: + delete: + uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main + with: + pr_number: ${{ github.event.number }} diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index d65698e2a4f3..236ccbcd253a 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -6,7 +6,7 @@ on: - doctest* repository_dispatch: schedule: - - cron: "0 2 * * *" + - cron: "17 2 * * *" env: @@ -20,31 +20,36 @@ env: jobs: run_doctests: - runs-on: [self-hosted, doc-tests-gpu] + runs-on: [single-gpu, nvidia-gpu, t4, doctest-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: + - name: uninstall transformers (installed during docker image build) + run: python3 -m pip uninstall -y transformers + - uses: actions/checkout@v3 - name: NVIDIA-SMI run: | nvidia-smi + - name: Install transformers in edit mode + run: python3 -m pip install -e .[flax] + - name: GPU visibility run: | python3 utils/print_env.py - - name: Prepare files for doctests - run: | - python3 utils/prepare_for_doc_test.py src docs + - name: Show installed libraries and their versions + run: pip freeze - - name: Run doctests + - name: Get doctest files run: | - python3 -m pytest -v --make-reports doc_tests_gpu 
--doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" + $(python3 -c 'from utils.tests_fetcher import get_all_doctest_files; to_test = get_all_doctest_files(); to_test = " ".join(to_test); fp = open("doc_tests.txt", "w"); fp.write(to_test); fp.close()') - - name: Clean files after doctests + - name: Run doctests run: | - python3 utils/prepare_for_doc_test.py src docs --remove_new_line + python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md" - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml new file mode 100644 index 000000000000..dfc258e5be85 --- /dev/null +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -0,0 +1,145 @@ +name: Self-hosted runner (nightly-past-ci-caller) + +on: + schedule: + # 2:17 am on each Sunday and Thursday + + - cron: "17 2 * * 0,4" + push: + branches: + - run_nightly_ci* + - run_past_ci* + +jobs: + build_nightly_ci_images: + name: Build Nightly CI Docker Images + if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) + uses: ./.github/workflows/build-nightly-ci-docker-images.yml + secrets: inherit + + run_nightly_ci: + name: Nightly CI + needs: [build_nightly_ci_images] + uses: ./.github/workflows/self-nightly-scheduled.yml + secrets: inherit + + run_past_ci_pytorch_1-13: + name: PyTorch 1.13 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_nightly_ci] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.13" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_pytorch_1-12: + name: PyTorch 1.12 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-13] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.12" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_pytorch_1-11: + name: PyTorch 1.11 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-12] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.11" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_pytorch_1-10: + name: PyTorch 1.10 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-11] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.10" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-11: + name: TensorFlow 2.11 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_pytorch_1-10] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.11" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-10: + name: TensorFlow 2.10 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-11] + uses: 
./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.10" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-9: + name: TensorFlow 2.9 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-10] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.9" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-8: + name: TensorFlow 2.8 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-9] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.8" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-7: + name: TensorFlow 2.7 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-8] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.7" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-6: + name: TensorFlow 2.6 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-7] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.6" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-5: + name: TensorFlow 2.5 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-6] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.5" + sha: ${{ github.sha }} + secrets: inherit diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index accccf6164bc..713e004d8e58 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (nightly) +name: Self-hosted runner (nightly-ci) # Note that each job's dependencies go into a corresponding docker file. # @@ -8,9 +8,7 @@ name: Self-hosted runner (nightly) on: repository_dispatch: -# Disable temporarily until the test suite can be run under 12 hours. 
-# schedule: -# - cron: "0 16 * * *" + workflow_call: env: HF_HOME: /mnt/cache @@ -33,7 +31,7 @@ jobs: fetch-depth: 2 - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} check_runners: name: Check Runners @@ -41,7 +39,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -56,7 +54,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -96,7 +94,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -117,6 +115,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -143,7 +145,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -153,7 +155,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -174,6 +176,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
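Two patterns recur throughout the self-hosted workflow changes in this diff: runners are now selected by a list of labels (the machine type plus `nvidia-gpu`, `t4` and a CI-flavor label) instead of a runner name assembled with `format()`, and each test job reinstalls `transformers` in editable mode so the checked-out commit overrides the copy baked into the Docker image. A minimal job skeleton, with a hypothetical job name and one of the images used above, isolates both patterns; it is a sketch, not a workflow file from this PR:

```yaml
# Minimal sketch only -- not a file from this PR. It isolates the two recurring
# changes: label-based runner selection and the editable reinstall of
# transformers on top of the prebuilt Docker image.
jobs:
  example_model_tests:            # hypothetical job name
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    # previously: runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
```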
+ - name: NVIDIA-SMI run: | nvidia-smi @@ -200,7 +206,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_all_tests_torch_cuda_extensions_gpu: @@ -209,7 +215,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] needs: setup container: image: huggingface/transformers-pytorch-deepspeed-nightly-gpu @@ -219,6 +225,10 @@ jobs: working-directory: /workspace/transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -229,7 +239,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -258,7 +268,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu send_results: @@ -291,7 +301,8 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} - CI_EVENT: nightly-build + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: Nightly CI RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} @@ -301,3 +312,11 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-past-caller.yml b/.github/workflows/self-past-caller.yml deleted file mode 100644 index 2cc81dac8ca2..000000000000 --- a/.github/workflows/self-past-caller.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Self-hosted runner (past-ci-caller) - -on: - push: - branches: - - run-past-ci* - -jobs: - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - if: always() - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.11" - secrets: inherit - - run_past_ci_pytorch_1-10: - name: PyTorch 1.10 - if: always() - needs: [run_past_ci_pytorch_1-11] - uses: ./.github/workflows/self-past.yml - 
with: - framework: pytorch - version: "1.10" - secrets: inherit - - run_past_ci_pytorch_1-9: - name: PyTorch 1.9 - if: always() - needs: [run_past_ci_pytorch_1-10] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.9" - secrets: inherit - - run_past_ci_pytorch_1-8: - name: PyTorch 1.8 - if: always() - needs: [run_past_ci_pytorch_1-9] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.8" - secrets: inherit - - run_past_ci_pytorch_1-7: - name: PyTorch 1.7 - if: always() - needs: [run_past_ci_pytorch_1-8] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.7" - secrets: inherit - - run_past_ci_pytorch_1-6: - name: PyTorch 1.6 - if: always() - needs: [run_past_ci_pytorch_1-7] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.6" - secrets: inherit - - run_past_ci_pytorch_1-5: - name: PyTorch 1.5 - if: always() - needs: [run_past_ci_pytorch_1-6] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.5" - secrets: inherit - - run_past_ci_pytorch_1-4: - name: PyTorch 1.4 - if: always() - needs: [run_past_ci_pytorch_1-5] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.4" - secrets: inherit - - run_past_ci_tensorflow_2-8: - name: TensorFlow 2.8 - if: always() - needs: [run_past_ci_pytorch_1-4] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.8" - secrets: inherit - - run_past_ci_tensorflow_2-7: - name: TensorFlow 2.7 - if: always() - needs: [run_past_ci_tensorflow_2-8] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.7" - secrets: inherit - - run_past_ci_tensorflow_2-6: - name: TensorFlow 2.6 - if: always() - needs: [run_past_ci_tensorflow_2-7] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.6" - secrets: inherit - - run_past_ci_tensorflow_2-5: - name: TensorFlow 2.5 - if: always() - needs: [run_past_ci_tensorflow_2-6] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.5" - secrets: inherit - - run_past_ci_tensorflow_2-4: - name: TensorFlow 2.4 - if: always() - needs: [run_past_ci_tensorflow_2-5] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.4" - secrets: inherit \ No newline at end of file diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index c59800445bdc..71f904c831e9 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (past) +name: Self-hosted runner (past-ci) # Note that each job's dependencies go into a corresponding docker file. 
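Both the deleted `self-past-caller.yml` above and the new `self-nightly-past-ci-caller.yml` that replaces it serialize the framework/version runs the same way: each job reuses `self-past.yml` and `needs:` the previous job, and the new caller's `if: (cancelled() != true)` guard replaces the implicit `success()` check, so a failed version does not break the chain (only a cancellation does). A condensed two-link sketch of that pattern, with the event-name and branch-prefix conditions of the real caller omitted:

```yaml
# Condensed sketch of the caller chain (two links only); the real caller also
# checks the event name and branch prefix in each `if:` condition.
jobs:
  run_past_ci_pytorch_1-13:
    name: PyTorch 1.13
    if: (cancelled() != true)
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.13"
      sha: ${{ github.sha }}
    secrets: inherit

  run_past_ci_pytorch_1-12:
    name: PyTorch 1.12
    if: (cancelled() != true)
    needs: [run_past_ci_pytorch_1-13]  # serializes the runs without aborting the chain on failure
    uses: ./.github/workflows/self-past.yml
    with:
      framework: pytorch
      version: "1.12"
      sha: ${{ github.sha }}
    secrets: inherit
```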
# @@ -50,7 +50,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -65,7 +65,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -101,7 +101,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -111,6 +111,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ inputs.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to @@ -126,6 +130,12 @@ jobs: run: | nvidia-smi + - name: Install + if: inputs.framework == 'pytorch' + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + - name: Environment working-directory: /transformers run: | @@ -157,7 +167,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -167,7 +177,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -177,6 +187,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ inputs.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. 
(`matrix_folders`) to `models_bert`, which will be used to @@ -192,6 +206,12 @@ jobs: run: | nvidia-smi + - name: Install + if: inputs.framework == 'pytorch' + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + - name: Environment working-directory: /transformers run: | @@ -223,14 +243,89 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + run_all_tests_torch_cuda_extensions_gpu: + name: Torch CUDA extension tests + if: inputs.framework == 'pytorch' + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] + needs: setup + container: + image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Install + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: / + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + send_results: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu] + needs: [ + check_runner_status, + check_runners, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_all_tests_torch_cuda_extensions_gpu + ] steps: - name: Preliminary job status shell: bash @@ -254,6 +349,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }} RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} @@ -271,4 +367,11 @@ jobs: uses: actions/upload-artifact@v3 with: name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} - path: test_failure_tables \ No newline at end of file + path: test_failure_tables + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml new file mode 100644 index 000000000000..0dfbbca7ba12 --- /dev/null +++ b/.github/workflows/self-push-amd.yml @@ -0,0 +1,337 @@ +name: Self-hosted runner AMD GPU (push) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - ci_* + - ci-* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + repository_dispatch: + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + PYTEST_TIMEOUT: 60 + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 + +jobs: + check_runner_status: + name: Check Runner Status + runs-on: ubuntu-latest + steps: + - name: Checkout transformers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: Check Runner Status + run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + + check_runners: + name: Check Runners + needs: check_runner_status + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + gpu_flavor: [mi210] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ 
matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: ROCM-SMI + run: | + rocminfo | grep "Agent" -A 14 + - name: Show HIP environment + run: | + echo "HIP: $HIP_VISIBLE_DEVICES" + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + setup_gpu: + name: Setup + needs: check_runners + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + gpu_flavor: [mi210] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + test_map: ${{ steps.set-matrix.outputs.test_map }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # `CI_BRANCH_PUSH`: The branch name from the push event + # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event + # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty) + # `CI_SHA_PUSH`: The commit SHA from the push event + # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event + # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty) + run: | + CI_BRANCH_PUSH=${{ github.event.ref }} + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH=${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! 
-z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Fetch the tests to run + working-directory: /transformers + # TODO: add `git-python` in the docker images + run: | + pip install --upgrade git-python + python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt + + - name: Report fetched tests + uses: actions/upload-artifact@v3 + with: + name: test_fetched + path: /transformers/test_preparation.txt + + - id: set-matrix + name: Organize tests into models + working-directory: /transformers + # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. + # The `test_map` is used to get the actual identified test files under each key. + # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) + run: | + if [ -f test_map.json ]; then + keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') + test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') + else + keys=$(python3 -c 'keys = ["dummy"]; print(keys)') + test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') + fi + echo $keys + echo $test_map + echo "matrix=$keys" >> $GITHUB_OUTPUT + echo "test_map=$test_map" >> $GITHUB_OUTPUT + + run_tests_amdgpu: + name: Model tests + needs: setup_gpu + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} + machine_type: [single-gpu, multi-gpu] + gpu_flavor: [mi210] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${{ github.event.ref }} + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH=${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH 
+ echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: ROCM-SMI + run: | + rocminfo | grep "Agent" -A 14 + - name: Show HIP environment + run: | + echo "HIP: $HIP_VISIBLE_DEVICES" + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + send_results: + name: Send results to webhook + runs-on: ubuntu-latest + if: always() + needs: [ + check_runner_status, + check_runners, + setup_gpu, + run_tests_amdgpu, +# run_tests_torch_cuda_extensions_single_gpu, +# run_tests_torch_cuda_extensions_multi_gpu + ] + steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Runner availability: ${{ needs.check_runner_status.result }}" + echo "Setup status: ${{ needs.setup_gpu.result }}" + echo "Runner status: ${{ needs.check_runners.result }}" + + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + 
run: | + CI_BRANCH_PUSH=${{ github.event.ref }} + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH=${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - uses: actions/checkout@v3 + # To avoid failure when multiple commits are merged into `main` in a short period of time. + # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... + # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) + with: + fetch-depth: 20 + + - name: Update clone using environment variables + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - uses: actions/download-artifact@v3 + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: push + CI_TITLE_PUSH: ${{ github.event.head_commit.message }} + CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} + CI_SHA: ${{ env.CI_SHA }} + RUNNER_STATUS: ${{ needs.check_runner_status.result }} + RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} + SETUP_STATUS: ${{ needs.setup_gpu.result }} + + # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. 
+ run: | + pip install slack_sdk + pip show slack_sdk + python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}" diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index b6c3a70e3eb8..e4b1b3b4b235 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -45,7 +45,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -60,7 +60,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -158,7 +158,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -195,6 +195,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to @@ -247,7 +251,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -284,6 +288,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Echo folder ${{ matrix.folders }} shell: bash # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to @@ -336,7 +344,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -373,6 +381,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
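Because the push workflows (the AMD one above and `self-push.yml`) can be triggered either by a direct `push` or by a `workflow_run` event, they cannot rely on `github.ref`/`github.sha` alone. The repeated `Prepare custom environment variables` step picks the branch and commit from whichever event actually fired: exactly one variable of each pair is empty, and the non-empty one is exported for the later `git checkout ${{ env.CI_SHA }}`. Isolated from the diff, the step is essentially:

```yaml
- name: Prepare custom environment variables
  shell: bash
  run: |
    CI_BRANCH_PUSH=${{ github.event.ref }}
    CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}   # strip the refs/heads/ prefix
    CI_BRANCH_WORKFLOW_RUN=${{ github.event.workflow_run.head_branch }}
    CI_SHA_PUSH=${{ github.event.head_commit.id }}
    CI_SHA_WORKFLOW_RUN=${{ github.event.workflow_run.head_sha }}
    # exactly one of each pair is empty -- keep whichever value is set
    [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
    [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
```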
+ - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -381,7 +393,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -422,7 +434,7 @@ jobs: fail-fast: false matrix: machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -459,6 +471,10 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -467,7 +483,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -568,6 +584,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: push CI_TITLE_PUSH: ${{ github.event.head_commit.message }} CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 750f4a956943..2bd6bbade1cb 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -9,7 +9,10 @@ name: Self-hosted runner (scheduled) on: repository_dispatch: schedule: - - cron: "0 2 * * *" + - cron: "17 2 * * *" + push: + branches: + - run_scheduled_ci* env: HF_HOME: /mnt/cache @@ -40,7 +43,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -55,7 +58,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -95,7 +98,7 @@ jobs: matrix: folders: ${{ 
fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -116,6 +119,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -152,7 +159,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -173,6 +180,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -208,7 +219,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -218,6 +229,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -255,7 +270,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-pytorch-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -265,6 +280,10 @@ jobs: working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
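The model-test jobs above fan out over `fromJson(needs.setup.outputs.matrix)`. The only place this diff shows how such a matrix is built is the `Organize tests into models` step of `self-push-amd.yml`: the keys of `test_map.json` become the job matrix, the map itself is used to look up the test files per key, and a `dummy` entry keeps the matrix from being empty. A more readable, hypothetical equivalent of those one-liners (same outputs, not the code in this PR) could look like:

```yaml
- id: set-matrix
  name: Organize tests into models
  working-directory: /transformers
  run: |
    # keys of test_map.json -> job matrix; the map itself -> test files per key;
    # a "dummy" entry avoids an empty matrix when there is nothing to run
    python3 - <<'EOF' >> "$GITHUB_OUTPUT"
    import json, os

    if os.path.isfile("test_map.json"):
        with open("test_map.json") as fp:
            test_map = json.load(fp)
    else:
        test_map = {"dummy": []}

    print(f"matrix={list(test_map.keys())}")
    print(f"test_map={test_map}")
    EOF
```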
+ - name: NVIDIA-SMI run: | nvidia-smi @@ -301,7 +320,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-tensorflow-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -312,6 +331,10 @@ jobs: run: | git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI run: | nvidia-smi @@ -348,7 +371,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] needs: setup container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu @@ -358,6 +381,10 @@ jobs: working-directory: /workspace/transformers run: git fetch && git checkout ${{ github.sha }} + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ @@ -366,7 +393,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -482,13 +509,25 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: scheduled + CI_SHA: ${{ github.sha }} + CI_WORKFLOW_REF: ${{ github.workflow_ref }} RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | + sudo apt-get install -y curl pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. 
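The per-folder report artifacts referenced throughout these workflows get their names from the `Echo folder` step described in the comments earlier in this diff: `models/bert` becomes `models_bert` because `/` is not allowed in artifact names, and postfixes such as `_postfix_nightly` or `_postfix_${{ inputs.framework }}-${{ inputs.version }}` keep reports from different runs apart. Reduced to their essentials, the pair of steps looks like this (the failure-table upload that follows uses the same artifact mechanism):

```yaml
- name: Echo folder ${{ matrix.folders }}
  shell: bash
  run: |
    # `models/bert` -> `models_bert`: `/` is not allowed in artifact names
    matrix_folders=${{ matrix.folders }}
    matrix_folders=${matrix_folders/'models/'/'models_'}
    echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

- name: Test suite reports artifacts
  if: ${{ always() }}
  uses: actions/upload-artifact@v3
  with:
    # the nightly/past workflows append a postfix such as _postfix_nightly here
    name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
    path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
```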
+ - name: Failure table artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: test_failure_tables + path: test_failure_tables diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 9412442a7d0a..1211d71a32e2 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -2,7 +2,7 @@ name: Stale Bot on: schedule: - - cron: "0 15 * * *" + - cron: "0 8 * * *" jobs: close_stale_issues: @@ -17,7 +17,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.7 + python-version: 3.8 - name: Install requirements run: | diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml index f6c9afd15b7e..de25fe0a08a2 100644 --- a/.github/workflows/update_metdata.yml +++ b/.github/workflows/update_metdata.yml @@ -4,7 +4,7 @@ on: push: branches: - main - - update_transformers_metadata + - update_transformers_metadata* jobs: build_and_package: @@ -16,25 +16,12 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Load cached virtual environment - uses: actions/cache@v2 - id: cache - with: - path: ~/venv/ - key: v3-metadata-${{ hashFiles('setup.py') }} - - - name: Create virtual environment on cache miss - if: steps.cache.outputs.cache-hit != 'true' - run: | - python -m venv ~/venv && . ~/venv/bin/activate - pip install --upgrade pip - - name: Setup environment run: | - . ~/venv/bin/activate - pip install git+https://github.com/huggingface/transformers#egg=transformers[dev] + pip install --upgrade pip + pip install datasets pandas==2.0.3 + pip install .[torch,tf,flax] - name: Update metadata run: | - . ~/venv/bin/activate - python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }} + python utils/update_metadata.py --token ${{ secrets.LYSANDRE_HF_TOKEN }} --commit_sha ${{ github.sha }} diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 000000000000..64befc595c42 --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: transformers + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index cf8183463613..337f2ef2c735 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,7 @@ tags *.lock # DS_Store (MacOS) -.DS_Store \ No newline at end of file +.DS_Store + +# ruff +.ruff_cache diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aae902fa4a8d..6cfa3e47398c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -130,7 +130,7 @@ You will need basic `git` proficiency to contribute to manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference. -You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/main/setup.py#L426))** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: +You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/main/setup.py#L426))** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: 1. 
Fork the [repository](https://github.com/huggingface/transformers) by clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code @@ -139,15 +139,15 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai 2. Clone your fork to your local disk, and add the base repository as a remote: ```bash - $ git clone git@github.com:/transformers.git - $ cd transformers - $ git remote add upstream https://github.com/huggingface/transformers.git + git clone git@github.com:/transformers.git + cd transformers + git remote add upstream https://github.com/huggingface/transformers.git ``` 3. Create a new branch to hold your development changes: ```bash - $ git checkout -b a-descriptive-name-for-my-changes + git checkout -b a-descriptive-name-for-my-changes ``` 🚨 **Do not** work on the `main` branch! @@ -155,39 +155,41 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai 4. Set up a development environment by running the following command in a virtual environment: ```bash - $ pip install -e ".[dev]" + pip install -e ".[dev]" ``` If 🤗 Transformers was already installed in the virtual environment, remove it with `pip uninstall transformers` before reinstalling it in editable mode with the `-e` flag. - Depending on your OS, you may need to install some external libraries as well if the `pip` installation fails. - - For macOS, you will likely need [MeCab](https://taku910.github.io/mecab/) which can be installed from Homebrew: - + Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a + failure with this command. If that's the case make sure to install the Deep Learning framework you are working with + (PyTorch, TensorFlow and/or Flax) then do: + ```bash - brew install mecab + pip install -e ".[quality]" ``` + which should be enough for most use cases. + 5. Develop the features on your branch. As you work on your code, you should make sure the test suite passes. Run the tests impacted by your changes like this: ```bash - $ pytest tests/.py + pytest tests/.py ``` For more information about tests, check out the [Testing](https://huggingface.co/docs/transformers/testing) guide. - 🤗 Transformers relies on `black` and `isort` to format its source code + 🤗 Transformers relies on `black` and `ruff` to format its source code consistently. After you make changes, apply automatic style corrections and code verifications that can't be automated in one go with: ```bash - $ make fixup + make fixup ``` This target is also optimized to only work with files modified by the PR you're working on. @@ -196,21 +198,21 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai style corrections: ```bash - $ make style + make style ``` - 🤗 Transformers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality + 🤗 Transformers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality controls are run by the CI, but you can run the same checks with: ```bash - $ make quality + make quality ``` Finally, we have a lot of scripts to make sure we didn't forget to update some files when adding a new model. 
You can run these scripts with: ```bash - $ make repo-consistency + make repo-consistency ``` To learn more about those checks and how to fix any issues with them, check out the @@ -220,13 +222,13 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai make sure you install the documentation builder: ```bash - $ pip install ".[docs]" + pip install ".[docs]" ``` Run the following command from the root of the repository: ```bash - $ doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build + doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build ``` This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated @@ -236,8 +238,8 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai record your changes locally with `git commit`: ```bash - $ git add modified_file.py - $ git commit + git add modified_file.py + git commit ``` Please remember to write [good commit @@ -247,14 +249,14 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai repository, rebase your branch on `upstream/branch` *before* you open a pull request or if requested by a maintainer: ```bash - $ git fetch upstream - $ git rebase upstream/main + git fetch upstream + git rebase upstream/main ``` Push your changes to your branch: ```bash - $ git push -u origin a-descriptive-name-for-my-changes + git push -u origin a-descriptive-name-for-my-changes ``` If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally. @@ -273,7 +275,7 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai request description to make sure they are linked (and people viewing the issue know you are working on it).
☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
-useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.
+useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.
☐ Make sure existing tests pass.
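As a concrete sketch of this check, assuming your changes touched the BERT modeling files (substitute the test paths that match the files you actually modified):

```bash
# Run only the tests that cover the modified files; the path below is just an example.
python -m pytest tests/models/bert/test_modeling_bert.py -v
```

Running `make test` instead exercises the whole library test suite, which is much slower.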
☐ If adding a new feature, also add tests for it.
   - If you are adding a new model, make sure you use
@@ -282,7 +284,7 @@ useful to avoid duplicated work, and to differentiate it from PRs ready to be me
     `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
   - If you are adding a new tokenizer, write tests and make sure
     `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
-  CircleCI does not run the slow tests, but GitHub Actions does every night!
+  - CircleCI does not run the slow tests, but GitHub Actions does every night!
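For a hypothetical new model named `my_new_model` (the placeholder used in the items above), the corresponding slow checks can be run locally like this:

```bash
# Slow tests are skipped by default; RUN_SLOW=1 enables them for the new model tests...
RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py

# ...and for the new tokenizer tests, if you added one.
RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_tokenization_my_new_model.py
```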
☐ All public methods must have informative docstrings (see [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) @@ -307,14 +309,14 @@ We like `pytest` and `pytest-xdist` because it's faster. From the root of the repository, specify a *path to a subfolder or a test file* to run the test. ```bash -$ python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model ``` Similarly, for the `examples` directory, specify a *path to a subfolder or test file* to run the test. For example, the following command tests the text classification subfolder in the PyTorch `examples` directory: ```bash -$ pip install -r examples/xxx/requirements.txt # only needed the first time -$ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +pip install -r examples/xxx/requirements.txt # only needed the first time +python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification ``` In fact, this is actually how our `make test` and `make test-examples` commands are implemented (not including the `pip install`)! @@ -333,11 +335,16 @@ Remember to specify a *path to a subfolder or a test file* to run the test. Othe ```bash -$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model -$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification ``` -Like the slow tests, custom tokenizer tests are skipped but you can set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run them. +Like the slow tests, there are other environment variables available which not enabled by default during testing: +- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers. +- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration. +- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration. + +More environment variables and additional information can be found in the [testing_utils.py](src/transformers/testing_utils.py). 🤗 Transformers uses `pytest` as a test runner only. It doesn't use any `pytest`-specific features in the test suite itself. @@ -346,8 +353,8 @@ This means `unittest` is fully supported. Here's how to run tests with `unittest`: ```bash -$ python -m unittest discover -s tests -t . -v -$ python -m unittest discover -s examples -t examples -v +python -m unittest discover -s tests -t . -v +python -m unittest discover -s examples -t examples -v ``` ### Style guide @@ -358,7 +365,7 @@ for more information. ### Develop on Windows -On Windows (unless you're working in [Windows Subsytem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings: +On Windows (unless you're working in [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings: ```bash git config core.autocrlf input @@ -381,8 +388,8 @@ When updating the main branch of a forked repository, please follow these steps 2. 
If a PR is absolutely necessary, use the following steps after checking out your branch: ```bash -$ git checkout -b your-branch-for-syncing -$ git pull --squash --no-commit upstream main -$ git commit -m '' -$ git push --set-upstream origin your-branch-for-syncing +git checkout -b your-branch-for-syncing +git pull --squash --no-commit upstream main +git commit -m '' +git push --set-upstream origin your-branch-for-syncing ``` diff --git a/ISSUES.md b/ISSUES.md index 7c36da3c6804..95f2334b26c8 100644 --- a/ISSUES.md +++ b/ISSUES.md @@ -158,7 +158,7 @@ You are not required to read the following guidelines before opening an issue. H --do_train --n_train 500 --num_train_epochs 1 \ --per_device_train_batch_size 1 --freeze_embeds \ --src_lang en_XX --tgt_lang ro_RO --task translation \ - --fp16 --sharded_ddp + --fp16 ``` If you don't break it up, one has to scroll horizontally which often makes it quite difficult to quickly see what's happening. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 1aba38f67a22..000000000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include LICENSE diff --git a/Makefile b/Makefile index 999ddd6ee156..0c51598594c0 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,8 @@ modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) @if test -n "$(modified_py_files)"; then \ echo "Checking/fixing $(modified_py_files)"; \ - black --preview $(modified_py_files); \ - isort $(modified_py_files); \ - flake8 $(modified_py_files); \ + black $(modified_py_files); \ + ruff $(modified_py_files) --fix; \ else \ echo "No library .py files were modified"; \ fi @@ -40,17 +39,19 @@ repo-consistency: python utils/check_repo.py python utils/check_inits.py python utils/check_config_docstrings.py - python utils/tests_fetcher.py --sanity_check + python utils/check_config_attributes.py + python utils/check_doctest_list.py python utils/update_metadata.py --check-only + python utils/check_task_guides.py + python utils/check_docstrings.py # this target runs checks on all files quality: - black --check --preview $(check_dirs) - isort --check-only $(check_dirs) + black --check $(check_dirs) setup.py conftest.py python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only - flake8 $(check_dirs) + ruff $(check_dirs) setup.py conftest.py doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source python utils/check_doc_toc.py @@ -65,8 +66,8 @@ extra_style_checks: # this target runs checks on all files and potentially modifies some of them style: - black --preview $(check_dirs) - isort $(check_dirs) + black $(check_dirs) setup.py conftest.py + ruff $(check_dirs) setup.py conftest.py --fix ${MAKE} autogenerate_code ${MAKE} extra_style_checks @@ -80,6 +81,9 @@ fix-copies: python utils/check_copies.py --fix_and_overwrite python utils/check_table.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite + python utils/check_doctest_list.py --fix_and_overwrite + python utils/check_task_guides.py --fix_and_overwrite + python utils/check_docstrings.py --fix_and_overwrite # Run tests for the library @@ -110,3 +114,10 @@ post-release: post-patch: python utils/release.py --post_release --patch + +build-release: + rm -rf dist + rm -rf build + python setup.py bdist_wheel + python setup.py sdist + python utils/check_build.py diff --git a/README.md b/README.md index 0906c65deeda..c8afdd51d334 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,15 @@ 
limitations under the License. -->

[README header: logo banner and badges; the new logo image's alt text is "Hugging Face Transformers Library".]

@@ -46,8 +51,9 @@ limitations under the License.
        한국어 |
        Español |
        日本語 |
-       हिन्दी
+       हिन्दी |
+       Русский

@@ -91,16 +97,35 @@ In Computer Vision: - [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224) - [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50) - [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) -- [Panoptic Segmentation with DETR](https://huggingface.co/facebook/detr-resnet-50-panoptic) +- [Panoptic Segmentation with MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco) +- [Depth Estimation with DPT](https://huggingface.co/docs/transformers/model_doc/dpt) +- [Video Classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae) +- [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) In Audio: - [Automatic Speech Recognition with Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) - [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) +- [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) In Multimodal tasks: +- [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq) - [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) +- [Zero-shot Image Classification with CLIP](https://huggingface.co/openai/clip-vit-large-patch14) +- [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa) +- [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip) + + +## 100 projects using Transformers + +Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the +Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone +else to build their dream projects. + +In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the +community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100 +incredible projects built in the vicinity of transformers. -**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team, is the official demo of this repo’s text generation capabilities. +If you own or use a project that you believe should be part of the list, please open a PR to add it! ## If you are looking for custom support from the Hugging Face team @@ -223,7 +248,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ and TensorFlow 2.3+. +This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). @@ -263,8 +288,11 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them): 1. 
**[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -274,24 +302,31 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. 
**[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. 
**[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. 
**[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -300,36 +335,53 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. 
**[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. 
**[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. -1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -338,6 +390,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -346,32 +400,50 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. 
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. 
**[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaiML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. 
**[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. 
**[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) 
released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -380,52 +452,65 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. 
**[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. 
**[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. 1. 
**[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. 
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. 
**[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -448,7 +533,6 @@ These implementations have been tested on several datasets (see the example scri | [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API | | [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks | | [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community | -| [Migration](https://huggingface.co/docs/transformers/migration) | Migrate to 🤗 Transformers from `pytorch-transformers` or `pytorch-pretrained-bert` | ## Citation diff --git a/README_es.md b/README_es.md index 341fd87923ca..78cfb415f975 100644 --- a/README_es.md +++ b/README_es.md @@ -18,7 +18,7 @@ limitations under the License.

[logo and Build badge markup omitted]
@@ -47,7 +47,7 @@ limitations under the License.
Español | 日本語 | हिन्दी
@@ -92,6 +92,7 @@ En visión de ordenador: - [Detección de objetos con DETR](https://huggingface.co/facebook/detr-resnet-50) - [Segmentación semántica con SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) - [Segmentación panóptica con DETR](https://huggingface.co/facebook/detr-resnet-50-panoptic) +- [Segmentación Universal con OneFormer (Segmentación Semántica, de Instancia y Panóptica con un solo modelo)](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) En Audio: - [Reconocimiento de voz automático con Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) @@ -223,7 +224,7 @@ El modelo en si es un [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.h ### Con pip -Este repositorio está probado en Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ y TensorFlow 2.3+. +Este repositorio está probado en Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ y TensorFlow 2.6+. Deberías instalar 🤗 Transformers en un [ambiente virtual](https://docs.python.org/3/library/venv.html). Si no estas familiarizado con los entornos virtuales de Python, consulta la [guía de usuario](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). @@ -263,8 +264,11 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 🤗 Transformers actualmente proporciona las siguientes arquitecturas (ver [aquí](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada uno de ellas.): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. 
**[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -274,24 +278,31 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. 
**[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. 
**[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -300,36 +311,53 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. 
**[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. 
**[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. 
**ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. 
**[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. 
**[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -338,6 +366,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. 
**[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -346,32 +376,50 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. 
**[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. 
**[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. 1. 
**[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. 
**[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -380,52 +428,65 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. 
**[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. 
**[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. 
Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. 
**[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. 
**[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -465,4 +526,4 @@ Ahora nosotros tenemos un [papel](https://www.aclweb.org/anthology/2020.emnlp-de url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", pages = "38--45" } -``` +``` \ No newline at end of file diff --git a/README_hd.md b/README_hd.md index 194aa1ab7a8b..4cd0052bd295 100644 --- a/README_hd.md +++ b/README_hd.md @@ -43,7 +43,7 @@ checkpoint: जाँच बिंदु
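Most of the architectures in the list above are exposed through the library's `Auto*` classes, so they are all loaded the same way. A minimal sketch follows, assuming `transformers` and PyTorch are installed; the public `gpt2` checkpoint is used here only as a stand-in, and any Hub checkpoint for one of the listed text-generation models could be substituted.

```python
# Minimal sketch: load one of the listed architectures via the Auto classes.
# "gpt2" is just an example checkpoint; swap in any compatible Hub checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Tokenize a prompt and generate a short continuation.
inputs = tokenizer("Hello, world", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Vision, audio, and multimodal entries in the list follow the same pattern with their task-specific Auto classes (for example `AutoImageProcessor` with `AutoModelForImageClassification`).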

@@ -200,7 +200,7 @@ checkpoint: जाँच बिंदु ### पिप का उपयोग करना -इस रिपॉजिटरी का परीक्षण Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ और TensorFlow 2.3+ के तहत किया गया है। +इस रिपॉजिटरी का परीक्षण Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ और TensorFlow 2.6+ के तहत किया गया है। आप [वर्चुअल एनवायरनमेंट] (https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश] (https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें। @@ -236,8 +236,11 @@ conda install -c huggingface transformers 🤗 ट्रांसफॉर्मर वर्तमान में निम्नलिखित आर्किटेक्चर का समर्थन करते हैं (मॉडल के अवलोकन के लिए [यहां] देखें (https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago) साथ थीसिस [ALBERT: A Lite BERT for Self-supervised भाषा प्रतिनिधित्व सीखना](https://arxiv.org/abs/1909.11942), झेंझोंग लैन, मिंगदा चेन, सेबेस्टियन गुडमैन, केविन गिम्पेल, पीयूष शर्मा, राडू सोरिकट -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research से) Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. द्वाराअनुसंधान पत्र [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) के साथ जारी किया गया +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (फेसबुक) साथ थीसिस [बार्ट: प्राकृतिक भाषा निर्माण, अनुवाद के लिए अनुक्रम-से-अनुक्रम पूर्व प्रशिक्षण , और समझ] (https://arxiv.org/pdf/1910.13461.pdf) पर निर्भर माइक लुईस, यिनहान लियू, नमन गोयल, मार्जन ग़ज़विनिनेजाद, अब्देलरहमान मोहम्मद, ओमर लेवी, वेस स्टोयानोव और ल्यूक ज़ेटलमॉयर 1. 
**[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (से École polytechnique) साथ थीसिस [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) पर निर्भर Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis रिहाई। 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research से) साथ में पेपर [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701)गुयेन लुओंग ट्रान, डुओंग मिन्ह ले और डाट क्वोक गुयेन द्वारा पोस्ट किया गया। @@ -247,24 +250,31 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research से) साथ में पेपर [BERTweet: अंग्रेजी ट्वीट्स के लिए एक पूर्व-प्रशिक्षित भाषा मॉडल] (https://aclanthology.org/2020.emnlp-demos.2/) डाट क्वोक गुयेन, थान वु और अन्ह तुआन गुयेन द्वारा प्रकाशित। 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (गूगल रिसर्च से) साथ वाला पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv .org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानोन, फिलिप फाम, अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा। 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (गूगल रिसर्च से) साथ में पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv.org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानन, फिलिप फाम द्वारा , अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा पोस्ट किया गया। -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (फेसबुक से) साथ में कागज [एक ओपन-डोमेन चैटबॉट बनाने की विधि](https://arxiv.org /abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम। स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। 1. 
**[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (फेसबुक से) साथ में पेपर [एक ओपन-डोमेन चैटबॉट बनाने की रेसिपी](https://arxiv .org/abs/2004.13637) स्टीफन रोलर, एमिली दीनन, नमन गोयल, दा जू, मैरी विलियमसन, यिनहान लियू, जिंग जू, मायल ओट, कर्ट शस्टर, एरिक एम स्मिथ, वाई-लैन बॉरो, जेसन वेस्टन द्वारा। -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce से) Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. द्वाराअनुसंधान पत्र [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) के साथ जारी किया गया 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (एलेक्सा से) कागज के साथ [बीईआरटी के लिए ऑप्टिमल सबआर्किटेक्चर एक्सट्रैक्शन](https://arxiv.org/abs/ 2010.10499) एड्रियन डी विंटर और डैनियल जे पेरी द्वारा। +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (हरबिन इंस्टिट्यूट ऑफ़ टेक्नोलॉजी/माइक्रोसॉफ्ट रिसर्च एशिया/इंटेल लैब्स से) कागज के साथ [ब्रिजटॉवर: विजन-लैंग्वेज रिप्रेजेंटेशन लर्निंग में एनकोडर्स के बीच ब्रिज बनाना]() by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (NAVER CLOVA से) Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. द्वाराअनुसंधान पत्र [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) के साथ जारी किया गया 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google अनुसंधान से) साथ में कागज [ByT5: पूर्व-प्रशिक्षित बाइट-टू-बाइट मॉडल के साथ एक टोकन-मुक्त भविष्य की ओर] (https://arxiv.org/abs/2105.13626) Linting Xue, Aditya Barua, Noah Constant, रामी अल-रफू, शरण नारंग, मिहिर काले, एडम रॉबर्ट्स, कॉलिन रैफेल द्वारा पोस्ट किया गया। 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा। 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा। 1. 
**[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) के साथ जारी किया गया 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा। 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज। +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (MetaAI से) Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. द्वाराअनुसंधान पत्र [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) के साथ जारी किया गया 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (माइक्रोसॉफ्ट रिसर्च एशिया से) कागज के साथ [फास्ट ट्रेनिंग कन्वर्जेंस के लिए सशर्त डीईटीआर](https://arxiv. org/abs/2108.06152) डेपू मेंग, ज़ियाओकांग चेन, ज़ेजिया फैन, गैंग ज़ेंग, होउकियांग ली, युहुई युआन, लेई सन, जिंगडोंग वांग द्वारा। 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech से) साथ में कागज [ConvBERT: स्पैन-आधारित डायनेमिक कनवल्शन के साथ BERT में सुधार](https://arxiv .org/abs/2008.02496) जिहांग जियांग, वीहाओ यू, डाकान झोउ, युनपेंग चेन, जियाशी फेंग, शुइचेंग यान द्वारा। 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI से) साथ वाला पेपर [A ConvNet for the 2020s](https://arxiv.org/abs /2201.03545) ज़ुआंग लियू, हेंज़ी माओ, चाओ-युआन वू, क्रिस्टोफ़ फीचटेनहोफ़र, ट्रेवर डेरेल, सैनिंग ज़ी द्वारा। +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (सिंघुआ यूनिवर्सिटी से) साथ में पेपर [सीपीएम: ए लार्ज-स्केल जेनेरेटिव चाइनीज प्री-ट्रेंड लैंग्वेज मॉडल](https : //arxiv.org/abs/2012.00413) झेंग्यान झांग, जू हान, हाओ झोउ, पेई के, युक्सियन गु, डेमिंग ये, युजिया किन, युशेंग सु, हाओझे जी, जियान गुआन, फैंचाओ क्यूई, ज़ियाओझी वांग, यानान झेंग द्वारा , गुओयांग ज़ेंग, हुआनकी काओ, शेंगकी चेन, डाइक्सुआन ली, ज़ेनबो सन, ज़ियुआन लियू, मिनली हुआंग, वेंटाओ हान, जी तांग, जुआनज़ी ली, ज़ियाओयान झू, माओसोंग सन। +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: ए कंडिशनल ट्रांसफॉर्मर लैंग्वेज मॉडल फॉर कंट्रोलेबल जेनरेशन](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया। 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: इंट्रोड्यूसिंग कनवॉल्यूशन टू विजन ट्रांसफॉर्मर्स](https://arxiv.org/ एब्स/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा। 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: भाषण, दृष्टि और भाषा में स्व-पर्यवेक्षित सीखने के लिए एक सामान्य ढांचा] (https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया। @@ -273,24 +283,34 @@ conda install -c huggingface transformers 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (बर्कले/फेसबुक/गूगल से) पेपर के साथ [डिसीजन ट्रांसफॉर्मर: रीनफोर्समेंट लर्निंग वाया सीक्वेंस मॉडलिंग](https : //arxiv.org/abs/2106.01345) लिली चेन, केविन लू, अरविंद राजेश्वरन, किमिन ली, आदित्य ग्रोवर, माइकल लास्किन, पीटर एबील, अरविंद श्रीनिवास, इगोर मोर्डच द्वारा पोस्ट किया गया। 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (सेंसटाइम रिसर्च से) साथ में पेपर [डिफॉर्मेबल डीईटीआर: डिफॉर्मेबल ट्रांसफॉर्मर्स फॉर एंड-टू-एंड ऑब्जेक्ट डिटेक्शन] (https://arxiv.org/abs/2010.04159) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, जिफेंग दाई द्वारा पोस्ट किया गया। 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (फेसबुक से) साथ में पेपर [ट्रेनिंग डेटा-एफिशिएंट इमेज ट्रांसफॉर्मर और डिस्टिलेशन थ्रू अटेंशन](https://arxiv .org/abs/2012.12877) ह्यूगो टौव्रोन, मैथ्यू कॉर्ड, मैथिज्स डूज़, फ़्रांसिस्को मस्सा, एलेक्ज़ेंडर सबलेरोल्स, हर्वे जेगौ द्वारा। +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI से) Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. द्वाराअनुसंधान पत्र [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) के साथ जारी किया गया +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (फेसबुक से) साथ में कागज [ट्रांसफॉर्मर्स के साथ एंड-टू-एंड ऑब्जेक्ट डिटेक्शन](https://arxiv. 
org/abs/2005.12872) निकोलस कैरियन, फ़्रांसिस्को मस्सा, गेब्रियल सिनेव, निकोलस उसुनियर, अलेक्जेंडर किरिलोव, सर्गेई ज़ागोरुयको द्वारा। 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [DialoGPT: बड़े पैमाने पर जनरेटिव प्री-ट्रेनिंग फॉर कन्वर्सेशनल रिस्पांस जेनरेशन](https ://arxiv.org/abs/1911.00536) यिज़े झांग, सिकी सन, मिशेल गैली, येन-चुन चेन, क्रिस ब्रोकेट, जियांग गाओ, जियानफेंग गाओ, जिंगजिंग लियू, बिल डोलन द्वारा। 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (Meta AI से) Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. द्वाराअनुसंधान पत्र [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) के साथ जारी किया गया 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (हगिंगफेस से), साथ में कागज [डिस्टिलबर्ट, बीईआरटी का डिस्टिल्ड वर्जन: छोटा, तेज, सस्ता और हल्का] (https://arxiv.org/abs/1910.01108) विक्टर सनह, लिसांड्रे डेब्यू और थॉमस वुल्फ द्वारा पोस्ट किया गया। यही तरीका GPT-2 को [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERta से [DistilRoBERta](https://github.com) पर कंप्रेस करने के लिए भी लागू किया जाता है। / हगिंगफेस/ट्रांसफॉर्मर्स/ट्री/मेन/उदाहरण/डिस्टिलेशन), बहुभाषी BERT से [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) और डिस्टिलबर्ट का जर्मन संस्करण। 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [DiT: सेल्फ सुपरवाइज्ड प्री-ट्रेनिंग फॉर डॉक्यूमेंट इमेज ट्रांसफॉर्मर](https://arxiv.org/abs/2203.02378) जुनलॉन्ग ली, यिहेंग जू, टेंगचाओ लव, लेई कुई, चा झांग द्वारा फुरु वेई द्वारा पोस्ट किया गया। 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER से) साथ में कागज [OCR-मुक्त डॉक्यूमेंट अंडरस्टैंडिंग ट्रांसफॉर्मर](https://arxiv.org/abs /2111.15664) गीवूक किम, टीकग्यू होंग, मूनबिन यिम, जियोंग्योन नाम, जिनयॉन्ग पार्क, जिनयॉन्ग यिम, वोनसेओक ह्वांग, सांगडू यूं, डोंगयून हान, सेउंग्युन पार्क द्वारा। 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (फेसबुक से) साथ में पेपर [ओपन-डोमेन क्वेश्चन आंसरिंग के लिए डेंस पैसेज रिट्रीवल](https://arxiv. org/abs/2004.04906) व्लादिमीर करपुखिन, बरलास ओज़ुज़, सेवन मिन, पैट्रिक लुईस, लेडेल वू, सर्गेई एडुनोव, डैनकी चेन, और वेन-ताऊ यिह द्वारा। 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (इंटेल लैब्स से) साथ में कागज [विज़न ट्रांसफॉर्मर्स फॉर डेंस प्रेडिक्शन](https://arxiv.org /abs/2103.13413) रेने रैनफ्टल, एलेक्सी बोचकोवस्की, व्लादलेन कोल्टन द्वारा। +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. 
**[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google रिसर्च/स्टैनफोर्ड यूनिवर्सिटी से) साथ में दिया गया पेपर [इलेक्ट्रा: जेनरेटर के बजाय भेदभाव करने वाले के रूप में टेक्स्ट एन्कोडर्स का पूर्व-प्रशिक्षण] (https://arxiv.org/abs/2003.10555) केविन क्लार्क, मिन्ह-थांग लुओंग, क्वोक वी. ले, क्रिस्टोफर डी. मैनिंग द्वारा पोस्ट किया गया। +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (Meta AI से) Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. द्वाराअनुसंधान पत्र [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) के साथ जारी किया गया 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google रिसर्च से) साथ में दिया गया पेपर [सीक्वेंस जेनरेशन टास्क के लिए प्री-ट्रेंड चेकपॉइंट का इस्तेमाल करना](https:/ /arxiv.org/abs/1907.12461) साशा रोठे, शशि नारायण, अलियाक्सि सेवेरिन द्वारा। 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)**(Baidu से) साथ देने वाला पेपर [ERNIE: एन्हांस्ड रिप्रेजेंटेशन थ्रू नॉलेज इंटीग्रेशन](https://arxiv.org/abs/1904.09223) यू सन, शुओहुआन वांग, युकुन ली, शिकुन फेंग, ज़ुई चेन, हान झांग, शिन तियान, डैनक्सियांग झू, हाओ तियान, हुआ वू द्वारा पोस्ट किया गया। +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu से) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. द्वाराअनुसंधान पत्र [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) के साथ जारी किया गया 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (मेटा AI से) ट्रांसफॉर्मर प्रोटीन भाषा मॉडल हैं। **ESM-1b** पेपर के साथ जारी किया गया था [ अलेक्जेंडर राइव्स, जोशुआ मेयर, टॉम सर्कु, सिद्धार्थ गोयल, ज़ेमिंग लिन द्वारा जैविक संरचना और कार्य असुरक्षित सीखने को 250 मिलियन प्रोटीन अनुक्रमों तक स्केल करने से उभरता है] (https://www.pnas.org/content/118/15/e2016239118) जेसन लियू, डेमी गुओ, मायल ओट, सी. लॉरेंस ज़िटनिक, जेरी मा और रॉब फर्गस। **ESM-1v** को पेपर के साथ जारी किया गया था [भाषा मॉडल प्रोटीन फ़ंक्शन पर उत्परिवर्तन के प्रभावों की शून्य-शॉट भविष्यवाणी को सक्षम करते हैं] (https://doi.org/10.1101/2021.07.09.450648) जोशुआ मेयर, रोशन राव, रॉबर्ट वेरकुइल, जेसन लियू, टॉम सर्कु और अलेक्जेंडर राइव्स द्वारा। **ESM-2** को पेपर के साथ जारी किया गया था [भाषा मॉडल विकास के पैमाने पर प्रोटीन अनुक्रम सटीक संरचना भविष्यवाणी को सक्षम करते हैं](https://doi.org/10.1101/2022.07.20.500902) ज़ेमिंग लिन, हलील अकिन, रोशन राव, ब्रायन ही, झोंगकाई झू, वेंटिंग लू, ए द्वारा लान डॉस सैंटोस कोस्टा, मरियम फ़ज़ल-ज़रंडी, टॉम सर्कू, साल कैंडिडो, अलेक्जेंडर राइव्स। -1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. 
Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS से) साथ वाला पेपर [FlauBERT: Unsupervised Language Model Pre-training for फ़्रेंच](https://arxiv .org/abs/1912.05372) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, बेंजामिन लेकोउटेक्स, अलेक्जेंड्रे अल्लाउज़ेन, बेनोइट क्रैबे, लॉरेंट बेसेसियर, डिडिएर श्वाब द्वारा। 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (FLAVA: A फाउंडेशनल लैंग्वेज एंड विजन अलाइनमेंट मॉडल) (https://arxiv) साथ वाला पेपर .org/abs/2112.04482) अमनप्रीत सिंह, रोंगहांग हू, वेदानुज गोस्वामी, गुइल्यूम कुएरॉन, वोज्शिएक गालुबा, मार्कस रोहरबैक, और डौवे कीला द्वारा। 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा। +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research से) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. द्वाराअनुसंधान पत्र [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) के साथ जारी किया गया 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले ​​द्वारा रिहाई। -1. 
**[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा। 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI से) साथ में दिया गया पेपर [जेनरेटिव प्री-ट्रेनिंग द्वारा भाषा की समझ में सुधार](https://blog .openai.com/language-unsupervised/) एलेक रैडफोर्ड, कार्तिक नरसिम्हन, टिम सालिमन्स और इल्या सुत्स्केवर द्वारा। 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI से) रिपॉजिटरी के साथ [EleutherAI/gpt-neo](https://github.com/ EleutherAI /gpt-neo) रिलीज। सिड ब्लैक, स्टेला बिडरमैन, लियो गाओ, फिल वांग और कॉनर लेही द्वारा पोस्ट किया गया। @@ -298,11 +318,18 @@ conda install -c huggingface transformers 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (अबेजा के जरिए) शिन्या ओटानी, ताकायोशी मकाबे, अनुज अरोड़ा, क्यो हटोरी द्वारा। 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (ओपनएआई से) साथ में पेपर [लैंग्वेज मॉडल्स अनसुपरवाइज्ड मल्टीटास्क लर्नर्स हैं](https://blog.openai.com/better-language-models/) एलेक रैडफोर्ड*, जेफरी वू*, रेवन चाइल्ड, डेविड लुआन, डारियो एमोडी* द्वारा * और इल्या सुत्सकेवर** ने पोस्ट किया। 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI से) साथ वाला पेपर [kingoflolz/mesh-transformer-jax](https://github. com/kingoflolz/mesh-transformer-jax/) बेन वांग और अरन कोमात्सुजाकी द्वारा। -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv .org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [ह्यूबर्ट: सेल्फ सुपरवाइज्ड स्पीच रिप्रेजेंटेशन लर्निंग बाय मास्क्ड प्रेडिक्शन ऑफ हिडन यूनिट्स](https ://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https:// arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा। +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce से) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. द्वाराअनुसंधान पत्र [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) के साथ जारी किया गया 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -311,6 +338,8 @@ conda install -c huggingface transformers 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (मेटा AI से) साथ वाला पेपर [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https:/ /arxiv.org/abs/2104.01136) बेन ग्राहम, अलाएल्डिन एल-नौबी, ह्यूगो टौवरन, पियरे स्टॉक, आर्मंड जौलिन, हर्वे जेगौ, मैथिज डूज़ द्वारा। 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (दक्षिण चीन प्रौद्योगिकी विश्वविद्यालय से) साथ में कागज [LiLT: एक सरल लेकिन प्रभावी भाषा-स्वतंत्र लेआउट ट्रांसफार्मर संरचित दस्तावेज़ समझ के लिए](https://arxiv.org/abs/2202.13669) जियापेंग वांग, लियानवेन जिन, काई डिंग द्वारा पोस्ट किया गया। +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI से) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. द्वाराअनुसंधान पत्र [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) के साथ जारी किया गया +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI से) Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. द्वारा अनुसंधान पत्र [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) के साथ जारी किया गया
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** मैंडी गुओ, जोशुआ आइंस्ली, डेविड यूथस, सैंटियागो ओंटानन, जियानमो नि, यूं-हुआन सुंग, यिनफेई यांग द्वारा पोस्ट किया गया।
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (स्टूडियो औसिया से) साथ में पेपर [LUKE: डीप कॉन्टेक्स्टुअलाइज्ड एंटिटी रिप्रेजेंटेशन विद एंटिटी-अवेयर सेल्फ-अटेंशन](https://arxiv.org/abs/2010.01057) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto द्वारा।
@@ -319,32 +348,50 @@ conda install -c huggingface transformers
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (फेसबुक से) साथ देने वाला पेपर [बियॉन्ड इंग्लिश-सेंट्रिक मल्टीलिंगुअल मशीन ट्रांसलेशन](https://arxiv.org/abs/2010.11125) एंजेला फैन, श्रुति भोसले, होल्गर श्वेन्क, झी मा, अहमद अल-किश्की, सिद्धार्थ गोयल, मनदीप बैनेस, ओनूर सेलेबी, गुइल्लाम वेन्जेक, विश्रव चौधरी, नमन गोयल, टॉम बर्च, विटाली लिपचिंस्की, सर्गेई एडुनोव, एडौर्ड ग्रेव, माइकल औली, आर्मंड जौलिन द्वारा पोस्ट किया गया।
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** [OPUS](http://opus.nlpl.eu/) डेटा से प्रशिक्षित मशीनी अनुवाद मॉडल, Jörg टाइडेमैन द्वारा पोस्ट किया गया। [मैरियन फ्रेमवर्क](https://marian-nmt.github.io/) माइक्रोसॉफ्ट ट्रांसलेटर टीम द्वारा विकसित।
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (माइक्रोसॉफ्ट रिसर्च एशिया से) साथ में पेपर [मार्कअपएलएम: विजुअली-रिच डॉक्यूमेंट अंडरस्टैंडिंग के लिए टेक्स्ट और मार्कअप लैंग्वेज का प्री-ट्रेनिंग](https://arxiv.org/abs/2110.08518) जुनलॉन्ग ली, यिहेंग जू, लेई कुई, फुरु वेई द्वारा पोस्ट किया गया।
+1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC से) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. द्वारा अनुसंधान पत्र [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) के साथ जारी किया गया
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (मेटा और UIUC से) पेपर के साथ जारी किया गया [प्रति-पिक्सेल वर्गीकरण वह सब नहीं है जिसकी आपको सिमेंटिक सेगमेंटेशन की आवश्यकता है](https://arxiv.org/abs/2107.06278) बोवेन चेंग, अलेक्जेंडर जी. श्विंग, अलेक्जेंडर किरिलोव द्वारा
+1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI से) Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. द्वारा अनुसंधान पत्र [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) के साथ जारी किया गया
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [न्यूरल मशीन ट्रांसलेशन के लिए मल्टीलिंगुअल डीनोइजिंग प्री-ट्रेनिंग](https://arxiv.org/abs/2001.08210) यिनहान लियू, जियाताओ गु, नमन गोयल, जियान ली, सर्गेई एडुनोव, मार्जन ग़ज़विनिनेजाद, माइक लुईस, ल्यूक ज़ेटलमॉयर द्वारा।
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [एक्स्टेंसिबल बहुभाषी प्रीट्रेनिंग और फाइनट्यूनिंग के साथ बहुभाषी अनुवाद](https://arxiv.org/abs/2008.00401) युकिंग टैंग, चाउ ट्रान, जियान ली, पेंग-जेन चेन, नमन गोयल, विश्रव चौधरी, जियाताओ गु, एंजेला फैन द्वारा।
+1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook से) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. द्वारा अनुसंधान पत्र [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) के साथ जारी किया गया
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA से) कागज के साथ [Megatron-LM: मॉडल Parallelism का उपयोग करके बहु-अरब पैरामीटर भाषा मॉडल का प्रशिक्षण](https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा।
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA से) साथ वाला पेपर [Megatron-LM: ट्रेनिंग मल्टी-बिलियन पैरामीटर लैंग्वेज मॉडल्स यूजिंग मॉडल पैरेललिज़्म](https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा पोस्ट किया गया।
+1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research से) Peng Wang, Cheng Da, and Cong Yao. द्वारा अनुसंधान पत्र [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) के साथ जारी किया गया
+1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (फ्रॉम Studio Ousia) साथ में पेपर [mLUKE: द पावर ऑफ एंटिटी रिप्रेजेंटेशन इन मल्टीलिंगुअल प्रीट्रेन्ड लैंग्वेज मॉडल्स](https://arxiv.org/abs/2110.08151) रयोकन री, इकुया यामाडा, और योशिमासा त्सुरोका द्वारा।
+1.
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook से) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. द्वाराअनुसंधान पत्र [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) के साथ जारी किया गया 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [मोबाइलबर्ट: संसाधन-सीमित उपकरणों के लिए एक कॉम्पैक्ट टास्क-अज्ञेय बीईआरटी] (https://arxiv.org/abs/2004.02984) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, और Denny Zhou द्वारा पोस्ट किया गया। 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple से) साथ में कागज [MobileViT: लाइट-वेट, जनरल-पर्पस, और मोबाइल-फ्रेंडली विजन ट्रांसफॉर्मर] (https://arxiv.org/abs/2110.02178) सचिन मेहता और मोहम्मद रस्तगरी द्वारा पोस्ट किया गया। +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (Apple से) Sachin Mehta and Mohammad Rastegari. द्वाराअनुसंधान पत्र [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) के साथ जारी किया गया 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (MosaiML से) the MosaicML NLP Team. द्वाराअनुसंधान पत्र [llm-foundry](https://github.com/mosaicml/llm-foundry/) के साथ जारी किया गया +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (the University of Wisconsin - Madison से) Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. द्वाराअनुसंधान पत्र [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) के साथ जारी किया गया 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI से) साथ वाला पेपर [mT5: एक व्यापक बहुभाषी पूर्व-प्रशिक्षित टेक्स्ट-टू-टेक्स्ट ट्रांसफॉर्मर]( https://arxiv.org/abs/2010.11934) लिंटिंग ज़ू, नोआ कॉन्सटेंट, एडम रॉबर्ट्स, मिहिर काले, रामी अल-रफू, आदित्य सिद्धांत, आदित्य बरुआ, कॉलिन रैफेल द्वारा पोस्ट किया गया। +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. 
**[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (हुआवेई नूह के आर्क लैब से) साथ में कागज़ [NEZHA: चीनी भाषा समझ के लिए तंत्रिका प्रासंगिक प्रतिनिधित्व](https :/ /arxiv.org/abs/1909.00204) जुन्किउ वेई, ज़ियाओज़े रेन, ज़िआओगुआंग ली, वेनयोंग हुआंग, यी लियाओ, याशेंग वांग, जियाशू लिन, शिन जियांग, जिओ चेन और कुन लियू द्वारा। 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (फ्रॉम मेटा) साथ में पेपर [नो लैंग्वेज लेफ्ट बिहाइंड: स्केलिंग ह्यूमन-सेंटेड मशीन ट्रांसलेशन] (https://arxiv.org/abs/2207.04672) एनएलएलबी टीम द्वारा प्रकाशित। +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta से) the NLLB team. द्वाराअनुसंधान पत्र [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) के साथ जारी किया गया +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI से) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. द्वाराअनुसंधान पत्र [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) के साथ जारी किया गया 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में कागज [Nyströmformer: A Nyström- आधारित एल्गोरिथम आत्म-ध्यान का अनुमान लगाने के लिए ](https://arxiv.org/abs/2102.03902) युनयांग ज़िओंग, झानपेंग ज़ेंग, रुद्रसिस चक्रवर्ती, मिंगक्सिंग टैन, ग्लेन फंग, यिन ली, विकास सिंह द्वारा पोस्ट किया गया। +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है। +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT से) Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. द्वाराअनुसंधान पत्र [blog post](https://www.adept.ai/blog/persimmon-8b) के साथ जारी किया गया 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research से) कागज के साथ [PhoBERT: वियतनामी के लिए पूर्व-प्रशिक्षित भाषा मॉडल](https://www .aclweb.org/anthology/2020.findings-emnlp.92/) डैट क्वोक गुयेन और अन्ह तुआन गुयेन द्वारा पोस्ट किया गया। +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google से) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. द्वाराअनुसंधान पत्र [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) के साथ जारी किया गया 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP से) साथ वाला पेपर [प्रोग्राम अंडरस्टैंडिंग एंड जेनरेशन के लिए यूनिफाइड प्री-ट्रेनिंग](https://arxiv .org/abs/2103.06333) वसी उद्दीन अहमद, सैकत चक्रवर्ती, बैशाखी रे, काई-वेई चांग द्वारा। 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [ProphetNet: प्रेडिक्टिंग फ्यूचर एन-ग्राम फॉर सीक्वेंस-टू-सीक्वेंस प्री-ट्रेनिंग ](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा पोस्ट किया गया। +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. से) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. द्वाराअनुसंधान पत्र [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) के साथ जारी किया गया 1. 
**[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA से) साथ वाला पेपर [डीप लर्निंग इंफ़ेक्शन के लिए इंटीजर क्वांटिज़ेशन: प्रिंसिपल्स एंड एम्पिरिकल इवैल्यूएशन](https:// arxiv.org/abs/2004.09602) हाओ वू, पैट्रिक जुड, जिआओजी झांग, मिखाइल इसेव और पॉलियस माइकेविसियस द्वारा। 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (फेसबुक से) साथ में कागज [रिट्रीवल-ऑगमेंटेड जेनरेशन फॉर नॉलेज-इंटेंसिव एनएलपी टास्क](https://arxiv .org/abs/2005.11401) पैट्रिक लुईस, एथन पेरेज़, अलेक्जेंड्रा पिक्टस, फैबियो पेट्रोनी, व्लादिमीर कारपुखिन, नमन गोयल, हेनरिक कुटलर, माइक लुईस, वेन-ताउ यिह, टिम रॉकटाशेल, सेबस्टियन रिडेल, डौवे कीला द्वारा। 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google अनुसंधान से) केल्विन गु, केंटन ली, ज़ोरा तुंग, पानुपोंग पसुपत और मिंग-वेई चांग द्वारा साथ में दिया गया पेपर [REALM: रिट्रीवल-ऑगमेंटेड लैंग्वेज मॉडल प्री-ट्रेनिंग](https://arxiv.org/abs/2002.08909)। @@ -353,52 +400,65 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (गूगल रिसर्च से) साथ वाला पेपर [पूर्व-प्रशिक्षित भाषा मॉडल में एम्बेडिंग कपलिंग पर पुनर्विचार](https://arxiv .org/pdf/2010.12821.pdf) ह्युंग वोन चुंग, थिबॉल्ट फ़ेवरी, हेनरी त्साई, एम. जॉनसन, सेबेस्टियन रुडर द्वारा। 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (माइक्रोसॉफ्ट रिसर्च से) [डीप रेसिडुअल लर्निंग फॉर इमेज रिकग्निशन] (https://arxiv. org/abs/1512.03385) कैमिंग हे, जियांग्यु झांग, शाओकिंग रेन, जियान सन द्वारा। 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (फेसबुक से), साथ में कागज [मजबूत रूप से अनुकूलित BERT प्रीट्रेनिंग दृष्टिकोण](https://arxiv.org/abs /1907.11692) यिनहान लियू, मायल ओट, नमन गोयल, जिंगफेई डू, मंदार जोशी, डैनकी चेन, ओमर लेवी, माइक लुईस, ल्यूक ज़ेटलमॉयर, वेसेलिन स्टोयानोव द्वारा। -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित। +1. 
**[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा। 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP से) साथ में पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स] (https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योआव आर्टज़ी द्वारा पोस्ट किया गया। +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (फेसबुक से), साथ में पेपर [फेयरसेक S2T: फास्ट स्पीच-टू-टेक्स्ट मॉडलिंग विद फेयरसेक](https: //arxiv.org/abs/2010.05171) चांगहान वांग, यूं तांग, जुताई मा, ऐनी वू, दिमित्रो ओखोनको, जुआन पिनो द्वारा पोस्ट किया गया。 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (फेसबुक से) साथ में पेपर [लार्ज-स्केल सेल्फ- एंड सेमी-सुपरवाइज्ड लर्निंग फॉर स्पीच ट्रांसलेशन](https://arxiv.org/abs/2104.06678) चांगहान वांग, ऐनी वू, जुआन पिनो, एलेक्सी बेवस्की, माइकल औली, एलेक्सिस द्वारा Conneau द्वारा पोस्ट किया गया। 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (तेल अवीव यूनिवर्सिटी से) साथ में पेपर [स्पैन सिलेक्शन को प्री-ट्रेनिंग करके कुछ-शॉट क्वेश्चन आंसरिंग](https:// arxiv.org/abs/2101.00438) ओरि राम, युवल कर्स्टन, जोनाथन बेरेंट, अमीर ग्लोबर्सन, ओमर लेवी द्वारा। 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (बर्कले से) कागज के साथ [SqueezeBERT: कुशल तंत्रिका नेटवर्क के बारे में NLP को कंप्यूटर विज़न क्या सिखा सकता है?](https: //arxiv.org/abs/2006.11316) फॉरेस्ट एन. इनडोला, अल्बर्ट ई. शॉ, रवि कृष्णा, और कर्ट डब्ल्यू. केटज़र द्वारा। +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI से) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. द्वाराअनुसंधान पत्र [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) के साथ जारी किया गया 1. 
**[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (माइक्रोसॉफ्ट से) साथ में कागज [स्वाइन ट्रांसफॉर्मर: शिफ्टेड विंडोज का उपयोग कर पदानुक्रमित विजन ट्रांसफॉर्मर](https://arxiv.org/abs/2103.14030) ज़ी लियू, युटोंग लिन, यू काओ, हान हू, यिक्सुआन वेई, झेंग झांग, स्टीफन लिन, बैनिंग गुओ द्वारा।
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft से) साथ वाला पेपर [Swin Transformer V2: स्केलिंग अप कैपेसिटी एंड रेजोल्यूशन](https://arxiv.org/abs/2111.09883) ज़ी लियू, हान हू, युटोंग लिन, ज़ुलिआंग याओ, ज़ेंडा ज़ी, यिक्सुआन वेई, जिया निंग, यू काओ, झेंग झांग, ली डोंग, फुरु वेई, बैनिंग गुओ द्वारा।
-1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
+1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
+1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI से) साथ में पेपर [एक एकीकृत टेक्स्ट-टू-टेक्स्ट ट्रांसफॉर्मर के साथ स्थानांतरण सीखने की सीमा की खोज](https://arxiv.org/abs/1910.10683) कॉलिन रैफेल और नोम शज़ीर और एडम रॉबर्ट्स और कैथरीन ली और शरण नारंग और माइकल मटेना और यांकी झोउ और वेई ली और पीटर जे लियू द्वारा।
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI से) साथ वाला पेपर [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) कॉलिन रैफेल और नोम शज़ीर और एडम रॉबर्ट्स और कैथरीन ली और शरण नारंग और माइकल मटेना और यांकी झोउ और वेई ली और पीटर जे लियू द्वारा।
1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [पबटेबल्स-1एम: टूवर्ड्स कॉम्प्रिहेंसिव टेबल एक्सट्रैक्शन फ्रॉम अनस्ट्रक्चर्ड डॉक्यूमेंट्स](https://arxiv.org/abs/2110.00061) ब्रैंडन स्मॉक, रोहित पेसाला, रॉबिन अब्राहम द्वारा पोस्ट किया गया।
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI से) साथ में कागज [TAPAS: पूर्व-प्रशिक्षण के माध्यम से कमजोर पर्यवेक्षण तालिका पार्सिंग](https://arxiv.org/abs/2004.02349) जोनाथन हर्ज़िग, पावेल क्रिज़िस्तोफ़ नोवाक, थॉमस मुलर, फ्रांसेस्को पिकिन्नो और जूलियन मार्टिन ईसेन्च्लोस द्वारा।
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [TAPEX: टेबल प्री-ट्रेनिंग थ्रू लर्निंग अ न्यूरल SQL एक्ज़ीक्यूटर](https://arxiv.org/abs/2107.07653) कियान लियू, बेई चेन, जियाकी गुओ, मोर्टेज़ा ज़ियादी, ज़ेकी लिन, वीज़ू चेन, जियान-गुआंग लू द्वारा पोस्ट किया गया।
1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
-1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
+1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU की ओर से) साथ में कागज [ट्रांसफॉर्मर-एक्सएल: अटेंटिव लैंग्वेज मॉडल्स बियॉन्ड अ फिक्स्ड-लेंथ कॉन्टेक्स्ट](https://arxiv.org/abs/1901.02860) ज़िहांग दाई, ज़ीलिन यांग, यिमिंग यांग, जैम कार्बोनेल, क्वोक वी. ले, रुस्लान सलाखुतदीनोव द्वारा।
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
-1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
+1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
+1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research से) Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. द्वारा अनुसंधान पत्र [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) के साथ जारी किया गया
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https://arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।
+1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (सिंघुआ यूनिवर्सिटी और ननकाई यूनिवर्सिटी से) साथ में पेपर [विजुअल अटेंशन नेटवर्क](https://arxiv.org/pdf/2202.09741.pdf) मेंग-हाओ गुओ, चेंग-ज़े लू, झेंग-निंग लियू, मिंग-मिंग चेंग, शि-मिन हू द्वारा।
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (मल्टीमीडिया कम्प्यूटिंग ग्रुप, नानजिंग यूनिवर्सिटी से) साथ में पेपर [वीडियोएमएई: मास्क्ड ऑटोएन्कोडर स्व-पर्यवेक्षित वीडियो प्री-ट्रेनिंग के लिए डेटा-कुशल सीखने वाले हैं](https://arxiv.org/abs/2203.12602) ज़ान टोंग, यिबिंग सॉन्ग, जुए वांग, लिमिन वांग द्वारा पोस्ट किया गया।
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain से) साथ में कागज [ViLT: Vision-and-Language Transformer बिना कनवल्शन या रीजन सुपरविजन](https://arxiv.org/abs/2102.03334) वोनजे किम, बोक्यूंग सोन, इल्डू किम द्वारा पोस्ट किया गया।
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (गूगल एआई से) कागज के साथ [एक इमेज इज़ वर्थ 16x16 वर्ड्स: ट्रांसफॉर्मर्स फॉर इमेज रिकॉग्निशन एट स्केल](https://arxiv.org/abs/2010.11929) एलेक्सी डोसोवित्स्की, लुकास बेयर, अलेक्जेंडर कोलेसनिकोव, डिर्क वीसेनबोर्न, शियाओहुआ झाई, थॉमस अनटरथिनर, मुस्तफा देहघानी, मैथियास मिंडरर, जॉर्ज हेगोल्ड, सिल्वेन गेली, जैकब उस्ज़कोरेइट, नील हॉल्सबी द्वारा पोस्ट किया गया।
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP से) साथ वाला पेपर [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) लियुनियन हेरोल्ड ली, मार्क यात्स्कर, दा यिन, चो-जुई हसीह, काई-वेई चांग द्वारा।
-1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI से) Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. द्वारा अनुसंधान पत्र [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) के साथ जारी किया गया
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (मेटा एआई से) साथ में कागज [मास्कड ऑटोएन्कोडर स्केलेबल विजन लर्नर्स हैं](https://arxiv.org/abs/2111.06377) कैमिंग हे, ज़िनेली चेन, सेनिंग ज़ी, यांगहो ली, पिओट्र डॉलर, रॉस गिर्शिक द्वारा।
+1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL से) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. द्वारा अनुसंधान पत्र [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) के साथ जारी किया गया
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (मेटा एआई से) साथ में कागज [लेबल-कुशल सीखने के लिए मास्क्ड सियामीज़ नेटवर्क](https://arxiv.org/abs/2204.07141) महमूद असरान, मथिल्डे कैरन, ईशान मिश्रा, पियोट्र बोजानोवस्की, फ्लोरियन बोर्डेस, पास्कल विंसेंट, आर्मंड जौलिन, माइकल रब्बत, निकोलस बल्लास द्वारा।
+1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise से) Jaehyeon Kim, Jungil Kong, Juhee Son. द्वारा अनुसंधान पत्र [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) के साथ जारी किया गया
+1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन](https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI से) साथ वाला पेपर [FAIRSEQ S2T: FAIRSEQ के साथ फास्ट स्पीच-टू-टेक्स्ट मॉडलिंग](https://arxiv.org/abs/2010.05171) चांगहान वांग, यूं तांग, जुताई मा, ऐनी वू, सरव्या पोपुरी, दिमित्रो ओखोनको, जुआन पिनो द्वारा पोस्ट किया गया।
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI से) साथ वाला पेपर [सरल और प्रभावी जीरो-शॉट क्रॉस-लिंगुअल फोनेम रिकॉग्निशन](https://arxiv.org/abs/2109.11680) कियानटोंग जू, एलेक्सी बाएव्स्की, माइकल औली द्वारा।
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (माइक्रोसॉफ्ट रिसर्च से) पेपर के साथ जारी किया गया [WavLM: फुल स्टैक स्पीच प्रोसेसिंग के लिए बड़े पैमाने पर स्व-पर्यवेक्षित पूर्व-प्रशिक्षण](https://arxiv.org/abs/2110.13900) सानयुआन चेन, चेंगयी वांग, झेंगयांग चेन, यू वू, शुजी लियू, ज़ुओ चेन, जिन्यु ली, नाओयुकी कांडा, ताकुया योशियोका, ज़िओंग जिओ, जियान वू, लॉन्ग झोउ, शुओ रेन, यानमिन कियान, याओ कियान, जियान वू, माइकल ज़ेंग, फुरु वेई।
1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI से) साथ में कागज [बड़े पैमाने पर कमजोर पर्यवेक्षण के माध्यम से मजबूत भाषण पहचान](https://cdn.openai.com/papers/whisper.pdf) एलेक रैडफोर्ड, जोंग वूक किम, ताओ जू, ग्रेग ब्रॉकमैन, क्रिस्टीन मैकलीवे, इल्या सुत्स्केवर द्वारा।
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [एक्सपैंडिंग लैंग्वेज-इमेज प्रीट्रेन्ड मॉडल फॉर जनरल वीडियो रिकग्निशन](https://arxiv.org/abs/2208.02816) बोलिन नी, होउवेन पेंग, मिंगाओ चेन, सोंगयांग झांग, गाओफेंग मेंग, जियानलोंग फू, शिमिंग जियांग, हैबिन लिंग द्वारा।
+1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI से) Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. द्वारा अनुसंधान पत्र [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) के साथ जारी किया गया
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (फेसबुक से) साथ में पेपर [क्रॉस-लिंगुअल लैंग्वेज मॉडल प्रीट्रेनिंग](https://arxiv.org/abs/1901.07291) गिलाउम लैम्पल और एलेक्सिस कोनो द्वारा।
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में कागज [ProphetNet: प्रेडिक्टिंग फ्यूचर एन-ग्राम फॉर सीक्वेंस-टू-सीक्वेंस प्री-ट्रेनिंग](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा।
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (फेसबुक एआई से), साथ में पेपर [अनसुपरवाइज्ड क्रॉस-लिंगुअल रिप्रेजेंटेशन लर्निंग एट स्केल](https://arxiv.org/abs/1911.02116) एलेक्सिस कोन्यू*, कार्तिकेय खंडेलवाल*, नमन गोयल, विश्रव चौधरी, गिलाउम वेनज़ेक, फ्रांसिस्को गुज़मैन, एडौर्ड ग्रेव, मायल ओट, ल्यूक ज़ेटलमॉयर और वेसेलिन स्टोयानोव द्वारा।
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI से) साथ में कागज [बहुभाषी नकाबपोश भाषा मॉडलिंग के लिए बड़े पैमाने पर ट्रांसफॉर्मर](https://arxiv.org/abs/2105.00572) नमन गोयल, जिंगफेई डू, मायल ओट, गिरि अनंतरामन, एलेक्सिस कोनो द्वारा पोस्ट किया गया।
+1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU से) साथ वाला पेपर [XLNet: जनरलाइज्ड ऑटोरेग्रेसिव प्रीट्रेनिंग फॉर लैंग्वेज अंडरस्टैंडिंग](https://arxiv.org/abs/1906.08237) ज़ीलिन यांग*, ज़िहांग दाई*, यिमिंग यांग, जैम कार्बोनेल, रुस्लान सलाखुतदीनोव, क्वोक वी. ले द्वारा।
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI से) साथ वाला पेपर [XLS-R: सेल्फ सुपरवाइज्ड क्रॉस-लिंगुअल स्पीच रिप्रेजेंटेशन लर्निंग एट स्केल](https://arxiv.org/abs/2111.09296) अरुण बाबू, चांगहान वांग, एंड्रोस तजंद्रा, कुशाल लखोटिया, कियानटोंग जू, नमन गोयल, कृतिका सिंह, पैट्रिक वॉन प्लैटन, याथार्थ सराफ, जुआन पिनो, एलेक्सी बेवस्की, एलेक्सिस कोन्यू, माइकल औली द्वारा पोस्ट किया गया।
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (फेसबुक एआई से) साथ में पेपर [अनसुपरवाइज्ड क्रॉस-लिंगुअल रिप्रेजेंटेशन लर्निंग फॉर स्पीच रिकग्निशन](https://arxiv.org/abs/2006.13979) एलेक्सिस कोन्यू, एलेक्सी बेवस्की, रोनन कोलोबर्ट, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
@@ -406,7 +466,7 @@ conda install -c huggingface transformers
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में पेपर [यू ओनली सैंपल (लगभग)] ज़ानपेंग ज़ेंग, युनयांग ज़िओंग, सत्य एन. रवि, शैलेश आचार्य, ग्लेन फंग, विकास सिंह द्वारा पोस्ट किया गया।
1. एक नए मॉडल में योगदान देना चाहते हैं? नए मॉडल जोड़ने में आपका मार्गदर्शन करने के लिए हमारे पास एक **विस्तृत मार्गदर्शिका और टेम्प्लेट** है। आप उन्हें [`टेम्पलेट्स`](./templates) निर्देशिका में पा सकते हैं। पीआर शुरू करने से पहले [योगदान दिशानिर्देश](./CONTRIBUTING.md) देखना और अनुरक्षकों से संपर्क करना या प्रतिक्रिया प्राप्त करने के लिए एक नया मुद्दा खोलना याद रखें।
-यह जांचने के लिए कि क्या किसी मॉडल में पहले से ही Flax, PyTorch या TensorFlow का कार्यान्वयन है, या यदि उसके पास Tokenizers लाइब्रेरी में संबंधित टोकन है, तो [यह तालिका] (https://huggingface.co/ docs/transformers/index#supported) देखें। -फ्रेमवर्क)।
+यह जांचने के लिए कि क्या किसी मॉडल में पहले से ही Flax, PyTorch या TensorFlow का कार्यान्वयन है, या यदि उसके पास Tokenizers लाइब्रेरी में संबंधित टोकनाइज़र है, तो [यह तालिका](https://huggingface.co/docs/transformers/index#supported-frameworks) देखें।
इन कार्यान्वयनों का परीक्षण कई डेटासेट पर किया गया है (उदाहरण स्क्रिप्ट देखें) और इन्हें वैनिला कार्यान्वयन के तुलनीय प्रदर्शन करना चाहिए। आप उपयोग के मामलों का विवरण दस्तावेज़ के [इस अनुभाग](https://huggingface.co/docs/transformers/examples) में पढ़ सकते हैं।
diff --git a/README_ja.md b/README_ja.md
index 72f23dbeae3d..1ada3be1f4bc 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -53,7 +53,7 @@ user: ユーザ

@@ -82,7 +82,7 @@ user: ユーザ
@@ -258,7 +258,7 @@ And here is the equivalent code for TensorFlow: ### pipにて -このリポジトリは、Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+ でテストされています。 +このリポジトリは、Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, TensorFlow 2.6+ でテストされています。 🤗Transformersは[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。Pythonの仮想環境に慣れていない場合は、[ユーザーガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)を確認してください。 @@ -298,8 +298,11 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 🤗Transformersは現在、以下のアーキテクチャを提供しています(それぞれのハイレベルな要約は[こちら](https://huggingface.co/docs/transformers/model_summary)を参照してください): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago から) Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut から公開された研究論文: [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research から) Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. から公開された研究論文 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (MIT から) Yuan Gong, Yu-An Chung, James Glass から公開された研究論文: [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (Facebook から) Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer から公開された研究論文: [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (École polytechnique から) Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis から公開された研究論文: [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research から) Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen から公開された研究論文: [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) @@ -309,24 +312,31 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research から) Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen から公開された研究論文: [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce から) Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. から公開された研究論文 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました. 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (Harbin Institute of Technology/Microsoft Research Asia/Intel Labs から) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (NAVER CLOVA から) Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. から公開された研究論文 [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research から) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel から公開された研究論文: [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 
から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (MetaAI から) Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. から公開された研究論文 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia から) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang から公開された研究論文: [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech から) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan から公開された研究論文: [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (OpenBMB から) [OpenBMB](https://www.openbmb.org/) から公開されました. 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) @@ -335,36 +345,53 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research から) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai から公開された研究論文: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook から) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou から公開された研究論文: [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI から) Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. から公開された研究論文 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin から) Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. から公開された研究論文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137) 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook から) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko から公開された研究論文: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research から) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan から公開された研究論文: [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs から) Ali Hassani and Humphrey Shi から公開された研究論文: [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) +1. 
**[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (Meta AI から) Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. から公開された研究論文 [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace から), Victor Sanh, Lysandre Debut and Thomas Wolf. 同じ手法で GPT2, RoBERTa と Multilingual BERT の圧縮を行いました.圧縮されたモデルはそれぞれ [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) と名付けられました. 公開された研究論文: [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research から) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei から公開された研究論文: [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から), Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (Snap Research から) Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. から公開された研究論文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (Meta AI から) Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. から公開された研究論文 [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) 1. 
**[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) -1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu から) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. から公開された研究論文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research から) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. から公開された研究論文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI から) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach から公開された研究論文: [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (ABEJA から) Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori からリリース. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce から) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. から公開された研究論文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) @@ -373,6 +400,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI から) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze から公開された研究論文: [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI から) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. から公開された研究論文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI から) Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.. から公開された研究論文 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) @@ -381,32 +410,50 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team が現在開発中です. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 1. 
**[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI から) Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. から公開された研究論文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook から) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. から公開された研究論文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research から) Peng Wang, Cheng Da, and Cong Yao. から公開された研究論文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia から) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka から公開された研究論文: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook から) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 
から公開された研究論文 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain から) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou から公開された研究論文: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. から) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam から公開された研究論文: [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. から) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen から公開された研究論文: [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple から) Sachin Mehta and Mohammad Rastegari から公開された研究論文: [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (Apple から) Sachin Mehta and Mohammad Rastegari. から公開された研究論文 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research から) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu から公開された研究論文: [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (MosaiML から) the MosaicML NLP Team. から公開された研究論文 [llm-foundry](https://github.com/mosaicml/llm-foundry/) +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (the University of Wisconsin - Madison から) Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. から公開された研究論文 [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI から) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel から公開された研究論文: [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box から) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen から公開された研究論文: [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs から) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi から公開された研究論文: [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 1. 
**[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta から) the NLLB team. から公開された研究論文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI から) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. から公開された研究論文 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. 
Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT から) Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. から公開された研究論文 [blog post](https://www.adept.ai/blog/persimmon-8b) 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research から) Dat Quoc Nguyen and Anh Tuan Nguyen から公開された研究論文: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google から) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. から公開された研究論文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. から) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. から公開された研究論文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook から) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela から公開された研究論文: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research から) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang から公開された研究論文: [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) @@ -415,52 +462,65 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research から) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun から公開された研究論文: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook から), Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov から公開された研究論文: [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM) 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research から) Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. から公開された研究論文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook から), Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino から公開された研究論文: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI から) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. から公開された研究論文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) -1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開された研究論文: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開されたレポジトリー [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 1. 
**[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill から), Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research から) Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. から公開された研究論文 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741) 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 1. 
**[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI から) Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. から公開された研究論文 [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL から) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. から公開された研究論文 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise から) Jaehyeon Kim, Jungil Kong, Juhee Son. から公開された研究論文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 1. 
**[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI から) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino から公開された研究論文: [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI から) Qiantong Xu, Alexei Baevski, Michael Auli から公開された研究論文: [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research から) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei から公開された研究論文: [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI から) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever から公開された研究論文: [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research から) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling から公開された研究論文: [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI から) Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. から公開された研究論文 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li から公開された研究論文: [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook から) Guillaume Lample and Alexis Conneau から公開された研究論文: [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI から), Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov から公開された研究論文: [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 1. 
**[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI から), Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau から公開された研究論文: [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (Meta AI から) Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa から公開された研究論文: [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU から) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le から公開された研究論文: [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI から) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli から公開された研究論文: [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI から) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) diff --git a/README_ko.md b/README_ko.md index 8d0443fd6f50..35fc4e7f453f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -18,7 +18,7 @@ limitations under the License.

@@ -47,7 +47,7 @@ limitations under the License.
Español | 日本語 | हिन्दी
@@ -175,7 +175,7 @@ limitations under the License. ### pip로 설치하기 -이 저장소는 Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+에서 테스트 되었습니다. +이 저장소는 Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, TensorFlow 2.6+에서 테스트 되었습니다. [가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요. @@ -213,8 +213,11 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 🤗 Transformers는 다음 모델들을 제공합니다 (각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/model_summary)서 확인하세요): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (Google Research 에서 제공)은 Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.의 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918)논문과 함께 발표했습니다. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -224,24 +227,31 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (Salesforce 에서 제공)은 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.의 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597)논문과 함께 발표했습니다. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa 에서) Adrian de Wynter and Daniel J. Perry 의 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 논문과 함께 발표했습니다. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (NAVER CLOVA 에서 제공)은 Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.의 [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539)논문과 함께 발표했습니다. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research 에서) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 의 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 논문과 함께 발표했습니다. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다. +1. 
**[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (MetaAI 에서 제공)은 Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.의 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/)논문과 함께 발표했습니다. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia 에서) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 의 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 논문과 함께 발표했습니다. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech 에서) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 의 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 논문과 함께 발표했습니다. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI 에서) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 의 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 논문과 함께 발표했습니다. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University 에서) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 의 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 논문과 함께 발표했습니다. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다. @@ -250,36 +260,53 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google 에서) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 의 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 논문과 함께 발표했습니다. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research 에서) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 의 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 논문과 함께 발표했습니다. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook 에서) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 의 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 논문과 함께 발표했습니다. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI 에서 제공)은 Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.의 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505)논문과 함께 발표했습니다. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin 에서 제공)은 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.의 [NMS Strikes Back](https://arxiv.org/abs/2212.06137)논문과 함께 발표했습니다. 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook 에서) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 의 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 논문과 함께 발표했습니다. 1. 
**[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research 에서) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 의 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 논문과 함께 발표했습니다. 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs 에서) Ali Hassani and Humphrey Shi 의 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 논문과 함께 발표했습니다. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (Meta AI 에서 제공)은 Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.의 [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)논문과 함께 발표했습니다. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace 에서) Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT 의 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 논문과 함께 발표했습니다. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research 에서) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 의 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 논문과 함께 발표했습니다. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER 에서) Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 의 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 논문과 함께 발표했습니다. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook 에서) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 의 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 논문과 함께 발표했습니다. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs 에서) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 의 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 논문과 함께 발표했습니다. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. 
**[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University 에서) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 의 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 논문과 함께 발표했습니다. +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (Meta AI 에서 제공)은 Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.의 [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438)논문과 함께 발표했습니다. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research 에서) Sascha Rothe, Shashi Narayan, Aliaksei Severyn 의 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 논문과 함께 발표했습니다. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu 에서) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 의 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) 논문과 함께 발표했습니다. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu 에서 제공)은 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.의 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)논문과 함께 발표했습니다. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI 에서) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbac 의 [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) 논문과 함께 발표했습니다. 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI 에서) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 의 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 논문과 함께 발표했습니다. -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI 에서) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 의 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 논문과 함께 발표했습니다. +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce 에서 제공)은 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.의 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)논문과 함께 발표했습니다. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI 에서) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever 의 [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 논문과 함께 발표했습니다. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia 에서) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 의 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 논문과 함께 발표했습니다. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia 에서) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 의 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 논문과 함께 발표했습니다. @@ -288,6 +315,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI 에서) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 의 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 논문과 함께 발표했습니다. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology 에서) Jiapeng Wang, Lianwen Jin, Kai Ding 의 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 논문과 함께 발표했습니다. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.의 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)논문과 함께 발표했습니다. +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom..의 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX)논문과 함께 발표했습니다. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI 에서) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 의 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 논문과 함께 발표했습니다. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia 에서) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 의 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 논문과 함께 발표했습니다. @@ -296,32 +325,50 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook 에서) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 의 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 논문과 함께 발표했습니다. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia 에서) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 의 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 논문과 함께 발표했습니다. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC 에서 제공)은 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)논문과 함께 발표했습니다. 1. 
**[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC 에서) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 의 [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) 논문과 함께 발표했습니다. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI 에서 제공)은 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.의 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662)논문과 함께 발표했습니다. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 의 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 논문과 함께 발표했습니다. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 의 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 논문과 함께 발표했습니다. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook 에서 제공)은 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.의 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)논문과 함께 발표했습니다. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research 에서 제공)은 Peng Wang, Cheng Da, and Cong Yao.의 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)논문과 함께 발표했습니다. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia 에서) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 의 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 논문과 함께 발표했습니다. +1. 
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook 에서 제공)은 Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.의 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)논문과 함께 발표했습니다. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain 에서) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 의 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 논문과 함께 발표했습니다. 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. 에서) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 의 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 논문과 함께 발표했습니다. 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. 에서) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 의 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 논문과 함께 발표했습니다. 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple 에서) Sachin Mehta and Mohammad Rastegari 의 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 논문과 함께 발표했습니다. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (Apple 에서 제공)은 Sachin Mehta and Mohammad Rastegari.의 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680)논문과 함께 발표했습니다. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research 에서) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 의 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 논문과 함께 발표했습니다. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (MosaiML 에서 제공)은 the MosaicML NLP Team.의 [llm-foundry](https://github.com/mosaicml/llm-foundry/)논문과 함께 발표했습니다. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (the University of Wisconsin - Madison 에서 제공)은 Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.의 [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) 논문과 함께 발표했습니다. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI 에서) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 의 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 논문과 함께 발표했습니다. +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box 에서) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 의 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 논문과 함께 발표했습니다. 1. 
**[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs 에서) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 의 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 논문과 함께 발표했습니다. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab 에서) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 의 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 논문과 함께 발표했습니다. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta 에서) the NLLB team 의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 논문과 함께 발표했습니다. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta 에서 제공)은 the NLLB team.의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)논문과 함께 발표했습니다. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI 에서 제공)은 Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.의 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418)논문과 함께 발표했습니다. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT 에서 제공)은 Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.의 [blog post](https://www.adept.ai/blog/persimmon-8b)논문과 함께 발표했습니다. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research 에서) Dat Quoc Nguyen and Anh Tuan Nguyen 의 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 논문과 함께 발표했습니다. +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google 에서 제공)은 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.의 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)논문과 함께 발표했습니다. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP 에서) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 의 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 논문과 함께 발표했습니다. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs 에서) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 의 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 논문과 함께 발표했습니다. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. 에서 제공)은 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.의 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf)논문과 함께 발표했습니다. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA 에서) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 의 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 논문과 함께 발표했습니다. 1. 
**[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook 에서) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 의 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 논문과 함께 발표했습니다. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research 에서) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 의 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 논문과 함께 발표했습니다. @@ -330,57 +377,70 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research 에서) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 의 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 논문과 함께 발표했습니다. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research 에서) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 의 [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 논문과 함께 발표했습니다. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook 에서) Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 의 a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 논문과 함께 발표했습니다. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. 
Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research 에서 제공)은 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.의 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)논문과 함께 발표했습니다. 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 의 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다. 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook 에서) Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 의 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 논문과 함께 발표했습니다. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University 에서) Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 의 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 논문과 함께 발표했습니다. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI 에서 제공)은 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.의 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)논문과 함께 발표했습니다. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft 에서) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 의 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 논문과 함께 발표했습니다. 1. 
**[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft 에서) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 의 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 논문과 함께 발표했습니다. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer. 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (Google 에서) William Fedus, Barret Zoph, Noam Shazeer. 의 [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) 논문과 함께 발표했습니다. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI 에서) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 의 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 논문과 함께 발표했습니다. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research 에서) Brandon Smock, Rohith Pesala, Robin Abraham 의 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 논문과 함께 발표했습니다. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI 에서) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 의 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 논문과 함께 발표했습니다. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research 에서) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 의 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 논문과 함께 발표했습니다. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. 
**[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (Facebook 에서) Gedas Bertasius, Heng Wang, Lorenzo Torresani 의 [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) 논문과 함께 발표했습니다. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzle 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다. +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research 에서 제공)은 Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.의 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi)논문과 함께 발표했습니다. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University 에서 제공)은 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.의 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)논문과 함께 발표했습니다. 1. 
**[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University 에서) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 의 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 논문과 함께 발표했습니다. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University 에서) Zhan Tong, Yibing Song, Jue Wang, Limin Wang 의 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 논문과 함께 발표했습니다. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain 에서) Wonjae Kim, Bokyung Son, Ildoo Kim 의 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 논문과 함께 발표했습니다. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP 에서) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 의 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 논문과 함께 발표했습니다. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI 에서 제공)은 Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.의 [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527)논문과 함께 발표했습니다. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI 에서) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 의 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 논문과 함께 발표했습니다. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL 에서 제공)은 Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.의 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272)논문과 함께 발표했습니다. 1. 
**[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI 에서) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 의 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) 논문과 함께 발표했습니다. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise 에서 제공)은 Jaehyeon Kim, Jungil Kong, Juhee Son.의 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103)논문과 함께 발표했습니다. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI 에서) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 의 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 논문과 함께 발표했습니다. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 의 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI 에서) Qiantong Xu, Alexei Baevski, Michael Auli 의 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 논문과 함께 발표했습니다. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research 에서) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei 의 [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) 논문과 함께 발표했습니다. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 의 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 논문과 함께 발표했습니다. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research 에서) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 의 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 논문과 함께 발표했습니다. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI 에서 제공)은 Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.의 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255)논문과 함께 발표했습니다. 1. 
**[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (Facebook AI 에서 제공) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li 의 [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) 논문과 함께 발표했습니다. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook 에서) Guillaume Lample and Alexis Conneau 의 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 논문과 함께 발표했습니다. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI 에서) Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 의 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 논문과 함께 발표했습니다. 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI 에서) Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau 의 [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) 논문과 함께 발표했습니다. +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (Meta AI 에서) Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa 의 [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 논문과 함께 발표했습니다. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU 에서) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 의 [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 논문과 함께 발표했습니다. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI 에서) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 의 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 논문과 함께 발표했습니다. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI 에서) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 의 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 논문과 함께 발표했습니다. 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (Huazhong University of Science & Technology 에서) Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 의 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 논문과 함께 발표했습니다. -1. 
**[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison 에서) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 의 [You Only Sample (Almost) 논문과 함께 발표했습니다. +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison 에서) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 의 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 논문과 함께 발표했습니다. 1. 새로운 모델을 올리고 싶나요? 우리가 **상세한 가이드와 템플릿** 으로 새로운 모델을 올리도록 도와드릴게요. 가이드와 템플릿은 이 저장소의 [`templates`](./templates) 폴더에서 확인하실 수 있습니다. [컨트리뷰션 가이드라인](./CONTRIBUTING.md)을 꼭 확인해주시고, PR을 올리기 전에 메인테이너에게 연락하거나 이슈를 오픈해 피드백을 받으시길 바랍니다. 각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/index#supported-frameworks)를 확인하세요. diff --git a/README_ru.md b/README_ru.md new file mode 100644 index 000000000000..8a15bf871ea7 --- /dev/null +++ b/README_ru.md @@ -0,0 +1,549 @@ + + +

+<!-- Логотип: Hugging Face Transformers Library -->
+
+<!-- Значки: Build | GitHub | Documentation | GitHub release | Contributor Covenant | DOI -->
+
+<p align="center">
+    English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский
+</p>
+
+<p align="center">
+    <b>Современное машинное обучение для JAX, PyTorch и TensorFlow</b>
+</p>
+
+🤗 Transformers предоставляет тысячи предварительно обученных моделей для выполнения различных задач в таких областях, как текст, зрение и аудио.
+
+Эти модели могут быть применены в следующих областях:
+
+* 📝 Текст - для таких задач, как классификация текстов, извлечение информации, ответы на вопросы, обобщение, перевод, генерация текстов, на более чем 100 языках.
+* 🖼️ Изображения - для задач классификации изображений, обнаружения объектов и сегментации.
+* 🗣️ Аудио - для задач распознавания речи и классификации аудио.
+
+Модели transformers также могут решать задачи, объединяющие несколько модальностей, такие как ответы на вопросы по таблицам, распознавание оптических символов, извлечение информации из отсканированных документов, классификация видео и ответы на визуальные вопросы.
+
+🤗 Transformers предоставляет API для быстрой загрузки и использования предварительно обученных моделей, их тонкой настройки на собственных датасетах и последующего обмена ими с сообществом на нашем [сайте](https://huggingface.co/models). В то же время каждый Python-модуль, определяющий архитектуру, полностью автономен и может быть модифицирован для проведения быстрых исследовательских экспериментов.
+
+🤗 Transformers опирается на три самые популярные библиотеки глубокого обучения - [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) и [TensorFlow](https://www.tensorflow.org/) - и легко интегрируется с ними. Это позволяет обучать модели с помощью одной из них, а затем загружать их для выводов с помощью другой.
+
+## Онлайн демонстрация
+
+Большинство наших моделей можно протестировать непосредственно на их страницах на [сайте](https://huggingface.co/models). Мы также предлагаем [приватный хостинг моделей, контроль версий и API для выводов](https://huggingface.co/pricing) для публичных и частных моделей.
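+К API для выводов можно обращаться и обычным HTTP-запросом, без установки библиотеки. Ниже - примерный набросок такого запроса (адрес конечной точки и имя контрольной точки приведены как предположение о типовой конфигурации размещённого Inference API; токен доступа - заглушка):
+
+```python
+>>> import requests
+
+# Адрес размещённого Inference API для конкретной модели (предположение о типовой конечной точке)
+>>> API_URL = "https://api-inference.huggingface.co/models/distilbert-base-uncased-finetuned-sst-2-english"
+# hf_xxx - заглушка: подставьте свой токен доступа с https://huggingface.co/settings/tokens
+>>> headers = {"Authorization": "Bearer hf_xxx"}
+
+>>> response = requests.post(API_URL, headers=headers, json={"inputs": "We are very happy to show you the 🤗 Transformers library."})
+>>> response.json()  # список меток тональности с оценками уверенности
+```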
+ +Вот несколько примеров: + +В области NLP ( Обработка текстов на естественном языке ): +- [Маскированное заполнение слов с помощью BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Распознавание сущностей с помощью Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) +- [Генерация текста с помощью GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) +- [Выводы на естественном языке с помощью RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Обобщение с помощью BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) +- [Ответы на вопросы с помощью DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Перевод с помощью T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) + +В области компьютерного зрения: +- [Классификация изображений с помощью ViT](https://huggingface.co/google/vit-base-patch16-224) +- [Обнаружение объектов с помощью DETR](https://huggingface.co/facebook/detr-resnet-50) +- [Семантическая сегментация с помощью SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) +- [Сегментация паноптикума с помощью MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco) +- [Оценка глубины с помощью DPT](https://huggingface.co/docs/transformers/model_doc/dpt) +- [Классификация видео с помощью 
VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [Универсальная сегментация с помощью OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+В области звука:
+- [Автоматическое распознавание речи с помощью Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
+- [Поиск ключевых слов с помощью Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [Классификация аудиоданных с помощью трансформера аудиоспектрограмм](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+В мультимодальных задачах:
+- [Ответы на вопросы по таблице с помощью TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [Визуальные ответы на вопросы с помощью ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [Zero-shot классификация изображений с помощью CLIP](https://huggingface.co/openai/clip-vit-large-patch14)
+- [Ответы на вопросы по документам с помощью LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [Zero-shot классификация видео с помощью X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+
+## 100 проектов, использующих Transformers
+
+Transformers - это не просто набор инструментов для использования предварительно обученных моделей: это сообщество проектов, созданных на его основе, и Hugging Face Hub. Мы хотим, чтобы Transformers позволил разработчикам, исследователям, студентам, преподавателям, инженерам и всем желающим создавать проекты своей мечты.
+
+Чтобы отпраздновать 100 тысяч звезд Transformers, мы решили сделать акцент на сообществе и создали страницу [awesome-transformers](./awesome-transformers.md), на которой перечислены 100 невероятных проектов, созданных с помощью transformers.
+
+Если вы являетесь владельцем или пользователем проекта, который, по вашему мнению, должен быть включен в этот список, пожалуйста, откройте PR для его добавления!
+
+## Если вы хотите получить индивидуальную поддержку от команды Hugging Face
+
+<!-- Баннер: HuggingFace Expert Acceleration Program -->
+
+## Quick tour
+
+To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for sentiment analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "POSITIVE" with a confidence of 99.97%.
+
+Many tasks have a ready-made `pipeline`, in NLP as well as in computer vision and speech. For example, we can easily extract the objects detected in an image:
+
+```python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# Download an image with cute cats
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+# Allocate a pipeline for object detection
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+  'label': 'remote',
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+  'label': 'remote',
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+  'label': 'couch',
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+  'label': 'cat',
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+  'label': 'cat',
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+Here we get a list of objects detected in the image, with a box surrounding each object and a confidence score. The original image is on the left, and the predictions are on the right:
+

+ + +

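+
+In the examples above, the task name alone selects a sensible default model. If you prefer a specific checkpoint from the Hub, `pipeline` also accepts a `model` argument; the minimal sketch below uses one well-known sentiment-analysis checkpoint purely as an illustration, not as the only choice:
+
+```python
+>>> from transformers import pipeline
+
+# Pin an explicit checkpoint instead of relying on the task's default model
+>>> classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+>>> classifier("We are very happy to introduce pipeline to the transformers repository.")
+```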
+
+You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).
+
+In addition to `pipeline`, downloading and using any of the pretrained models on a given task takes just three lines of code. Here is the PyTorch version:
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+And here is the equivalent code for TensorFlow:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the examples above) or on a list. It outputs a dictionary that you can use in downstream code or simply pass straight to your model using the ** argument-unpacking operator.
+
+The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on the backend you use), which you can work with as usual. [This guide](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune it on a new dataset; two short sketches follow below.
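+
+As a small illustration of calling the tokenizer on a list, the sketch below builds on the PyTorch example above (the two sentences are arbitrary; `padding` and `truncation` are standard tokenizer options):
+
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+# Tokenizing a list of sentences returns a single padded batch as one dictionary
+>>> batch = tokenizer(["Hello world!", "Transformers provides thousands of pretrained models."], padding=True, truncation=True, return_tensors="pt")
+>>> outputs = model(**batch)  # the dictionary unpacks directly into the forward pass
+```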
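+
+And for the `Trainer` API, a minimal fine-tuning sketch could look like the following; the dataset (`imdb` loaded via 🤗 Datasets), the `text` column name and the hyperparameters are illustrative assumptions rather than the only possible setup:
+
+```python
+from datasets import load_dataset
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+)
+
+# A tiny slice of a public dataset, purely for illustration
+dataset = load_dataset("imdb", split="train[:1%]")
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+
+# Tokenize the text column so every example has the same fixed length
+def tokenize(batch):
+    return tokenizer(batch["text"], padding="max_length", truncation=True)
+
+tokenized_dataset = dataset.map(tokenize, batched=True)
+
+trainer = Trainer(
+    model=model,
+    args=TrainingArguments(output_dir="test-trainer", num_train_epochs=1, per_device_train_batch_size=8),
+    train_dataset=tokenized_dataset,
+)
+trainer.train()
+```
+
+The training guide linked above walks through the same workflow in more detail, including evaluation and the TensorFlow/Keras path.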
+
+## Why should I use transformers?
+
+1. Easy-to-use state-of-the-art models:
+    - High performance on natural language understanding & generation, computer vision, and audio tasks.
+    - Low barrier to entry for educators and practitioners.
+    - Few user-facing abstractions with just three classes to learn.
+    - A unified API for using all our pretrained models.
+
+1. Lower compute costs, smaller carbon footprint:
+    - Researchers can share trained models instead of always retraining them.
+    - Practitioners can reduce compute time and production costs.
+    - Dozens of architectures with over 60,000 pretrained models across all modalities.
+
+1. Choose the right framework for every part of a model's lifetime:
+    - Train state-of-the-art models in 3 lines of code.
+    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
+    - Seamlessly pick the right framework for training, evaluation, and production.
+
+1. Easily customize a model or an example to your needs:
+    - We provide examples for each architecture to reproduce the results published by its original authors.
+    - Model internals are exposed as consistently as possible.
+    - Model files can be used independently of the library for quick experiments.
+
+## Why shouldn't I use transformers?
+
+- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is deliberately not refactored with additional abstractions, so that researchers can quickly iterate on each of the models without diving into extra abstractions/files.
+- The training API is not intended to work on any model; it is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly [Accelerate](https://huggingface.co/docs/accelerate)).
+- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. They are not expected to work out-of-the-box on your specific problem, and you will need to change a few lines of code to adapt them to your needs.
+
+## Installation
+
+### With pip
+
+This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+.
+
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+
+First, create a virtual environment with the version of Python you're going to use and activate it.
+
+Then, you will need to install at least one backend out of Flax, PyTorch or TensorFlow.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/), the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or the [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages for the specific installation command for your platform.
+
+When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
+
+```bash
+pip install transformers
+```
+
+If you'd like to play with the examples or need the bleeding-edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).
+
+### With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```bash
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.
+
+> **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).
+
+## Model architectures
+
+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
+
+Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):
+
+1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
+1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
+1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
+1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
+1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
+1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. 
**[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. +1. 
**[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. +1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. +1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. +1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. +1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). +1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. +1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. +1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. 
**[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. +1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. +1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. +1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. 
**[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. +1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. 
**[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. +1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. 
**[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. +1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. 
**[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. +1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. +1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. +1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. +1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. 
**[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. +1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. 
Peters, Arman Cohan. +1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. +1. 
**[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. +1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. +1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. +1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) 
released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. +1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaiML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh. +1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. +1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. +1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. 
**[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). +1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. +1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. 
**[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. +1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. +1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. 
**[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou. +1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. +1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. +1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1.
**[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. +1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. +1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. 
**[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. +1. 
**[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. +1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. +1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. +1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. +1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. +1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1.
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. +1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. +1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. 
**[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. +1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. +1. Want to contribute a new model? We have added a **detailed guide and templates** to help you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR. + +Чтобы проверить, есть ли у каждой модели реализация на Flax, PyTorch или TensorFlow, или связанный с ней токенизатор, поддерживаемый библиотекой 🤗 Tokenizers, обратитесь к [этой таблице](https://huggingface.co/docs/transformers/index#supported-frameworks). + +Эти реализации были протестированы на нескольких наборах данных (см. примеры скриптов) и должны соответствовать производительности оригинальных реализаций. Более подробную информацию о производительности можно найти в разделе "Примеры" [документации](https://github.com/huggingface/transformers/tree/main/examples).
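As a minimal sketch of the framework and tokenizer support described above (the checkpoint name `bert-base-uncased` is only an illustrative example, and the snippet assumes an environment with both PyTorch and TensorFlow installed), the same pretrained weights can be loaded through either backend:

```python
from transformers import AutoModel, AutoTokenizer, TFAutoModel

checkpoint = "bert-base-uncased"  # illustrative checkpoint; any entry from the supported-frameworks table works

# Fast tokenizer backed by the 🤗 Tokenizers library
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The same pretrained weights, loaded once as a PyTorch module and once as a TensorFlow/Keras model
pt_model = AutoModel.from_pretrained(checkpoint)
tf_model = TFAutoModel.from_pretrained(checkpoint)

inputs = tokenizer("Hello world!", return_tensors="pt")
print(pt_model(**inputs).last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
```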
+ + +## Изучи больше + +| Секция | Описание | +|-|-| +| [Документация](https://huggingface.co/docs/transformers/) | Полная документация по API и гайды | +| [Краткие описания задач](https://huggingface.co/docs/transformers/task_summary) | Задачи поддерживаются 🤗 Transformers | +| [Пособие по предварительной обработке](https://huggingface.co/docs/transformers/preprocessing) | Использование класса `Tokenizer` для подготовки данных для моделей | +| [Обучение и доработка](https://huggingface.co/docs/transformers/training) | Использование моделей, предоставляемых 🤗 Transformers, в цикле обучения PyTorch/TensorFlow и API `Trainer`. | +| [Быстрый тур: Тонкая настройка/скрипты использования](https://github.com/huggingface/transformers/tree/main/examples) | Примеры скриптов для тонкой настройки моделей на широком спектре задач | +| [Совместное использование и загрузка моделей](https://huggingface.co/docs/transformers/model_sharing) | Загружайте и делитесь с сообществом своими доработанными моделями | + +## Цитирование + +Теперь у нас есть [статья](https://www.aclweb.org/anthology/2020.emnlp-demos.6/), которую можно цитировать для библиотеки 🤗 Transformers: +```bibtex +@inproceedings{wolf-etal-2020-transformers, + title = "Transformers: State-of-the-Art Natural Language Processing", + author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", + month = oct, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", + pages = "38--45" +} +``` diff --git a/README_zh-hans.md b/README_zh-hans.md index 8a7b507599b3..8c03789b8506 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -26,7 +26,7 @@ token: 词符(并用括号标注原英文) tokenize: 词符化(并用括号标注原英文) tokenizer: 词符化器(并用括号标注原英文) transformer: transformer(不翻译) -pipeline: 流水线 +pipeline: 流水线 API: API (不翻译) inference: 推理 Trainer: 训练器。当作为类名出现时不翻译。 @@ -43,7 +43,7 @@ checkpoint: 检查点

[README_zh-hans.md header hunk: "Build" badge and logo HTML omitted]
@@ -72,7 +72,7 @@ checkpoint: 检查点
[language-selector links hunk (Español | 日本語 | हिन्दी); surrounding HTML omitted]
@@ -83,11 +83,11 @@ checkpoint: 检查点

-🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨让最先进的 NLP 技术人人易用。 +🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。 🤗 Transformers 提供了便于快速下载和使用的API,让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时,每个定义的 Python 模块均完全独立,方便修改和快速研究实验。 -🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。 +🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。 ## 在线演示 @@ -200,7 +200,7 @@ checkpoint: 检查点 ### 使用 pip -这个仓库已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下经过测试。 +这个仓库已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.10+ 和 TensorFlow 2.6+ 下经过测试。 你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境,请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 @@ -237,8 +237,11 @@ conda install -c huggingface transformers 🤗 Transformers 目前支持如下的架构(模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。 -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。 +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (来自 Google Research) 伴随论文 [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) 由 Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig 发布。 +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (来自 BAAI) 伴随论文 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) 由 Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell 发布。 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (来自 MIT) 伴随论文 [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) 由 Yuan Gong, Yu-An Chung, James Glass 发布。 +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. 
**[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (来自 VinAI Research) 伴随论文 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) 由 Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen 发布。 @@ -248,24 +251,31 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (来自 Microsoft Research AI4Science) 伴随论文 [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) 由 Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu 发布。 +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (来自 Google AI) 伴随论文 [Big Transfer (BiT) 由 Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby 发布。 1. 
**[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 -1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (来自 Salesforce) 伴随论文 [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) 由 Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi 发布。 +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (来自 Salesforce) 伴随论文 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 由 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi 发布。 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。 +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (来自 NAVER CLOVA) 伴随论文 [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) 由 Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park 发布。 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 1. 
**[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。 +1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (来自 MetaAI) 伴随论文 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) 由 Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 +1. 
**[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 @@ -274,36 +284,53 @@ conda install -c huggingface transformers 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (来自 Berkeley/Facebook/Google) 伴随论文 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 由 Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 发布。 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (来自 SenseTime Research) 伴随论文 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 由 Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 发布。 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。 +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (来自 Google AI) 伴随论文 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) 由 Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun 发布。 +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (来自 The University of Texas at Austin) 伴随论文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137) 由 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl 发布。 1. 
**[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (来自 SHI Labs) 伴随论文 [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) 由 Ali Hassani and Humphrey Shi 发布。 +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (来自 Meta AI) 伴随论文 [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) 由 Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski 发布。 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 +1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (来自 Snap Research) 伴随论文 [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) 由 Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren 发布。 +1. 
**[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 +1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (来自 Meta AI) 伴随论文 [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) 由 Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi 发布。 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。 +1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (来自 Baidu) 伴随论文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) 由 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang 发布。 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (来自 Microsoft Research) 伴随论文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 由 Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao 发布。 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。 -1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 1. 
**[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布,作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy。 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (来自 ABEJA) 由 Shinya Otani, Takayoshi Makabe, Anuj Arora, Kyo Hattori 发布。 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。 -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。 -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。 +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。 +1. 
**[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (来自 Salesforce) 伴随论文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) 由 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi 发布。 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。 @@ -312,6 +339,8 @@ conda install -c huggingface transformers 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。 +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (来自 The FAIR team of Meta AI) 伴随论文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 由 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 发布。 +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (来自 The FAIR team of Meta AI) 伴随论文 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) 由 Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom 发布。 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。 @@ -320,32 +349,50 @@ conda install -c huggingface transformers 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。 -1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov >>>>>>> Fix rebase +1. 
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。 +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (来自 Google AI) 伴随论文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) 由 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos 发布。 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。 +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (来自 Facebook) 伴随论文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 由 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer 发布。 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。 +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (来自 Alibaba Research) 伴随论文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) 由 Peng Wang, Cheng Da, and Cong Yao 发布。 +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (来自 Studio Ousia) 伴随论文 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 由 Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 发布。 +1. 
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (来自 Facebook) 伴随论文 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) 由 Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli 发布。 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (来自 CMU/Google Brain) 伴随论文 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 由 Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 发布。 1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (来自 Google Inc.) 伴随论文 [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 由 Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 发布。 1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (来自 Google Inc.) 伴随论文 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) 由 Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 发布。 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (来自 Apple) 伴随论文 [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) 由 Sachin Mehta and Mohammad Rastegari 发布。 +1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (来自 Apple) 伴随论文 [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) 由 Sachin Mehta and Mohammad Rastegari 发布。 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 +1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (来自 MosaiML) 伴随论文 [llm-foundry](https://github.com/mosaicml/llm-foundry/) 由 the MosaicML NLP Team 发布。 +1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (来自 the University of Wisconsin - Madison) 伴随论文 [Multi Resolution Analysis (MRA)](https://arxiv.org/abs/2207.10284) 由 Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh 发布。 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 +1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 1. 
**[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (来自 SHI Labs) 伴随论文 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 由 Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 发布。 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (来自 Meta AI) 伴随论文 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) 由 Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic 发布。 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs) 伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。 +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 +1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (来自 ADEPT) 伴随论文 [blog post](https://www.adept.ai/blog/persimmon-8b) 由 Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani 发布。 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 +1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (来自 Google) 伴随论文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 由 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 发布。 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。 +1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 +1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (来自 Nanjing University, The University of Hong Kong etc.) 伴随论文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 由 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (来自 Facebook) 伴随论文 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 由 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 发布。 1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 @@ -354,57 +401,70 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 +1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 +1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (来自 Microsoft Research) 伴随论文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 由 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei 发布。 1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。 +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (来自 MBZUAI) 伴随论文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 由 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan 发布。 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。 -1. 
**[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。 +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. 
**[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (来自 Google Research) 伴随论文 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) 由 Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant 发布。 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。 +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (来自 Peking University) 伴随论文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 由 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun 发布。 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (来自 Multimedia Computing Group, Nanjing University) 伴随论文 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 由 Zhan Tong, Yibing Song, Jue Wang, Limin Wang 发布。 1. 
**[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (来自 Meta AI) 伴随论文 [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) 由 Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He 发布。 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。 +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (来自 HUST-VL) 伴随论文 [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) 由 Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang 发布。 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (来自 Meta AI) 伴随论文 [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas 发布. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (来自 Kakao Enterprise) 伴随论文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) 由 Jaehyeon Kim, Jungil Kong, Juhee Son 发布。 +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (来自 Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) 由 Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. 
**[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (来自 OpenAI) 伴随论文 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 由 Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 发布。 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (来自 Microsoft Research) 伴随论文 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 由 Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 发布。 +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (来自 Meta AI) 伴随论文 [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) 由 Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe 发布。 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. 
**[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (来自 Facebook AI) 伴随论文 [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) 由 Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau 发布。 +1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (来自 Meta AI) 伴随论文 [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) 由 Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa 发布。 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (来自 Huazhong University of Science & Technology) 伴随论文 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 由 Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 发布。 -1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。 +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。 1. 
想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。

要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。

@@ -416,7 +476,7 @@ conda install -c huggingface transformers

| 章节 | 描述 |
|-|-|
-| [文档](https://huggingface.co/transformers/) | 完整的 API 文档和教程 |
+| [文档](https://huggingface.co/docs/transformers/) | 完整的 API 文档和教程 |
| [任务总结](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支持的任务 |
| [预处理教程](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 来为模型准备数据 |
| [训练和微调](https://huggingface.co/docs/transformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |

diff --git a/README_zh-hant.md b/README_zh-hant.md
index 5d0f1b9057a3..e16a47713c5e 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -55,7 +55,7 @@ user: 使用者
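The 预处理教程 and 训练和微调 rows in the table above point to the `Tokenizer` and `Trainer` workflow. A minimal sketch of that workflow, assuming the standard `transformers` Python API (`pipeline`, `AutoTokenizer`, `AutoModelForSequenceClassification`); the checkpoint name is illustrative and not part of this diff:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# High-level path: a pipeline wraps tokenizer + model behind a single call.
# The checkpoint name is only an example; any compatible checkpoint can be used.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier("We are very happy to introduce pipeline to the transformers repository."))

# Lower-level path, as in the preprocessing docs linked above:
# the tokenizer turns raw text into model-ready tensors, and the model consumes them.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits)
```

The same `inputs` tensors are what a `Trainer` or a plain PyTorch/TensorFlow training loop would consume, which is what the 训练和微调 guide covers.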

@@ -84,7 +84,7 @@ user: 使用者
Español | 日本語 | हिन्दी
@@ -212,7 +212,7 @@ Tokenizer 為所有的預訓練模型提供了預處理,並可以直接轉換 ### 使用 pip -這個 Repository 已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下經過測試。 +這個 Repository 已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.10+ 和 TensorFlow 2.6+ 下經過測試。 你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境,請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 @@ -249,8 +249,11 @@ conda install -c huggingface transformers 🤗 Transformers 目前支援以下的架構(模型概覽請參閱[這裡](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. +1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. @@ -260,24 +263,31 @@ conda install -c huggingface transformers 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. -1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. 
**[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. 
**[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. 1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/). 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. @@ -286,36 +296,53 @@ conda install -c huggingface transformers 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. +1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. 1. 
**[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. 
**[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
+1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
+1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
+1.
**[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. 
**[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. 1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach 1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. 
**[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. +1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). +1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. 
Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. +1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. @@ -324,6 +351,8 @@ conda install -c huggingface transformers 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. +1. 
**[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
@@ -332,32 +361,50 @@ conda install -c huggingface transformers
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
+1.
**[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 1. 
**[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
+1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
+1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
+1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1.
**[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. 1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. 
**[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
+1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
+1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.)
released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -366,57 +413,70 @@ conda install -c huggingface transformers 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. 
**[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1.
**[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. 
**[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). -1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. 1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal. 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. 1. 
**[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. 
**[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. +1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. 
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. 1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. 
**[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. 1. 想要貢獻新的模型?我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。 要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作,或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer,敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。 diff --git a/awesome-transformers.md b/awesome-transformers.md new file mode 100644 index 000000000000..013f88259c91 --- /dev/null +++ b/awesome-transformers.md @@ -0,0 +1,609 @@ +# Awesome projects built with Transformers + +This page lists awesome projects built on top of Transformers. Transformers is more than a toolkit to use pretrained +models: it's a community of projects built around it and the Hugging Face Hub. We want Transformers to enable +developers, researchers, students, professors, engineers, and anyone else to build their dream projects. + +In this list, we showcase incredibly impactful and novel projects that have pushed the field forward. We celebrate +100 of these projects as we reach the milestone of 100k stars as a community; but we're very open to pull requests +adding other projects to the list. 
If you believe a project should be here and it's not, then please open a PR +to add it. + +## [gpt4all](https://github.com/nomic-ai/gpt4all) + +[gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant style. + +Keywords: Open-source, LLaMa, GPT-J, instruction, assistant + +## [recommenders](https://github.com/microsoft/recommenders) + +This repository contains examples and best practices for building recommendation systems, provided as Jupyter notebooks. It goes over several aspects required to build efficient recommendation systems: data preparation, modeling, evaluation, model selection & optimization, as well as operationalization. + +Keywords: Recommender systems, AzureML + +## [lama-cleaner](https://github.com/Sanster/lama-cleaner) + +Image inpainting tool powered by Stable Diffusion. Remove any unwanted object, defect, or person from your pictures, or erase and replace anything in your pictures. + +Keywords: inpainting, SD, Stable Diffusion + +## [flair](https://github.com/flairNLP/flair) + +FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment analysis, part-of-speech tagging, text and document embeddings, among other things. + +Keywords: NLP, text embedding, document embedding, biomedical, NER, PoS, sentiment-analysis + +## [mindsdb](https://github.com/mindsdb/mindsdb) + +MindsDB is a low-code ML platform, which automates and integrates several ML frameworks into the data stack as "AI Tables" to streamline the integration of AI into applications, making it accessible to developers of all skill levels. + +Keywords: Database, low-code, AI table + +## [langchain](https://github.com/hwchase17/langchain) + +[langchain](https://github.com/hwchase17/langchain) is aimed at assisting in the development of apps merging both LLMs and other sources of knowledge. The library allows chaining calls to applications, creating a sequence across many tools. + +Keywords: LLMs, Large Language Models, Agents, Chains + +## [LlamaIndex](https://github.com/jerryjliu/llama_index) + +[LlamaIndex](https://github.com/jerryjliu/llama_index) is a project that provides a central interface to connect your LLMs with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results. + +Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation + +## [ParlAI](https://github.com/facebookresearch/ParlAI) + +[ParlAI](https://github.com/facebookresearch/ParlAI) is a Python framework for sharing, training and testing dialogue models, from open-domain chitchat, to task-oriented dialogue, to visual question answering. It provides more than 100 datasets under the same API, a large zoo of pretrained models, a set of agents, and has several integrations. + +Keywords: Dialogue, Chatbots, VQA, Datasets, Agents + +## [sentence-transformers](https://github.com/UKPLab/sentence-transformers) + +This framework provides an easy method to compute dense vector representations for sentences, paragraphs, and images. The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and achieve state-of-the-art performance in various tasks. Text is embedded in a vector space such that similar text is close and can efficiently be found using cosine similarity.
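A minimal sketch of this embed-and-compare workflow (the checkpoint name and the example sentences are illustrative, and the snippet assumes the sentence-transformers package is installed):

```python
from sentence_transformers import SentenceTransformer, util

# Illustrative checkpoint; any sentence-transformers model works the same way.
model = SentenceTransformer("all-MiniLM-L6-v2")

sentences = [
    "A man is eating food.",
    "Someone is having a meal.",
    "The sky is clear and blue today.",
]

# Encode the sentences into dense vectors.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Cosine similarity: the first two sentences should score higher than the third.
scores = util.cos_sim(embeddings[0], embeddings[1:])
print(scores)
```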
+ +Keywords: Dense vector representations, Text embeddings, Sentence embeddings + +## [ludwig](https://github.com/ludwig-ai/ludwig) + +Ludwig is a declarative machine learning framework that makes it easy to define machine learning pipelines using a simple and flexible data-driven configuration system. Ludwig is targeted at a wide variety of AI tasks. It provides a data-driven configuration system, training, prediction, and evaluation scripts, as well as a programmatic API. + +Keywords: Declarative, Data-driven, ML Framework + +## [InvokeAI](https://github.com/invoke-ai/InvokeAI) + +[InvokeAI](https://github.com/invoke-ai/InvokeAI) is an engine for Stable Diffusion models, aimed at professionals, artists, and enthusiasts. It leverages the latest AI-driven technologies through CLI as well as a WebUI. + +Keywords: Stable-Diffusion, WebUI, CLI + +## [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) + +[PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) is an easy-to-use and powerful NLP library particularly targeted at the Chinese languages. It has support for multiple pre-trained model zoos, and supports a wide-range of NLP tasks from research to industrial applications. + +Keywords: NLP, Chinese, Research, Industry + +## [stanza](https://github.com/stanfordnlp/stanza) + +The Stanford NLP Group's official Python NLP library. It contains support for running various accurate natural language processing tools on 60+ languages and for accessing the Java Stanford CoreNLP software from Python. + +Keywords: NLP, Multilingual, CoreNLP + +## [DeepPavlov](https://github.com/deeppavlov/DeepPavlov) + +[DeepPavlov](https://github.com/deeppavlov/DeepPavlov) is an open-source conversational AI library. It is designed for the development of production ready chat-bots and complex conversational systems, as well as research in the area of NLP and, particularly, of dialog systems. + +Keywords: Conversational, Chatbot, Dialog + +## [alpaca-lora](https://github.com/tloen/alpaca-lora) + +Alpaca-lora contains code for reproducing the Stanford Alpaca results using low-rank adaptation (LoRA). The repository provides training (fine-tuning) as well as generation scripts. + +Keywords: LoRA, Parameter-efficient fine-tuning + +## [imagen-pytorch](https://github.com/lucidrains/imagen-pytorch) + +An open-source Implementation of Imagen, Google's closed-source Text-to-Image Neural Network that beats DALL-E2. As of release, it is the new SOTA for text-to-image synthesis. + +Keywords: Imagen, Text-to-image + +## [adapter-transformers](https://github.com/adapter-hub/adapter-transformers) + +[adapter-transformers](https://github.com/adapter-hub/adapter-transformers) is an extension of HuggingFace's Transformers library, integrating adapters into state-of-the-art language models by incorporating AdapterHub, a central repository for pre-trained adapter modules. It is a drop-in replacement for transformers, which is regularly updated to stay up-to-date with the developments of transformers. + +Keywords: Adapters, LoRA, Parameter-efficient fine-tuning, Hub + +## [NeMo](https://github.com/NVIDIA/NeMo) + +NVIDIA [NeMo](https://github.com/NVIDIA/NeMo) is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR), text-to-speech synthesis (TTS), large language models (LLMs), and natural language processing (NLP). 
The primary objective of [NeMo](https://github.com/NVIDIA/NeMo) is to help researchers from industry and academia to reuse prior work (code and pretrained models) and make it easier to create new [conversational AI models](https://developer.nvidia.com/conversational-ai#started). + +Keywords: Conversational, ASR, TTS, LLMs, NLP + +## [Runhouse](https://github.com/run-house/runhouse) + +[Runhouse](https://github.com/run-house/runhouse) allows you to send code and data to any of your compute or data infra, all in Python, and continue to interact with them normally from your existing code and environment. Runhouse developers mention: + +> Think of it as an expansion pack to your Python interpreter that lets it take detours to remote machines or manipulate remote data. + +Keywords: MLOps, Infrastructure, Data storage, Modeling + +## [MONAI](https://github.com/Project-MONAI/MONAI) + +[MONAI](https://github.com/Project-MONAI/MONAI) is a PyTorch-based, open-source framework for deep learning in healthcare imaging, part of the PyTorch Ecosystem. Its ambitions are: +- developing a community of academic, industrial and clinical researchers collaborating on a common foundation; +- creating state-of-the-art, end-to-end training workflows for healthcare imaging; +- providing researchers with an optimized and standardized way to create and evaluate deep learning models. + +Keywords: Healthcare imaging, Training, Evaluation + +## [simpletransformers](https://github.com/ThilinaRajapakse/simpletransformers) + +Simple Transformers lets you quickly train and evaluate Transformer models. Only 3 lines of code are needed to initialize, train, and evaluate a model. It supports a wide variety of NLP tasks. + +Keywords: Framework, simplicity, NLP + +## [JARVIS](https://github.com/microsoft/JARVIS) + +[JARVIS](https://github.com/microsoft/JARVIS) is a system attempting to merge LLMs such as GPT-4 with the rest of the open-source ML community: leveraging up to 60 downstream models in order to perform tasks identified by the LLM. + +Keywords: LLM, Agents, HF Hub + +## [transformers.js](https://xenova.github.io/transformers.js/) + +[transformers.js](https://xenova.github.io/transformers.js/) is a JavaScript library targeted at running models from transformers directly within the browser. + +Keywords: Transformers, JavaScript, browser + +## [bumblebee](https://github.com/elixir-nx/bumblebee) + +Bumblebee provides pre-trained Neural Network models on top of Axon, a neural networks library for the Elixir language. It includes integration with 🤗 Models, allowing anyone to download and perform Machine Learning tasks with a few lines of code. + +Keywords: Elixir, Axon + +## [argilla](https://github.com/argilla-io/argilla) + +Argilla is an open-source platform providing advanced NLP labeling, monitoring, and workspaces. It is compatible with many open source ecosystems such as Hugging Face, Stanza, FLAIR, and others. + +Keywords: NLP, Labeling, Monitoring, Workspaces + +## [haystack](https://github.com/deepset-ai/haystack) + +Haystack is an open source NLP framework to interact with your data using Transformer models and LLMs. It offers production-ready tools to quickly build complex decision making, question answering, semantic search, text generation applications, and more. + +Keywords: NLP, Framework, LLM + +## [spaCy](https://github.com/explosion/spaCy) + +[spaCy](https://github.com/explosion/spaCy) is a library for advanced Natural Language Processing in Python and Cython.
It's built on the very latest research, and was designed from day one to be used in real products. It offers support for transformers models through its third party package, spacy-transformers. + +Keywords: NLP, Framework + +## [speechbrain](https://github.com/speechbrain/speechbrain) + +SpeechBrain is an open-source and all-in-one conversational AI toolkit based on PyTorch. +The goal is to create a single, flexible, and user-friendly toolkit that can be used to easily develop state-of-the-art speech technologies, including systems for speech recognition, speaker recognition, speech enhancement, speech separation, language identification, multi-microphone signal processing, and many others. + +Keywords: Conversational, Speech + +## [skorch](https://github.com/skorch-dev/skorch) + +Skorch is a scikit-learn compatible neural network library that wraps PyTorch. It has support for models within transformers, and tokenizers from tokenizers. + +Keywords: Scikit-Learn, PyTorch + +## [bertviz](https://github.com/jessevig/bertviz) + +BertViz is an interactive tool for visualizing attention in Transformer language models such as BERT, GPT2, or T5. It can be run inside a Jupyter or Colab notebook through a simple Python API that supports most Huggingface models. + +Keywords: Visualization, Transformers + +## [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) + +[mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) is a haiku library using the xmap/pjit operators in JAX for model parallelism of transformers. This library is designed for scalability up to approximately 40B parameters on TPUv3s. It was the library used to train the GPT-J model. + +Keywords: Haiku, Model parallelism, LLM, TPU + +## [deepchem](https://github.com/deepchem/deepchem) + +DeepChem aims to provide a high quality open-source toolchain that democratizes the use of deep-learning in drug discovery, materials science, quantum chemistry, and biology. + +Keywords: Drug discovery, Materials Science, Quantum Chemistry, Biology + +## [OpenNRE](https://github.com/thunlp/OpenNRE) + +An Open-Source Package for Neural Relation Extraction (NRE). It is targeted at a wide range of users, from newcomers to relation extraction, to developers, researchers, or students. + +Keywords: Neural Relation Extraction, Framework + +## [pycorrector](https://github.com/shibing624/pycorrector) + +PyCorrector is a Chinese Text Error Correction Tool. It uses a language model to detect errors, pinyin feature and shape feature to correct Chinese text errors. it can be used for Chinese Pinyin and stroke input method. + +Keywords: Chinese, Error correction tool, Language model, Pinyin + +## [nlpaug](https://github.com/makcedward/nlpaug) + +This python library helps you with augmenting nlp for machine learning projects. It is a lightweight library featuring synthetic data generation for improving model performance, support for audio and text, and compatibility with several ecosystems (scikit-learn, pytorch, tensorflow). + +Keywords: Data augmentation, Synthetic data generation, Audio, NLP + +## [dream-textures](https://github.com/carson-katri/dream-textures) + +[dream-textures](https://github.com/carson-katri/dream-textures) is a library targeted at bringing stable-diffusion support within Blender. It supports several use-cases, such as image generation, texture projection, inpainting/outpainting, ControlNet, and upscaling. 
+ +Keywords: Stable-Diffusion, Blender + +## [seldon-core](https://github.com/SeldonIO/seldon-core) + +Seldon core converts your ML models (Tensorflow, Pytorch, H2o, etc.) or language wrappers (Python, Java, etc.) into production REST/GRPC microservices. +Seldon handles scaling to thousands of production machine learning models and provides advanced machine learning capabilities out of the box including Advanced Metrics, Request Logging, Explainers, Outlier Detectors, A/B Tests, Canaries and more. + +Keywords: Microservices, Modeling, Language wrappers + +## [open_model_zoo](https://github.com/openvinotoolkit/open_model_zoo) + +This repository includes optimized deep learning models and a set of demos to expedite development of high-performance deep learning inference applications. Use these free pre-trained models instead of training your own models to speed-up the development and production deployment process. + +Keywords: Optimized models, Demos + +## [ml-stable-diffusion](https://github.com/apple/ml-stable-diffusion) + +ML-Stable-Diffusion is a repository by Apple bringing Stable Diffusion support to Core ML, on Apple Silicon devices. It supports stable diffusion checkpoints hosted on the Hugging Face Hub. + +Keywords: Stable Diffusion, Apple Silicon, Core ML + +## [stable-dreamfusion](https://github.com/ashawkey/stable-dreamfusion) + +Stable-Dreamfusion is a pytorch implementation of the text-to-3D model Dreamfusion, powered by the Stable Diffusion text-to-2D model. + +Keywords: Text-to-3D, Stable Diffusion + +## [txtai](https://github.com/neuml/txtai) + +[txtai](https://github.com/neuml/txtai) is an open-source platform for semantic search and workflows powered by language models. txtai builds embeddings databases, which are a union of vector indexes and relational databases enabling similarity search with SQL. Semantic workflows connect language models together into unified applications. + +Keywords: Semantic search, LLM + +## [djl](https://github.com/deepjavalibrary/djl) + +Deep Java Library (DJL) is an open-source, high-level, engine-agnostic Java framework for deep learning. DJL is designed to be easy to get started with and simple to use for developers. DJL provides a native Java development experience and functions like any other regular Java library. DJL offers [a Java binding](https://github.com/deepjavalibrary/djl/tree/master/extensions/tokenizers) for HuggingFace Tokenizers and easy conversion toolkit for HuggingFace model to deploy in Java. + +Keywords: Java, Framework + +## [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/) + +This project provides a unified framework to test generative language models on a large number of different evaluation tasks. It has support for more than 200 tasks, and supports different ecosystems: HF Transformers, GPT-NeoX, DeepSpeed, as well as the OpenAI API. + +Keywords: LLM, Evaluation, Few-shot + +## [gpt-neox](https://github.com/EleutherAI/gpt-neox) + +This repository records EleutherAI's library for training large-scale language models on GPUs. The framework is based on NVIDIA's Megatron Language Model and has been augmented with techniques from DeepSpeed as well as some novel optimizations. It is focused on training multi-billion-parameter models. + +Keywords: Training, LLM, Megatron, DeepSpeed + +## [muzic](https://github.com/microsoft/muzic) + +Muzic is a research project on AI music that empowers music understanding and generation with deep learning and artificial intelligence. 
Muzic was created by researchers from Microsoft Research Asia. + +Keywords: Music understanding, Music generation + +## [dalle-flow](https://github.com/jina-ai/dalle-flow) + +DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt. +The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR. + +Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR + +## [lightseq](https://github.com/bytedance/lightseq) + +LightSeq is a high-performance training and inference library for sequence processing and generation, implemented in CUDA. It enables highly efficient computation of modern NLP and CV models such as BERT, GPT, Transformer, etc. It is therefore especially useful for machine translation, text generation, image classification, and other sequence-related tasks. + +Keywords: Training, Inference, Sequence Processing, Sequence Generation + +## [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR) + +The goal of this project is to create a learning-based system that takes an image of a math formula and returns the corresponding LaTeX code. + +Keywords: OCR, LaTeX, Math formula + +## [open_clip](https://github.com/mlfoundations/open_clip) + +OpenCLIP is an open source implementation of OpenAI's CLIP. + +The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift. +The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset. + +Specifically, a ResNet-50 model trained with this codebase on OpenAI's 15 million image subset of YFCC achieves 32.7% top-1 accuracy on ImageNet. + +Keywords: CLIP, Open-source, Contrastive, Image-text + +## [dalle-playground](https://github.com/saharmor/dalle-playground) + +A playground to generate images from any text prompt using Stable Diffusion and Dall-E mini. + +Keywords: WebUI, Stable Diffusion, Dall-E mini + +## [FedML](https://github.com/FedML-AI/FedML) + +[FedML](https://github.com/FedML-AI/FedML) is a federated learning and analytics library enabling secure and collaborative machine learning on decentralized data anywhere at any scale. + +It supports large-scale cross-silo federated learning, cross-device federated learning on smartphones/IoT devices, and research simulation. + +Keywords: Federated Learning, Analytics, Collaborative ML, Decentralized + +## [gpt-code-clippy](https://github.com/CodedotAl/gpt-code-clippy) + +GPT-Code-Clippy (GPT-CC) is an open source version of GitHub Copilot, a language model -- based on GPT-3, called GPT-Codex -- that is fine-tuned on publicly available code from GitHub. + +Keywords: LLM, Code + +## [TextAttack](https://github.com/QData/TextAttack) + +[TextAttack](https://github.com/QData/TextAttack) 🐙 is a Python framework for adversarial attacks, data augmentation, and model training in NLP. + +Keywords: Adversarial attacks, Data augmentation, NLP + +## [OpenPrompt](https://github.com/thunlp/OpenPrompt) + +Prompt-learning is a paradigm to adapt pre-trained language models (PLMs) to downstream NLP tasks, which modifies the input text with a textual template and directly uses PLMs to conduct pre-training tasks.
This library provides a standard, flexible and extensible framework to deploy the prompt-learning pipeline. [OpenPrompt](https://github.com/thunlp/OpenPrompt) supports loading PLMs directly from https://github.com/huggingface/transformers. + +## [text-generation-webui](https://github.com/oobabooga/text-generation-webui/) + +[text-generation-webui](https://github.com/oobabooga/text-generation-webui/) is a Gradio Web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, Pythia, OPT, and GALACTICA. + +Keywords: LLM, WebUI + +## [libra](https://github.com/Palashio/libra) + +An ergonomic machine learning [libra](https://github.com/Palashio/libra)ry for non-technical users. It focuses on ergonomics and on ensuring that training a model is as simple as it can be. + +Keywords: Ergonomic, Non-technical + +## [alibi](https://github.com/SeldonIO/alibi) + +Alibi is an open source Python library aimed at machine learning model inspection and interpretation. The focus of the library is to provide high-quality implementations of black-box, white-box, local and global explanation methods for classification and regression models. + +Keywords: Model inspection, Model interpretation, Black-box, White-box + +## [tortoise-tts](https://github.com/neonbjb/tortoise-tts) + +Tortoise is a text-to-speech program built with the following priorities: strong multi-voice capabilities, and highly realistic prosody and intonation. + +Keywords: Text-to-speech + +## [flower](https://github.com/adap/flower) + +Flower (flwr) is a framework for building federated learning systems. The design of Flower is based on a few guiding principles: customizability, extendability, framework agnosticity, and ease-of-use. + +Keywords: Federated learning systems, Customizable, Extendable, Framework-agnostic, Simplicity + +## [fast-bert](https://github.com/utterworks/fast-bert) + +Fast-Bert is a deep learning library that allows developers and data scientists to train and deploy BERT and XLNet based models for natural language processing tasks beginning with Text Classification. It is aimed at simplicity. + +Keywords: Deployment, BERT, XLNet + +## [towhee](https://github.com/towhee-io/towhee) + +Towhee makes it easy to build neural data processing pipelines for AI applications. We provide hundreds of models, algorithms, and transformations that can be used as standard pipeline building blocks. Users can use Towhee's Pythonic API to build a prototype of their pipeline and automatically optimize it for production-ready environments. + +Keywords: Data processing pipeline, Optimization + +## [alibi-detect](https://github.com/SeldonIO/alibi-detect) + +Alibi Detect is an open source Python library focused on outlier, adversarial and drift detection. The package aims to cover both online and offline detectors for tabular data, text, images and time series. Both TensorFlow and PyTorch backends are supported for drift detection. + +Keywords: Adversarial, Outlier, Drift detection + +## [FARM](https://github.com/deepset-ai/FARM) + +[FARM](https://github.com/deepset-ai/FARM) makes Transfer Learning with BERT & Co simple, fast and enterprise-ready. It's built upon transformers and provides additional features to simplify the life of developers: Parallelized preprocessing, highly modular design, multi-task learning, experiment tracking, easy debugging and close integration with AWS SageMaker. 
+ +Keywords: Transfer Learning, Modular design, Multi-task learning, Experiment tracking + +## [aitextgen](https://github.com/minimaxir/aitextgen) + +A robust Python tool for text-based AI training and generation using OpenAI's GPT-2 and EleutherAI's GPT Neo/GPT-3 architecture. +[aitextgen](https://github.com/minimaxir/aitextgen) is a Python package that leverages PyTorch, Hugging Face Transformers and pytorch-lightning with specific optimizations for text generation using GPT-2, plus many added features. + +Keywords: Training, Generation + +## [diffgram](https://github.com/diffgram/diffgram) + +Diffgram aims to integrate human supervision into platforms. We support your team programmatically changing the UI (Schema, layout, etc.) like in Streamlit. This means that you can collect and annotate timely data from users. In other words, we are the platform behind your platform, an integrated part of your application, to ship new & better AI products faster. + +Keywords: Human supervision, Platform + +## [ecco](https://github.com/jalammar/ecco) + +Explain, analyze, and visualize NLP language models. Ecco creates interactive visualizations directly in Jupyter notebooks explaining the behavior of Transformer-based language models (like GPT2, BERT, RoBERTA, T5, and T0). + +Keywords: Model explainability + +## [s3prl](https://github.com/s3prl/s3prl) + +[s3prl](https://github.com/s3prl/s3prl) stands for Self-Supervised Speech Pre-training and Representation Learning. Self-supervised speech pre-trained models are called upstream in this toolkit, and are utilized in various downstream tasks. + +Keywords: Speech, Training + +## [ru-dalle](https://github.com/ai-forever/ru-dalle) + +RuDALL-E aims to be similar to DALL-E, targeted to Russian. + +Keywords: DALL-E, Russian + +## [DeepKE](https://github.com/zjunlp/DeepKE) + +[DeepKE](https://github.com/zjunlp/DeepKE) is a knowledge extraction toolkit for knowledge graph construction supporting cnSchema,low-resource, document-level and multimodal scenarios for entity, relation and attribute extraction. + +Keywords: Knowledge Extraction, Knowledge Graphs + +## [Nebuly](https://github.com/nebuly-ai/nebuly) + +Nebuly is the next-generation platform to monitor and optimize your AI costs in one place. The platform connects to all your AI cost sources (compute, API providers, AI software licenses, etc) and centralizes them in one place to give you full visibility on a model basis. The platform also provides optimization recommendations and a co-pilot model that can guide during the optimization process. The platform builds on top of the open-source tools allowing you to optimize the different steps of your AI stack to squeeze out the best possible cost performances. + +Keywords: Optimization, Performance, Monitoring + +## [imaginAIry](https://github.com/brycedrennan/imaginAIry) + +Offers a CLI and a Python API to generate images with Stable Diffusion. It has support for many tools, like image structure control (controlnet), instruction-based image edits (InstructPix2Pix), prompt-based masking (clipseg), among others. + +Keywords: Stable Diffusion, CLI, Python API + +## [sparseml](https://github.com/neuralmagic/sparseml) + +SparseML is an open-source model optimization toolkit that enables you to create inference-optimized sparse models using pruning, quantization, and distillation algorithms. Models optimized with SparseML can then be exported to the ONNX and deployed with DeepSparse for GPU-class performance on CPU hardware. 
+ +Keywords: Model optimization, Pruning, Quantization, Distillation + +## [opacus](https://github.com/pytorch/opacus) + +Opacus is a library that enables training PyTorch models with differential privacy. It supports training with minimal code changes required on the client, has little impact on training performance, and allows the client to online track the privacy budget expended at any given moment. + +Keywords: Differential privacy + +## [LAVIS](https://github.com/salesforce/LAVIS) + +[LAVIS](https://github.com/salesforce/LAVIS) is a Python deep learning library for LAnguage-and-VISion intelligence research and applications. This library aims to provide engineers and researchers with a one-stop solution to rapidly develop models for their specific multimodal scenarios, and benchmark them across standard and customized datasets. It features a unified interface design to access + +Keywords: Multimodal, NLP, Vision + +## [buzz](https://github.com/chidiwilliams/buzz) + +Buzz transcribes and translates audio offline on your personal computer. Powered by OpenAI's Whisper. + +Keywords: Audio transcription, Translation + +## [rust-bert](https://github.com/guillaume-be/rust-bert) + +Rust-native state-of-the-art Natural Language Processing models and pipelines. Port of Hugging Face's Transformers library, using the tch-rs crate and pre-processing from rust-tokenizers. Supports multi-threaded tokenization and GPU inference. This repository exposes the model base architecture, task-specific heads and ready-to-use pipelines. + +Keywords: Rust, BERT, Inference + +## [EasyNLP](https://github.com/alibaba/EasyNLP) + +[EasyNLP](https://github.com/alibaba/EasyNLP) is an easy-to-use NLP development and application toolkit in PyTorch, first released inside Alibaba in 2021. It is built with scalable distributed training strategies and supports a comprehensive suite of NLP algorithms for various NLP applications. [EasyNLP](https://github.com/alibaba/EasyNLP) integrates knowledge distillation and few-shot learning for landing large pre-trained models, together with various popular multi-modality pre-trained models. It provides a unified framework of model training, inference, and deployment for real-world applications. + +Keywords: NLP, Knowledge distillation, Few-shot learning, Multi-modality, Training, Inference, Deployment + +## [TurboTransformers](https://github.com/Tencent/TurboTransformers) + +A fast and user-friendly runtime for transformer inference (Bert, Albert, GPT2, Decoders, etc) on CPU and GPU. + +Keywords: Optimization, Performance + +## [hivemind](https://github.com/learning-at-home/hivemind) + +Hivemind is a PyTorch library for decentralized deep learning across the Internet. Its intended usage is training one large model on hundreds of computers from different universities, companies, and volunteers. + +Keywords: Decentralized training + +## [docquery](https://github.com/impira/docquery) + +DocQuery is a library and command-line tool that makes it easy to analyze semi-structured and unstructured documents (PDFs, scanned images, etc.) using large language models (LLMs). You simply point DocQuery at one or more documents and specify a question you want to ask. DocQuery is created by the team at Impira. 
+ +Keywords: Semi-structured documents, Unstructured documents, LLM, Document Question Answering + +## [CodeGeeX](https://github.com/THUDM/CodeGeeX) + +[CodeGeeX](https://github.com/THUDM/CodeGeeX) is a large-scale multilingual code generation model with 13 billion parameters, pre-trained on a large code corpus of more than 20 programming languages. It has several unique features: +- Multilingual code generation +- Crosslingual code translation +- A customizable programming assistant + +Keywords: Code Generation Model + +## [ktrain](https://github.com/amaiya/ktrain) + +[ktrain](https://github.com/amaiya/ktrain) is a lightweight wrapper for the deep learning library TensorFlow Keras (and other libraries) to help build, train, and deploy neural networks and other machine learning models. Inspired by ML framework extensions like fastai and ludwig, [ktrain](https://github.com/amaiya/ktrain) is designed to make deep learning and AI more accessible and easier to apply for both newcomers and experienced practitioners. + +Keywords: Keras wrapper, Model building, Training, Deployment + +## [FastDeploy](https://github.com/PaddlePaddle/FastDeploy) + +[FastDeploy](https://github.com/PaddlePaddle/FastDeploy) is an easy-to-use and high-performance AI model deployment toolkit for Cloud, Mobile and Edge, with an out-of-the-box, unified experience and end-to-end optimization for over 160+ Text, Vision, Speech and Cross-modal AI models, including image classification, object detection, OCR, face detection, matting, pp-tracking, NLP, stable diffusion, TTS and other tasks, to meet developers' industrial deployment needs for multi-scenario, multi-hardware and multi-platform. + +Keywords: Model deployment, Cloud, Mobile, Edge + +## [underthesea](https://github.com/undertheseanlp/underthesea) + +[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules, data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. It provides an extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing. + +Keywords: Vietnamese, NLP + +## [hasktorch](https://github.com/hasktorch/hasktorch) + +Hasktorch is a library for tensors and neural networks in Haskell. It is an independent open source community project which leverages the core C++ libraries shared by PyTorch. + +Keywords: Haskell, Neural Networks + +## [donut](https://github.com/clovaai/donut) + +Donut, or Document understanding transformer, is a new method of document understanding that utilizes an OCR-free end-to-end Transformer model. + +Donut does not require off-the-shelf OCR engines/APIs, yet it shows state-of-the-art performance on various visual document understanding tasks, such as visual document classification or information extraction (a.k.a. document parsing). + +Keywords: Document Understanding + +## [transformers-interpret](https://github.com/cdpierse/transformers-interpret) + +Transformers Interpret is a model explainability tool designed to work exclusively with the transformers package. + +In line with the philosophy of the Transformers package, Transformers Interpret allows any transformers model to be explained in just two lines. Explainers are available for both text and computer vision models.
Visualizations are also available in notebooks and as savable PNG and HTML files. + +Keywords: Model interpretation, Visualization + +## [mlrun](https://github.com/mlrun/mlrun) + +MLRun is an open MLOps platform for quickly building and managing continuous ML applications across their lifecycle. MLRun integrates into your development and CI/CD environment and automates the delivery of production data, ML pipelines, and online applications, significantly reducing engineering efforts, time to production, and computation resources. With MLRun, you can choose any IDE on your local machine or on the cloud. MLRun breaks the silos between data, ML, software, and DevOps/MLOps teams, enabling collaboration and fast continuous improvements. + +Keywords: MLOps + +## [FederatedScope](https://github.com/alibaba/FederatedScope) + +[FederatedScope](https://github.com/alibaba/FederatedScope) is a comprehensive federated learning platform that provides convenient usage and flexible customization for various federated learning tasks in both academia and industry. Based on an event-driven architecture, [FederatedScope](https://github.com/alibaba/FederatedScope) integrates rich collections of functionalities to satisfy the burgeoning demands from federated learning, and aims to build up an easy-to-use platform for promoting learning safely and effectively. + +Keywords: Federated learning, Event-driven + +## [pythainlp](https://github.com/PyThaiNLP/pythainlp) + +PyThaiNLP is a Python package for text processing and linguistic analysis, similar to NLTK, with a focus on the Thai language. + +Keywords: Thai, NLP, NLTK + +## [FlagAI](https://github.com/FlagAI-Open/FlagAI) + +[FlagAI](https://github.com/FlagAI-Open/FlagAI) (Fast LArge-scale General AI models) is a fast, easy-to-use and extensible toolkit for large-scale models. Its goal is to support training, fine-tuning, and deployment of large-scale models on various downstream tasks with multi-modality. + +Keywords: Large models, Training, Fine-tuning, Deployment, Multi-modal + +## [pyserini](https://github.com/castorini/pyserini) + +[pyserini](https://github.com/castorini/pyserini) is a Python toolkit for reproducible information retrieval research with sparse and dense representations. Retrieval using sparse representations is provided via integration with the group's Anserini IR toolkit. Retrieval using dense representations is provided via integration with Facebook's Faiss library. + +Keywords: IR, Information Retrieval, Dense, Sparse + +## [baal](https://github.com/baal-org/baal) + +[baal](https://github.com/baal-org/baal) is an active learning library that supports both industrial applications and research use cases. [baal](https://github.com/baal-org/baal) currently supports Monte-Carlo Dropout, MCDropConnect, deep ensembles, and semi-supervised learning. + +Keywords: Active Learning, Research, Labeling + +## [cleanlab](https://github.com/cleanlab/cleanlab) + +[cleanlab](https://github.com/cleanlab/cleanlab) is the standard data-centric AI package for data quality and machine learning with messy, real-world data and labels. For text, image, tabular, audio (among others) datasets, you can use cleanlab to automatically: detect data issues (outliers, label errors, near duplicates, etc.), train robust ML models, infer consensus + annotator quality for multi-annotator data, and suggest data to (re)label next (active learning).
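A minimal sketch of the label-issue detection workflow described above, assuming cleanlab 2.x and scikit-learn; the synthetic data and the logistic-regression model are illustrative only:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from cleanlab.filter import find_label_issues

# Toy dataset with a handful of deliberately flipped labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
labels = (X[:, 0] > 0).astype(int)
labels[:10] = 1 - labels[:10]

# Out-of-sample predicted probabilities for every example.
pred_probs = cross_val_predict(
    LogisticRegression(), X, labels, cv=5, method="predict_proba"
)

# Indices of examples whose given label looks inconsistent with the model.
issue_indices = find_label_issues(
    labels, pred_probs, return_indices_ranked_by="self_confidence"
)
print(issue_indices[:10])
```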
+ +Keywords: Data-Centric AI, Data Quality, Noisy Labels, Outlier Detection, Active Learning + +## [BentoML](https://github.com/bentoml/BentoML) + +[BentoML](https://github.com/bentoml) is the unified framework for for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. +All Hugging Face models and pipelines can be seamlessly integrated into BentoML applications, enabling the running of models on the most suitable hardware and independent scaling based on usage. + +Keywords: BentoML, Framework, Deployment, AI Applications + +## [LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning) + +[LLaMA-Efficient-Tuning](https://github.com/hiyouga/LLaMA-Efficient-Tuning) offers a user-friendly fine-tuning framework that incorporates PEFT. The repository includes training(fine-tuning) and inference examples for LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, and other LLMs. A ChatGLM version is also available in [ChatGLM-Efficient-Tuning](https://github.com/hiyouga/ChatGLM-Efficient-Tuning). + +Keywords: PEFT, fine-tuning, LLaMA-2, ChatGLM, Qwen + diff --git a/conftest.py b/conftest.py index c3d4f70326d9..247e5eb92d53 100644 --- a/conftest.py +++ b/conftest.py @@ -20,6 +20,10 @@ import warnings from os.path import abspath, dirname, join +import _pytest + +from transformers.testing_utils import HfDoctestModule, HfDocTestParser + # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. @@ -38,7 +42,10 @@ def pytest_configure(config): config.addinivalue_line( "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" ) + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") + config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") + config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule") def pytest_addoption(parser): @@ -62,7 +69,7 @@ def pytest_sessionfinish(session, exitstatus): # Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') +IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") OutputChecker = doctest.OutputChecker @@ -75,3 +82,5 @@ def check_output(self, want, got, optionflags): doctest.OutputChecker = CustomOutputChecker +_pytest.doctest.DoctestModule = HfDoctestModule +doctest.DocTestParser = HfDocTestParser diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 1c79983d3b63..a6c672e1a9df 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,11 +9,11 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='1.13.0' +ARG PYTORCH='2.0.1' # (not always a valid torch version) ARG INTEL_TORCH_EXT='1.11.0' # Example: `cu102`, `cu113`, etc. 
-ARG CUDA='cu116' +ARG CUDA='cu118' RUN apt update RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs @@ -22,7 +22,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] # TODO: Handle these in a python utility script RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile @@ -32,26 +31,35 @@ RUN echo torch=$VERSION # TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI). RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA -RUN python3 -m pip install --no-cache-dir -U tensorflow==2.11 -RUN python3 -m pip install --no-cache-dir -U tensorflow_probability -RUN python3 -m pip uninstall -y flax jax +RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability -# To include the change in this commit https://github.com/onnx/tensorflow-onnx/commit/ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3 -# Otherwise, we get tf2onnx==1.8 (caused by `flatbuffers` version), and some tests fail with `ValueError: from_keras requires input_signature`. -# TODO: remove this line once the conflict is resolved in these libraries. -RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow-onnx.git@ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3 +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] + +RUN python3 -m pip uninstall -y flax jax -RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable +RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://developer.intel.com/ipex-whl-stable-cpu RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract RUN python3 -m pip install -U "itsdangerous<2.1.0" RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes -RUN python3 -m pip install --no-cache-dir decord +# Add auto-gptq for gtpq quantization testing +RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ + +# Add einops for additional model testing +RUN python3 -m pip install --no-cache-dir einops + +# For bettertransformer + gptq +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + +# For video model testing +RUN python3 -m pip install --no-cache-dir decord av==9.2.0 # For `dinat` model RUN python3 -m pip install --no-cache-dir natten -f https://shi-labs.com/natten/wheels/$CUDA/ diff --git a/docker/transformers-doc-builder/Dockerfile b/docker/transformers-doc-builder/Dockerfile index 0e5b072d4889..c9f6adb63e0c 100644 --- 
a/docker/transformers-doc-builder/Dockerfile +++ b/docker/transformers-doc-builder/Dockerfile @@ -11,7 +11,6 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed] RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract -RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com RUN python3 -m pip install -U "itsdangerous<2.1.0" # Test if the image could successfully build the doc. before publishing the image diff --git a/docker/transformers-past-gpu/Dockerfile b/docker/transformers-past-gpu/Dockerfile index 99fb550c6a35..0cdc9ff07124 100644 --- a/docker/transformers-past-gpu/Dockerfile +++ b/docker/transformers-past-gpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_DOCKER_IMAGE="nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04" +ARG BASE_DOCKER_IMAGE FROM $BASE_DOCKER_IMAGE LABEL maintainer="Hugging Face" @@ -8,7 +8,7 @@ ARG DEBIAN_FRONTEND=noninteractive SHELL ["sh", "-lc"] RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev RUN git lfs install RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -23,9 +23,11 @@ RUN cd transformers && python3 setup.py develop ARG FRAMEWORK ARG VERSION +# Control `setuptools` version to avoid some issues +RUN [ "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5" + # Remove all frameworks -# (`accelerate` requires `torch`, and this causes import issues for TF-only testing) -RUN python3 -m pip uninstall -y torch torchvision torchaudio accelerate tensorflow jax flax +RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax # Get the libraries and their versions to install, and write installation command to `~/.profile`. RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION @@ -34,4 +36,24 @@ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --ve RUN echo "INSTALL_CMD = $INSTALL_CMD" RUN $INSTALL_CMD +RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] + +# Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing +# We will install `accelerate@main` in Past CI workflow file +RUN python3 -m pip uninstall -y accelerate + +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + +# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) +RUN python3 -m pip uninstall -y deepspeed +# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) +# Issue: https://github.com/microsoft/DeepSpeed/issues/2010 +# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 + RUN python3 -m pip install -U "itsdangerous<2.1.0" + +# When installing in editable mode, `transformers` is not recognized as a package. 
+# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile new file mode 100644 index 000000000000..f19cd4edb0e4 --- /dev/null +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -0,0 +1,31 @@ +FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt update && \ + apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" + +# If set to nothing, will install the latest version +ARG PYTORCH='2.0.1' +ARG TORCH_VISION='0.15.2' +ARG TORCH_AUDIO='2.0.2' +ARG ROCM='5.6' + +RUN git clone --depth 1 --branch v$TORCH_AUDIO https://github.com/pytorch/audio.git +RUN cd audio && USE_ROCM=1 USE_CUDA=0 python setup.py install + +ARG REF=main +WORKDIR / +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] + +RUN python3 -m pip uninstall -y tensorflow flax + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index d19092c2dcd4..c2ce626b474e 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -1,12 +1,12 @@ -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-04.html#rel_22-04 -FROM nvcr.io/nvidia/pytorch:22.04-py3 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12 +FROM nvcr.io/nvidia/pytorch:22.12-py3 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='1.13.0' +ARG PYTORCH='2.0.1' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu116' +ARG CUDA='cu118' RUN apt -y update RUN apt install -y libaio-dev @@ -15,6 +15,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip uninstall -y torch torchvision torchaudio + # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) @@ -22,25 +24,31 @@ RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchau RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] -RUN python3 -m pip install torch-tensorrt==1.3.0 --find-links https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0 +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +# Uninstall `transformer-engine` shipped with the base image +RUN python3 -m pip uninstall -y transformer-engine + +# Uninstall `torch-tensorrt` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt # recompile apex RUN python3 -m pip uninstall -y apex RUN git clone https://github.com/NVIDIA/apex # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners -RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . +RUN cd apex && git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. -RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails -RUN python3 -m pip install -U --no-cache-dir pydantic +RUN python3 -m pip install -U --no-cache-dir "pydantic<2" RUN python3 -c "from deepspeed.launcher.runner import main" diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index 573e09c22a9c..b3ead0c61547 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -1,10 +1,11 @@ -FROM nvcr.io/nvidia/pytorch:21.03-py3 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12 +FROM nvcr.io/nvidia/pytorch:22.12-py3 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive # Example: `cu102`, `cu113`, etc. 
-ARG CUDA='cu113' +ARG CUDA='cu118' RUN apt -y update RUN apt install -y libaio-dev @@ -13,6 +14,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip uninstall -y torch torchvision torchaudio + # Install **nightly** release PyTorch (flag `--pre`) # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) @@ -20,30 +23,38 @@ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +# Uninstall `transformer-engine` shipped with the base image +RUN python3 -m pip uninstall -y transformer-engine + +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) # Issue: https://github.com/microsoft/DeepSpeed/issues/2010 # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ -# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 - -# For `torchdynamo` tests -# (see https://github.com/huggingface/transformers/pull/17765) -RUN git clone https://github.com/pytorch/functorch -RUN python3 -m pip install --no-cache-dir ./functorch[aot] -RUN cd functorch && python3 setup.py develop - -RUN git clone https://github.com/pytorch/torchdynamo -RUN python3 -m pip install -r ./torchdynamo/requirements.txt -RUN cd torchdynamo && python3 setup.py develop - -# install TensorRT -RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex -RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 -# install torch_tensorrt (fx path) -RUN git clone https://github.com/pytorch/TensorRT.git -RUN cd TensorRT/py && python3 setup.py install --fx-only +## For `torchdynamo` tests +## (see https://github.com/huggingface/transformers/pull/17765) +#RUN git clone https://github.com/pytorch/functorch +#RUN python3 -m pip install --no-cache-dir ./functorch[aot] +#RUN cd functorch && python3 setup.py develop +# +#RUN git clone https://github.com/pytorch/torchdynamo +#RUN python3 -m pip install -r ./torchdynamo/requirements.txt +#RUN cd torchdynamo && python3 setup.py develop +# +## install TensorRT +#RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex +#RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 +# +## install torch_tensorrt (fx path) +#RUN git clone https://github.com/pytorch/TensorRT.git +#RUN cd TensorRT/py && python3 setup.py install --fx-only # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index d34dcc116aeb..702a837abd01 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,19 +9,20 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] # If set to nothing, will install the latest version -ARG PYTORCH='1.13.0' +ARG PYTORCH='2.1.0' ARG TORCH_VISION='' ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu116' +ARG CUDA='cu118' RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] + RUN python3 -m pip uninstall -y tensorflow flax RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile index 09e8512f2ce8..df9039a0c4d2 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing] # If set to nothing, will install the latest version -ARG TENSORFLOW='2.11' +ARG TENSORFLOW='2.13' RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION RUN python3 -m pip uninstall -y torch flax diff --git a/docs/README.md b/docs/README.md index 9aa74d4de94b..9269cc5bd291 100644 --- a/docs/README.md +++ b/docs/README.md @@ -81,10 +81,10 @@ The `preview` command only works with existing doc files. When you add a complet ## Adding a new element to the navigation bar -Accepted files are Markdown (.md or .mdx). +Accepted files are Markdown (.md). Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting -the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/main/docs/source/_toctree.yml) file. +the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml) file. 
## Renaming section headers and moving sections @@ -109,7 +109,7 @@ Sections that were moved: Use the relative style to link to the new file so that the versioned docs continue to work. -For an example of a rich moved section set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.mdx). +For an example of a rich moved section set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md). ## Writing Documentation - Specification @@ -138,7 +138,7 @@ When translating, refer to the guide at [./TRANSLATING.md](https://github.com/hu When adding a new model: -- Create a file `xxx.mdx` or under `./source/model_doc` (don't hesitate to copy an existing file as template). +- Create a file `xxx.md` or under `./source/model_doc` (don't hesitate to copy an existing file as template). - Link that file in `./source/_toctree.yml`. - Write a short overview of the model: - Overview with paper & authors @@ -147,7 +147,7 @@ When adding a new model: - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow. The order is generally: - - Configuration, + - Configuration - Tokenizer - PyTorch base model - PyTorch head models @@ -364,25 +364,9 @@ We use pytests' [doctest integration](https://docs.pytest.org/doctest.html) to v For Transformers, the doctests are run on a daily basis via GitHub Actions as can be seen [here](https://github.com/huggingface/transformers/actions/workflows/doctests.yml). -To include your example in the daily doctests, you need to add the filename that -contains the example docstring to the [documentation_tests.txt](../utils/documentation_tests.txt). 
- ### For Python files -You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files): - -```bash -python utils/prepare_for_doc_test.py src docs -``` - -If you work on a specific python module, say `modeling_wav2vec2.py`, you can run the command as follows (to avoid the unnecessary temporary changes in irrelevant files): - -```bash -python utils/prepare_for_doc_test.py src/transformers/utils/doc.py src/transformers/models/wav2vec2/modeling_wav2vec2.py -``` -(`utils/doc.py` should always be included) - -Then you can run all the tests in the docstrings of a given file with the following command, here is how we test the modeling file of Wav2Vec2 for instance: +Run all the tests in the docstrings of a given file with the following command, here is how we test the modeling file of Wav2Vec2 for instance: ```bash pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py -sv --doctest-continue-on-failure @@ -394,30 +378,12 @@ If you want to isolate a specific docstring, just add `::` after the file name t pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py::transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward -sv --doctest-continue-on-failure ``` -Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing: - -```bash -python utils/prepare_for_doc_test.py src docs --remove_new_line -``` - ### For Markdown files -You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files): - -```bash -python utils/prepare_for_doc_test.py src docs -``` - -Then you can test locally a given file with this command (here testing the quicktour): - -```bash -pytest --doctest-modules docs/source/quicktour.mdx -sv --doctest-continue-on-failure --doctest-glob="*.mdx" -``` - -Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing: +You can test locally a given file with this command (here testing the quicktour): ```bash -python utils/prepare_for_doc_test.py src docs --remove_new_line +pytest --doctest-modules docs/source/quicktour.md -sv --doctest-continue-on-failure --doctest-glob="*.md" ``` ### Writing doctests diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md index c6f5c45baf02..420e7a8b16a1 100644 --- a/docs/TRANSLATING.md +++ b/docs/TRANSLATING.md @@ -54,4 +54,4 @@ The fields you should add are `local` (with the name of the file containing the Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter. -> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @sgugger. +> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu and @MKhalusova. 
diff --git a/docs/source/de/_toctree.yml b/docs/source/de/_toctree.yml index 8b15c2c53e7c..d18a14ce9298 100644 --- a/docs/source/de/_toctree.yml +++ b/docs/source/de/_toctree.yml @@ -15,8 +15,28 @@ title: Vorverarbeiten - local: training title: Optimierung eines vortrainierten Modells + - local: run_scripts + title: Trainieren mit einem Skript - local: accelerate title: Verteiltes Training mit 🤗 Accelerate + - local: peft + title: Laden und Trainieren von Adaptern mit 🤗 PEFT - local: model_sharing title: Ein Modell teilen + - local: transformers_agents + title: Agents + - local: llm_tutorial + title: Generation with LLMs title: Tutorials +- sections: + - local: add_new_model + title: Wie fügt man ein Modell zu 🤗 Transformers hinzu? + - local: add_tensorflow_model + title: Wie konvertiert man ein 🤗 Transformers-Modell in TensorFlow? + - local: add_new_pipeline + title: Wie fügt man eine Pipeline zu 🤗 Transformers hinzu? + - local: testing + title: Testen + - local: pr_checks + title: Überprüfung einer Pull Request + title: Contribute \ No newline at end of file diff --git a/docs/source/de/accelerate.md b/docs/source/de/accelerate.md new file mode 100644 index 000000000000..98a11cbdc417 --- /dev/null +++ b/docs/source/de/accelerate.md @@ -0,0 +1,136 @@ + + +# Verteiltes Training mit 🤗 Accelerate + +Da die Modelle immer größer werden, hat sich die Parallelität als Strategie zum Trainieren größerer Modelle auf begrenzter Hardware und zur Beschleunigung der Trainingsgeschwindigkeit um mehrere Größenordnungen erwiesen. Bei Hugging Face haben wir die Bibliothek [🤗 Accelerate](https://huggingface.co/docs/accelerate) entwickelt, um Nutzern zu helfen, ein 🤗 Transformers-Modell auf jeder Art von verteiltem Setup zu trainieren, egal ob es sich um mehrere GPUs auf einer Maschine oder mehrere GPUs auf mehreren Maschinen handelt. In diesem Tutorial lernen Sie, wie Sie Ihre native PyTorch-Trainingsschleife anpassen, um das Training in einer verteilten Umgebung zu ermöglichen. + +## Einrichtung + +Beginnen Sie mit der Installation von 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Dann importieren und erstellen Sie ein [`~accelerate.Accelerator`]-Objekt. Der [`~accelerate.Accelerator`] wird automatisch Ihre Art der verteilten Einrichtung erkennen und alle notwendigen Komponenten für das Training initialisieren. Sie müssen Ihr Modell nicht explizit auf einem Gerät platzieren. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Vorbereiten auf die Beschleunigung + +Der nächste Schritt ist die Übergabe aller relevanten Trainingsobjekte an die Methode [`~accelerate.Accelerator.prepare`]. Dazu gehören Ihre Trainings- und Evaluierungs-DataLoader, ein Modell und ein Optimierer: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Rückwärts + +Die letzte Ergänzung besteht darin, das typische `loss.backward()` in der Trainingsschleife durch die 🤗 Accelerate-Methode [`~accelerate.Accelerator.backward`] zu ersetzen: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +Wie Sie im folgenden Code sehen können, müssen Sie nur vier zusätzliche Codezeilen zu Ihrer Trainingsschleife hinzufügen, um verteiltes Training zu ermöglichen! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Trainieren + +Sobald Sie die entsprechenden Codezeilen hinzugefügt haben, starten Sie Ihr Training in einem Skript oder einem Notebook wie Colaboratory. + +### Trainieren mit einem Skript + +Wenn Sie Ihr Training mit einem Skript durchführen, führen Sie den folgenden Befehl aus, um eine Konfigurationsdatei zu erstellen und zu speichern: + +```bash +accelerate config +``` + +Dann starten Sie Ihr Training mit: + +```bash +accelerate launch train.py +``` + +### Trainieren mit einem Notebook + +🤗 Accelerate kann auch in einem Notebook laufen, wenn Sie planen, die TPUs von Colaboratory zu verwenden. Verpacken Sie den gesamten Code, der für das Training verantwortlich ist, in eine Funktion und übergeben Sie diese an [`~accelerate.notebook_launcher`]: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +Weitere Informationen über 🤗 Accelerate und seine umfangreichen Funktionen finden Sie in der [Dokumentation](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/de/accelerate.mdx b/docs/source/de/accelerate.mdx deleted file mode 100644 index 64f85f205f8a..000000000000 --- a/docs/source/de/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Verteiltes Training mit 🤗 Accelerate - -Da die Modelle immer größer werden, hat sich die Parallelität als Strategie zum Trainieren größerer Modelle auf begrenzter Hardware und zur Beschleunigung der Trainingsgeschwindigkeit um mehrere Größenordnungen erwiesen. Bei Hugging Face haben wir die Bibliothek [🤗 Accelerate](https://huggingface.co/docs/accelerate) entwickelt, um Nutzern zu helfen, ein 🤗 Transformers-Modell auf jeder Art von verteiltem Setup zu trainieren, egal ob es sich um mehrere GPUs auf einer Maschine oder mehrere GPUs auf mehreren Maschinen handelt. In diesem Tutorial lernen Sie, wie Sie Ihre native PyTorch-Trainingsschleife anpassen, um das Training in einer verteilten Umgebung zu ermöglichen. - -## Einrichtung - -Beginnen Sie mit der Installation von 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Dann importieren und erstellen Sie ein [`~accelerate.Accelerator`]-Objekt. Der [`~accelerate.Accelerator`] wird automatisch Ihre Art der verteilten Einrichtung erkennen und alle notwendigen Komponenten für das Training initialisieren. Sie müssen Ihr Modell nicht explizit auf einem Gerät platzieren. 
- -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Vorbereiten auf die Beschleunigung - -Der nächste Schritt ist die Übergabe aller relevanten Trainingsobjekte an die Methode [`~accelerate.Accelerator.prepare`]. Dazu gehören Ihre Trainings- und Evaluierungs-DataLoader, ein Modell und ein Optimierer: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Rückwärts - -Die letzte Ergänzung besteht darin, das typische `loss.backward()` in der Trainingsschleife durch die 🤗 Accelerate-Methode [`~accelerate.Accelerator.backward`] zu ersetzen: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -Wie Sie im folgenden Code sehen können, müssen Sie nur vier zusätzliche Codezeilen zu Ihrer Trainingsschleife hinzufügen, um verteiltes Training zu ermöglichen! - -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Trainieren - -Sobald Sie die entsprechenden Codezeilen hinzugefügt haben, starten Sie Ihr Training in einem Skript oder einem Notebook wie Colaboratory. - -### Trainieren mit einem Skript - -Wenn Sie Ihr Training mit einem Skript durchführen, führen Sie den folgenden Befehl aus, um eine Konfigurationsdatei zu erstellen und zu speichern: - -```bash -accelerate config -``` - -Dann starten Sie Ihr Training mit: - -```bash -accelerate launch train.py -``` - -### Trainieren mit einem Notebook - -🤗 Accelerate kann auch in einem Notebook laufen, wenn Sie planen, die TPUs von Colaboratory zu verwenden. Verpacken Sie den gesamten Code, der für das Training verantwortlich ist, in eine Funktion und übergeben Sie diese an [`~accelerate.notebook_launcher`]: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -Weitere Informationen über 🤗 Accelerate und seine umfangreichen Funktionen finden Sie in der [Dokumentation](https://huggingface.co/docs/accelerate). 
\ No newline at end of file diff --git a/docs/source/de/add_new_model.md b/docs/source/de/add_new_model.md new file mode 100644 index 000000000000..2c1f0f6a00ad --- /dev/null +++ b/docs/source/de/add_new_model.md @@ -0,0 +1,895 @@ + + +# Wie kann ich ein Modell zu 🤗 Transformers hinzufügen? + +Die 🤗 Transformers-Bibliothek ist dank der Beiträge der Community oft in der Lage, neue Modelle anzubieten. Aber das kann ein anspruchsvolles Projekt sein und erfordert eine eingehende Kenntnis der 🤗 Transformers-Bibliothek und des zu implementierenden Modells. Bei Hugging Face versuchen wir, mehr Mitgliedern der Community die Möglichkeit zu geben, aktiv Modelle hinzuzufügen, und wir haben diese Anleitung zusammengestellt, die Sie durch den Prozess des Hinzufügens eines PyTorch-Modells führt (stellen Sie sicher, dass Sie [PyTorch installiert haben](https://pytorch.org/get-started/locally/)). + + + +Wenn Sie daran interessiert sind, ein TensorFlow-Modell zu implementieren, werfen Sie einen Blick in die Anleitung [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model)! + + + +Auf dem Weg dorthin, werden Sie: + +- Einblicke in bewährte Open-Source-Verfahren erhalten +- die Konstruktionsprinzipien hinter einer der beliebtesten Deep-Learning-Bibliotheken verstehen +- lernen Sie, wie Sie große Modelle effizient testen können +- lernen Sie, wie Sie Python-Hilfsprogramme wie `black`, `ruff` und `make fix-copies` integrieren, um sauberen und lesbaren Code zu gewährleisten + +Ein Mitglied des Hugging Face-Teams wird Ihnen dabei zur Seite stehen, damit Sie nicht alleine sind. 🤗 ❤️ + +Um loszulegen, öffnen Sie eine [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) Ausgabe für das Modell, das Sie in 🤗 Transformers sehen möchten. Wenn Sie nicht besonders wählerisch sind, wenn es darum geht, ein bestimmtes Modell beizusteuern, können Sie nach dem [New model label](https://github.com/huggingface/transformers/labels/New%20model) filtern, um zu sehen, ob es noch unbeanspruchte Modellanfragen gibt, und daran arbeiten. + +Sobald Sie eine neue Modellanfrage eröffnet haben, sollten Sie sich zunächst mit 🤗 Transformers vertraut machen, falls Sie das noch nicht sind! + +## Allgemeiner Überblick über 🤗 Transformers + +Zunächst sollten Sie sich einen allgemeinen Überblick über 🤗 Transformers verschaffen. 🤗 Transformers ist eine sehr meinungsfreudige Bibliothek, es ist also möglich, dass +Es besteht also die Möglichkeit, dass Sie mit einigen der Philosophien oder Designentscheidungen der Bibliothek nicht einverstanden sind. Aus unserer Erfahrung heraus haben wir jedoch +dass die grundlegenden Designentscheidungen und Philosophien der Bibliothek entscheidend sind, um 🤗 Transformers effizient zu skalieren. +Transformatoren zu skalieren und gleichzeitig die Wartungskosten auf einem vernünftigen Niveau zu halten. + +Ein guter erster Ansatzpunkt, um die Bibliothek besser zu verstehen, ist die Lektüre der [Dokumentation unserer Philosophie](Philosophie). 
Als Ergebnis unserer Arbeitsweise gibt es einige Entscheidungen, die wir versuchen, auf alle Modelle anzuwenden: + +- Komposition wird im Allgemeinen gegenüber Abstraktion bevorzugt +- Die Duplizierung von Code ist nicht immer schlecht, wenn sie die Lesbarkeit oder Zugänglichkeit eines Modells stark verbessert +- Modelldateien sind so in sich geschlossen wie möglich, so dass Sie, wenn Sie den Code eines bestimmten Modells lesen, idealerweise nur + in die entsprechende Datei `modeling_....py` schauen müssen. + +Unserer Meinung nach ist der Code der Bibliothek nicht nur ein Mittel, um ein Produkt bereitzustellen, *z.B.* die Möglichkeit, BERT für +Inferenz zu verwenden, sondern auch als das Produkt selbst, das wir verbessern wollen. Wenn Sie also ein Modell hinzufügen, ist der Benutzer nicht nur die +Person, die Ihr Modell verwenden wird, sondern auch jeder, der Ihren Code liest, zu verstehen versucht und ihn möglicherweise verbessert. + +Lassen Sie uns daher ein wenig tiefer in das allgemeine Design der Bibliothek einsteigen. + +### Überblick über die Modelle + +Um ein Modell erfolgreich hinzuzufügen, ist es wichtig, die Interaktion zwischen Ihrem Modell und seiner Konfiguration zu verstehen, +[`PreTrainedModel`] und [`PretrainedConfig`]. Als Beispiel werden wir +das Modell, das zu 🤗 Transformers hinzugefügt werden soll, `BrandNewBert` nennen. + +Schauen wir uns das mal an: + + + +Wie Sie sehen, machen wir in 🤗 Transformers von der Vererbung Gebrauch, aber wir beschränken die Abstraktionsebene auf ein absolutes Minimum. +Minimum. Es gibt nie mehr als zwei Abstraktionsebenen für ein Modell in der Bibliothek. `BrandNewBertModel` +erbt von `BrandNewBertPreTrainedModel`, das wiederum von [`PreTrainedModel`] erbt und +das war's. In der Regel wollen wir sicherstellen, dass ein neues Modell nur von +[`PreTrainedModel`] abhängt. Die wichtigen Funktionalitäten, die jedem neuen Modell automatisch zur Verfügung gestellt werden, sind +Modell automatisch bereitgestellt werden, sind [`~PreTrainedModel.from_pretrained`] und +[`~PreTrainedModel.save_pretrained`], die für die Serialisierung und Deserialisierung verwendet werden. Alle +anderen wichtigen Funktionalitäten, wie `BrandNewBertModel.forward` sollten vollständig in der neuen +Skript `modeling_brand_new_bert.py` definiert werden. Als nächstes wollen wir sicherstellen, dass ein Modell mit einer bestimmten Kopfebene, wie z.B. +`BrandNewBertForMaskedLM` nicht von `BrandNewBertModel` erbt, sondern `BrandNewBertModel` verwendet +als Komponente, die im Forward Pass aufgerufen werden kann, um die Abstraktionsebene niedrig zu halten. Jedes neue Modell erfordert eine +Konfigurationsklasse, genannt `BrandNewBertConfig`. Diese Konfiguration wird immer als ein Attribut in +[PreTrainedModel] gespeichert und kann daher über das Attribut `config` für alle Klassen aufgerufen werden +die von `BrandNewBertPreTrainedModel` erben: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +``` + +Ähnlich wie das Modell erbt die Konfiguration grundlegende Serialisierungs- und Deserialisierungsfunktionalitäten von +[`PretrainedConfig`]. Beachten Sie, dass die Konfiguration und das Modell immer in zwei verschiedene Formate serialisiert werden +unterschiedliche Formate serialisiert werden - das Modell in eine *pytorch_model.bin* Datei und die Konfiguration in eine *config.json* Datei. 
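+
+Zur Veranschaulichung eine minimale Skizze (der Checkpoint-Name ist derselbe Platzhalter wie im Beispiel oben, der Zielpfad ist frei gewählt):
+
+```python
+# Minimale Skizze: Beim Speichern landen Gewichte und Konfiguration nebeneinander
+# in einem Verzeichnis.
+model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
+model.save_pretrained("./brand_new_bert")
+# ./brand_new_bert/pytorch_model.bin  <- Modellgewichte
+# ./brand_new_bert/config.json        <- Konfiguration
+```
+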
Aufruf von +[~PreTrainedModel.save_pretrained`] wird automatisch +[~PretrainedConfig.save_pretrained`] auf, so dass sowohl das Modell als auch die Konfiguration gespeichert werden. + + +### Code-Stil + +Wenn Sie Ihr neues Modell kodieren, sollten Sie daran denken, dass Transformers eine Bibliothek mit vielen Meinungen ist und dass wir selbst ein paar Macken haben +wie der Code geschrieben werden sollte :-) + +1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen + Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem + `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) + für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). +2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten + beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen. + Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife. +3. Generell ziehen wir längeren expliziten Code einem kurzen magischen Code vor. +4. Vermeiden Sie die Unterklassifizierung von `nn.Sequential` in PyTorch, sondern unterklassifizieren Sie `nn.Module` und schreiben Sie den Vorwärtspass, so dass jeder + so dass jeder, der Ihren Code verwendet, ihn schnell debuggen kann, indem er Druckanweisungen oder Haltepunkte hinzufügt. +5. Ihre Funktionssignatur sollte mit einer Typ-Annotation versehen sein. Im Übrigen sind gute Variablennamen viel lesbarer und verständlicher + verständlicher als Typ-Anmerkungen. + +### Übersicht der Tokenizer + +Noch nicht ganz fertig :-( Dieser Abschnitt wird bald hinzugefügt! + +## Schritt-für-Schritt-Rezept zum Hinzufügen eines Modells zu 🤗 Transformers + +Jeder hat andere Vorlieben, was die Portierung eines Modells angeht. Daher kann es sehr hilfreich sein, wenn Sie sich Zusammenfassungen ansehen +wie andere Mitwirkende Modelle auf Hugging Face portiert haben. Hier ist eine Liste von Blogbeiträgen aus der Community, wie man ein Modell portiert: + +1. [Portierung eines GPT2-Modells](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) von [Thomas](https://huggingface.co/thomwolf) +2. [Portierung des WMT19 MT-Modells](https://huggingface.co/blog/porting-fsmt) von [Stas](https://huggingface.co/stas) + +Aus Erfahrung können wir Ihnen sagen, dass die wichtigsten Dinge, die Sie beim Hinzufügen eines Modells beachten müssen, sind: + +- Erfinden Sie das Rad nicht neu! Die meisten Teile des Codes, den Sie für das neue 🤗 Transformers-Modell hinzufügen werden, existieren bereits + irgendwo in 🤗 Transformers. Nehmen Sie sich etwas Zeit, um ähnliche, bereits vorhandene Modelle und Tokenizer zu finden, die Sie kopieren können + von. [grep](https://www.gnu.org/software/grep/) und [rg](https://github.com/BurntSushi/ripgrep) sind Ihre + Freunde. Beachten Sie, dass es sehr gut möglich ist, dass der Tokenizer Ihres Modells auf einer Modellimplementierung basiert und + und der Modellierungscode Ihres Modells auf einer anderen. *Z.B.* Der Modellierungscode von FSMT basiert auf BART, während der Tokenizer-Code von FSMT + auf XLM basiert. 
+- Es handelt sich eher um eine technische als um eine wissenschaftliche Herausforderung. Sie sollten mehr Zeit darauf verwenden, eine effiziente Debugging-Umgebung zu schaffen, als zu versuchen, alle theoretischen Aspekte des Modells im Papier zu verstehen.
+- Bitten Sie um Hilfe, wenn Sie nicht weiterkommen! Modelle sind der Kernbestandteil von 🤗 Transformers, daher helfen wir Ihnen bei Hugging Face mehr als gerne bei jedem Schritt, Ihr Modell hinzuzufügen. Zögern Sie nicht zu fragen, wenn Sie merken, dass Sie keine Fortschritte machen.
+
+Im Folgenden versuchen wir, Ihnen ein allgemeines Rezept an die Hand zu geben, das uns bei der Portierung eines Modells auf 🤗 Transformers am nützlichsten erschien.
+
+Die folgende Liste ist eine Zusammenfassung all dessen, was getan werden muss, um ein Modell hinzuzufügen, und kann von Ihnen als To-Do-Liste verwendet werden:
+
+☐ (Optional) Verstehen der theoretischen Aspekte des Modells
+☐ Vorbereiten der 🤗 Transformers-Entwicklungsumgebung
+☐ Debugging-Umgebung des ursprünglichen Repositorys eingerichtet
+☐ Skript erstellt, das den Durchlauf `forward()` unter Verwendung des ursprünglichen Repositorys und des Checkpoints erfolgreich durchführt
+☐ Erfolgreich das Modellskelett zu 🤗 Transformers hinzugefügt
+☐ Erfolgreiche Umwandlung des ursprünglichen Prüfpunkts in den 🤗 Transformers-Prüfpunkt
+☐ Erfolgreich den Durchlauf `forward()` in 🤗 Transformers ausgeführt, der eine identische Ausgabe wie der ursprüngliche Prüfpunkt liefert
+☐ Modell-Tests in 🤗 Transformers abgeschlossen
+☐ Erfolgreich Tokenizer in 🤗 Transformers hinzugefügt
+☐ End-to-End-Integrationstests ausgeführt
+☐ Docs fertiggestellt
+☐ Modellgewichte in den Hub hochgeladen
+☐ Die Pull-Anfrage eingereicht
+☐ (Optional) Hinzufügen eines Demo-Notizbuchs + +Für den Anfang empfehlen wir in der Regel, mit einem guten theoretischen Verständnis von `BrandNewBert` zu beginnen. Wie auch immer, +wenn Sie es vorziehen, die theoretischen Aspekte des Modells *on-the-job* zu verstehen, dann ist es völlig in Ordnung, direkt in die +in die Code-Basis von `BrandNewBert` einzutauchen. Diese Option könnte für Sie besser geeignet sein, wenn Ihre technischen Fähigkeiten besser sind als +als Ihre theoretischen Fähigkeiten, wenn Sie Schwierigkeiten haben, die Arbeit von `BrandNewBert` zu verstehen, oder wenn Sie einfach Spaß am Programmieren +mehr Spaß am Programmieren haben als am Lesen wissenschaftlicher Abhandlungen. + +### 1. (Optional) Theoretische Aspekte von BrandNewBert + +Sie sollten sich etwas Zeit nehmen, um die Abhandlung von *BrandNewBert* zu lesen, falls eine solche Beschreibung existiert. Möglicherweise gibt es große +Abschnitte des Papiers, die schwer zu verstehen sind. Wenn das der Fall ist, ist das in Ordnung - machen Sie sich keine Sorgen! Das Ziel ist +ist es nicht, ein tiefes theoretisches Verständnis des Papiers zu erlangen, sondern die notwendigen Informationen zu extrahieren, um +das Modell effektiv in 🤗 Transformers zu implementieren. Das heißt, Sie müssen nicht zu viel Zeit auf die +theoretischen Aspekten verbringen, sondern sich lieber auf die praktischen Aspekte konzentrieren, nämlich: + +- Welche Art von Modell ist *brand_new_bert*? BERT-ähnliches Modell nur für den Encoder? GPT2-ähnliches reines Decoder-Modell? BART-ähnliches + Encoder-Decoder-Modell? Sehen Sie sich die [model_summary](model_summary) an, wenn Sie mit den Unterschieden zwischen diesen Modellen nicht vertraut sind. +- Was sind die Anwendungen von *brand_new_bert*? Textklassifizierung? Texterzeugung? Seq2Seq-Aufgaben, *z.B.,* + Zusammenfassungen? +- Was ist die neue Eigenschaft des Modells, die es von BERT/GPT-2/BART unterscheidet? +- Welches der bereits existierenden [🤗 Transformers-Modelle](https://huggingface.co/transformers/#contents) ist am ähnlichsten + ähnlich wie *brand_new_bert*? +- Welche Art von Tokenizer wird verwendet? Ein Satzteil-Tokenisierer? Ein Wortstück-Tokenisierer? Ist es derselbe Tokenisierer, der für + für BERT oder BART? + +Nachdem Sie das Gefühl haben, einen guten Überblick über die Architektur des Modells erhalten zu haben, können Sie dem +Hugging Face Team schreiben und Ihre Fragen stellen. Dazu können Fragen zur Architektur des Modells gehören, +seiner Aufmerksamkeitsebene usw. Wir werden Ihnen gerne weiterhelfen. + +### 2. Bereiten Sie als nächstes Ihre Umgebung vor + +1. Forken Sie das [Repository](https://github.com/huggingface/transformers), indem Sie auf der Seite des Repositorys auf die Schaltfläche 'Fork' klicken. + Seite des Repositorys klicken. Dadurch wird eine Kopie des Codes unter Ihrem GitHub-Benutzerkonto erstellt. + +2. Klonen Sie Ihren `transformers` Fork auf Ihre lokale Festplatte und fügen Sie das Basis-Repository als Remote hinzu: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Richten Sie eine Entwicklungsumgebung ein, indem Sie z.B. den folgenden Befehl ausführen: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Abhängig von Ihrem Betriebssystem und da die Anzahl der optionalen Abhängigkeiten von Transformers wächst, kann es sein, dass Sie bei diesem Befehl einen +Fehler mit diesem Befehl. 
Stellen Sie in diesem Fall sicher, dass Sie das Deep Learning Framework, mit dem Sie arbeiten, installieren +(PyTorch, TensorFlow und/oder Flax) und führen Sie es aus: + +```bash +pip install -e ".[quality]" +``` + +was für die meisten Anwendungsfälle ausreichend sein sollte. Sie können dann zum übergeordneten Verzeichnis zurückkehren + +```bash +cd .. +``` + +4. Wir empfehlen, die PyTorch-Version von *brand_new_bert* zu Transformers hinzuzufügen. Um PyTorch zu installieren, folgen Sie bitte den + Anweisungen auf https://pytorch.org/get-started/locally/. + +**Anmerkung:** Sie müssen CUDA nicht installiert haben. Es reicht aus, das neue Modell auf der CPU zum Laufen zu bringen. + +5. Um *brand_new_bert* zu portieren, benötigen Sie außerdem Zugriff auf das Original-Repository: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +Jetzt haben Sie eine Entwicklungsumgebung eingerichtet, um *brand_new_bert* auf 🤗 Transformers zu portieren. + +### 3.-4. Führen Sie einen Pre-Training-Checkpoint mit dem Original-Repository durch + +Zunächst werden Sie mit dem ursprünglichen *brand_new_bert* Repository arbeiten. Oft ist die ursprüngliche Implementierung sehr +"forschungslastig". Das bedeutet, dass es an Dokumentation mangeln kann und der Code schwer zu verstehen sein kann. Aber das sollte +genau Ihre Motivation sein, *brand_new_bert* neu zu implementieren. Eines unserer Hauptziele bei Hugging Face ist es, *die Menschen dazu zu bringen +auf den Schultern von Giganten zu stehen*, was sich hier sehr gut darin ausdrückt, dass wir ein funktionierendes Modell nehmen und es umschreiben, um es so +es so **zugänglich, benutzerfreundlich und schön** wie möglich zu machen. Dies ist die wichtigste Motivation für die Neuimplementierung von +Modelle in 🤗 Transformers umzuwandeln - der Versuch, komplexe neue NLP-Technologie für **jeden** zugänglich zu machen. + +Sie sollten damit beginnen, indem Sie in das Original-Repository eintauchen. + +Die erfolgreiche Ausführung des offiziellen Pre-Trainingsmodells im Original-Repository ist oft **der schwierigste** Schritt. +Unserer Erfahrung nach ist es sehr wichtig, dass Sie einige Zeit damit verbringen, sich mit der ursprünglichen Code-Basis vertraut zu machen. Sie müssen +das Folgende herausfinden: + +- Wo finden Sie die vortrainierten Gewichte? +- Wie lädt man die vorab trainierten Gewichte in das entsprechende Modell? +- Wie kann der Tokenizer unabhängig vom Modell ausgeführt werden? +- Verfolgen Sie einen Forward Pass, damit Sie wissen, welche Klassen und Funktionen für einen einfachen Forward Pass erforderlich sind. Normalerweise, + müssen Sie nur diese Funktionen reimplementieren. +- Sie müssen in der Lage sein, die wichtigen Komponenten des Modells zu finden: Wo befindet sich die Klasse des Modells? Gibt es Unterklassen des Modells, + *z.B.* EncoderModel, DecoderModel? Wo befindet sich die Selbstaufmerksamkeitsschicht? Gibt es mehrere verschiedene Aufmerksamkeitsebenen, + *z.B.* *Selbstaufmerksamkeit*, *Kreuzaufmerksamkeit*...? +- Wie können Sie das Modell in der ursprünglichen Umgebung des Repo debuggen? Müssen Sie *print* Anweisungen hinzufügen, können Sie + mit einem interaktiven Debugger wie *ipdb* arbeiten oder sollten Sie eine effiziente IDE zum Debuggen des Modells verwenden, wie z.B. PyCharm? + +Es ist sehr wichtig, dass Sie, bevor Sie mit der Portierung beginnen, den Code im Original-Repository **effizient** debuggen können +Repository können! 
Denken Sie auch daran, dass Sie mit einer Open-Source-Bibliothek arbeiten, also zögern Sie nicht, ein Problem oder +oder sogar eine Pull-Anfrage im Original-Repository zu stellen. Die Betreuer dieses Repositorys sind wahrscheinlich sehr froh darüber +dass jemand in ihren Code schaut! + +An diesem Punkt liegt es wirklich an Ihnen, welche Debugging-Umgebung und Strategie Sie zum Debuggen des ursprünglichen +Modell zu debuggen. Wir raten dringend davon ab, eine kostspielige GPU-Umgebung einzurichten, sondern arbeiten Sie einfach auf einer CPU, sowohl wenn Sie mit dem +in das ursprüngliche Repository einzutauchen und auch, wenn Sie beginnen, die 🤗 Transformers-Implementierung des Modells zu schreiben. Nur +ganz am Ende, wenn das Modell bereits erfolgreich auf 🤗 Transformers portiert wurde, sollte man überprüfen, ob das +Modell auch auf der GPU wie erwartet funktioniert. + +Im Allgemeinen gibt es zwei mögliche Debugging-Umgebungen für die Ausführung des Originalmodells + +- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) +- Lokale Python-Skripte. + +Jupyter-Notebooks haben den Vorteil, dass sie eine zellenweise Ausführung ermöglichen, was hilfreich sein kann, um logische Komponenten besser voneinander zu trennen und +logische Komponenten voneinander zu trennen und schnellere Debugging-Zyklen zu haben, da Zwischenergebnisse gespeichert werden können. Außerdem, +Außerdem lassen sich Notebooks oft leichter mit anderen Mitwirkenden teilen, was sehr hilfreich sein kann, wenn Sie das Hugging Face Team um Hilfe bitten möchten. +Face Team um Hilfe bitten. Wenn Sie mit Jupyter-Notizbüchern vertraut sind, empfehlen wir Ihnen dringend, mit ihnen zu arbeiten. + +Der offensichtliche Nachteil von Jupyter-Notizbüchern ist, dass Sie, wenn Sie nicht daran gewöhnt sind, mit ihnen zu arbeiten, einige Zeit damit verbringen müssen +einige Zeit damit verbringen müssen, sich an die neue Programmierumgebung zu gewöhnen, und dass Sie möglicherweise Ihre bekannten Debugging-Tools nicht mehr verwenden können +wie z.B. `ipdb` nicht mehr verwenden können. + +Für jede Codebasis ist es immer ein guter erster Schritt, einen **kleinen** vortrainierten Checkpoint zu laden und in der Lage zu sein, einen +einzelnen Vorwärtsdurchlauf mit einem Dummy-Integer-Vektor von Eingabe-IDs als Eingabe zu reproduzieren. Ein solches Skript könnte wie folgt aussehen (in +Pseudocode): + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +Was die Debugging-Strategie anbelangt, so können Sie im Allgemeinen aus mehreren Strategien wählen: + +- Zerlegen Sie das ursprüngliche Modell in viele kleine testbare Komponenten und führen Sie für jede dieser Komponenten einen Vorwärtsdurchlauf zur + Überprüfung +- Zerlegen Sie das ursprüngliche Modell nur in den ursprünglichen *Tokenizer* und das ursprüngliche *Modell*, führen Sie einen Vorwärtsdurchlauf für diese Komponenten durch + und verwenden Sie dazwischenliegende Druckanweisungen oder Haltepunkte zur Überprüfung. + +Auch hier bleibt es Ihnen überlassen, welche Strategie Sie wählen. Oft ist die eine oder die andere Strategie vorteilhaft, je nach der ursprünglichen Codebasis +Basis. 
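+
+Wenn das Originalmodell in PyTorch geschrieben ist, lassen sich solche Zwischenausgaben zum Beispiel mit Forward-Hooks abgreifen, ohne den Originalcode zu verändern. Die folgende Skizze ist nur eine Annahme: `model` und der Pfad `model.encoder.layers[0]` zur ersten Transformer-Schicht sind Platzhalter für das jeweilige Originalmodell.
+
+```python
+# Skizze (nur zur Veranschaulichung): Zwischenausgabe einer Schicht per Forward-Hook
+# mitschreiben, um sie später mit der 🤗 Transformers-Version zu vergleichen.
+import torch
+
+captured = {}
+
+def save_output(name):
+    def hook(module, inputs, output):
+        captured[name] = output
+    return hook
+
+# Platzhalter: erste Transformer-Schicht des Originalmodells
+handle = model.encoder.layers[0].register_forward_hook(save_output("layer_0"))
+
+with torch.no_grad():
+    original_output = model(torch.tensor([[0, 4, 5, 2, 3, 7, 9]]))
+
+handle.remove()
+print(captured["layer_0"])  # Zwischenwert zur Überprüfung
+```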
+ +Wenn die ursprüngliche Codebasis es Ihnen erlaubt, das Modell in kleinere Teilkomponenten zu zerlegen, *z.B.* wenn die ursprüngliche +Code-Basis problemlos im Eager-Modus ausgeführt werden kann, lohnt es sich in der Regel, dies zu tun. Es gibt einige wichtige Vorteile +am Anfang den schwierigeren Weg zu gehen: + +- Wenn Sie später das ursprüngliche Modell mit der Hugging Face-Implementierung vergleichen, können Sie automatisch überprüfen, ob + für jede Komponente einzeln überprüfen, ob die entsprechende Komponente der 🤗 Transformers-Implementierung übereinstimmt, anstatt sich auf + anstatt sich auf den visuellen Vergleich über Druckanweisungen zu verlassen +- können Sie das große Problem der Portierung eines Modells in kleinere Probleme der Portierung einzelner Komponenten zerlegen + einzelnen Komponenten zu zerlegen und so Ihre Arbeit besser zu strukturieren +- Die Aufteilung des Modells in logisch sinnvolle Komponenten hilft Ihnen, einen besseren Überblick über das Design des Modells zu bekommen + und somit das Modell besser zu verstehen +- In einem späteren Stadium helfen Ihnen diese komponentenweisen Tests dabei, sicherzustellen, dass keine Regressionen auftreten, während Sie fortfahren + Ihren Code ändern + +[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) Integrationstests für ELECTRA +gibt ein schönes Beispiel dafür, wie dies geschehen kann. + +Wenn die ursprüngliche Codebasis jedoch sehr komplex ist oder nur die Ausführung von Zwischenkomponenten in einem kompilierten Modus erlaubt, +könnte es zu zeitaufwändig oder sogar unmöglich sein, das Modell in kleinere testbare Teilkomponenten zu zerlegen. Ein gutes +Beispiel ist die [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) Bibliothek, die sehr komplex ist +sehr komplex ist und keine einfache Möglichkeit bietet, das Modell in seine Unterkomponenten zu zerlegen. Bei solchen Bibliotheken ist man +oft auf die Überprüfung von Druckanweisungen angewiesen. + +Unabhängig davon, welche Strategie Sie wählen, ist die empfohlene Vorgehensweise oft die gleiche, nämlich dass Sie mit der Fehlersuche in den +die Anfangsebenen zuerst und die Endebenen zuletzt debuggen. + +Es wird empfohlen, dass Sie die Ausgaben der folgenden Ebenen abrufen, entweder durch Druckanweisungen oder Unterkomponentenfunktionen +Schichten in der folgenden Reihenfolge abrufen: + +1. Rufen Sie die Eingabe-IDs ab, die an das Modell übergeben wurden +2. Rufen Sie die Worteinbettungen ab +3. Rufen Sie die Eingabe der ersten Transformer-Schicht ab +4. Rufen Sie die Ausgabe der ersten Transformer-Schicht ab +5. Rufen Sie die Ausgabe der folgenden n - 1 Transformer-Schichten ab +6. 
Rufen Sie die Ausgabe des gesamten BrandNewBert-Modells ab
+
+Die Eingabe-IDs sollten dabei aus einem Array von Ganzzahlen bestehen, *z.B.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`
+
+Die Ausgaben der folgenden Schichten bestehen oft aus mehrdimensionalen Float-Arrays und können wie folgt aussehen:
+
+```
+[[
+ [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
+ [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
+ [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
+ ...,
+ [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
+ [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
+ [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
+```
+
+Wir erwarten, dass jedes zu 🤗 Transformers hinzugefügte Modell eine Reihe von Integrationstests besteht, d.h. das ursprüngliche
+Modell und die neu implementierte Version in 🤗 Transformers müssen bis auf eine Genauigkeit von 0,001 exakt dieselbe Ausgabe liefern!
+Da es normal ist, dass das exakt gleiche Modell, in verschiedenen Bibliotheken geschrieben, je nach Framework eine leicht
+unterschiedliche Ausgabe liefern kann, akzeptieren wir eine Fehlertoleranz von 1e-3 (0,001). Es reicht nicht aus, wenn das Modell
+fast das gleiche Ergebnis liefert, die Ausgaben müssen nahezu identisch sein. Daher werden Sie die Zwischenergebnisse der
+🤗 Transformers-Version sicherlich mehrfach mit den Zwischenergebnissen der ursprünglichen Implementierung von
+*brand_new_bert* vergleichen. Dafür ist eine **effiziente** Debugging-Umgebung des ursprünglichen Repositorys absolut
+wichtig. Hier sind einige Ratschläge, um Ihre Debugging-Umgebung so effizient wie möglich zu gestalten.
+
+- Finden Sie den besten Weg, um Zwischenergebnisse zu debuggen. Ist das ursprüngliche Repository in PyTorch geschrieben?
+  Dann sollten Sie sich wahrscheinlich die Zeit nehmen, ein längeres Skript zu schreiben, das das ursprüngliche Modell in kleinere Unterkomponenten zerlegt, um
+  Zwischenwerte abzurufen. Ist das ursprüngliche Repository in TensorFlow 1 geschrieben? Dann müssen Sie sich möglicherweise auf
+  TensorFlow-Print-Operationen wie [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) verlassen, um die
+  Zwischenwerte auszugeben. Ist das ursprüngliche Repository in Jax geschrieben? Dann stellen Sie sicher, dass das Modell **nicht jitted** ist,
+  wenn Sie den Vorwärtsdurchlauf ausführen, schauen Sie sich *z.B.* [diesen Link](https://github.com/google/jax/issues/196) an.
+- Verwenden Sie den kleinsten vortrainierten Checkpoint, den Sie finden können. Je kleiner der Checkpoint, desto schneller wird Ihr Debugging-Zyklus.
+  Es ist nicht effizient, wenn Ihr vortrainiertes Modell so groß ist, dass Ihr Vorwärtsdurchlauf mehr als 10 Sekunden dauert.
+  Falls nur sehr große Checkpoints verfügbar sind, kann es sinnvoller sein, ein Dummy-Modell mit zufällig initialisierten Gewichten in der neuen
+  Umgebung zu erstellen und diese Gewichte zu speichern, um sie mit der 🤗 Transformers-Version Ihres Modells zu vergleichen.
+- Vergewissern Sie sich, dass Sie den einfachsten Weg wählen, um einen Forward Pass im ursprünglichen Repository aufzurufen. Idealerweise sollten Sie
+  die Funktion im Original-Repository finden, die **nur** einen einzigen Forward Pass aufruft, *d.h.* eine Funktion, die oft
+  `predict`, `evaluate`, `forward` oder `__call__` genannt wird.
+  Sie wollen keine Funktion debuggen, die `forward`
+  mehrfach aufruft, *z.B.* um Text zu erzeugen, wie `autoregressive_sample` oder `generate`.
+- Versuchen Sie, die Tokenisierung vom *Forward*-Pass des Modells zu trennen. Wenn das Original-Repository Beispiele zeigt, bei denen
+  Sie eine Zeichenkette eingeben müssen, dann versuchen Sie herauszufinden, an welcher Stelle im Vorwärtsaufruf die Zeichenketteneingabe in Eingabe-IDs
+  umgewandelt wird, und beginnen Sie an dieser Stelle. Das könnte bedeuten, dass Sie möglicherweise selbst ein kleines Skript schreiben oder den
+  Originalcode so ändern müssen, dass Sie die IDs direkt eingeben können, anstatt eine Zeichenkette einzugeben.
+- Vergewissern Sie sich, dass sich das Modell in Ihrem Debugging-Setup **nicht** im Trainingsmodus befindet. Dieser führt häufig zu
+  zufälligen Ergebnissen, da das Modell mehrere Dropout-Schichten enthält. Stellen Sie sicher, dass der Vorwärtsdurchlauf in Ihrer Debugging-
+  Umgebung **deterministisch** ist, damit die Dropout-Schichten nicht verwendet werden. Oder verwenden Sie *transformers.utils.set_seed*,
+  wenn sich die alte und die neue Implementierung im selben Framework befinden.
+
+Im folgenden Abschnitt finden Sie genauere Details/Tipps, wie Sie dies für *brand_new_bert* tun können.
+
+### 5.-14. Portierung von BrandNewBert auf 🤗 Transformers
+
+Als Nächstes können Sie endlich damit beginnen, neuen Code zu 🤗 Transformers hinzuzufügen. Gehen Sie in den Klon Ihres 🤗 Transformers-Forks:
+
+```bash
+cd transformers
+```
+
+In dem speziellen Fall, dass Sie ein Modell hinzufügen, dessen Architektur genau mit der Modellarchitektur eines bereits
+vorhandenen Modells übereinstimmt, müssen Sie nur ein Konvertierungsskript hinzufügen, wie in [diesem Abschnitt](#write-a-conversion-script) beschrieben.
+In diesem Fall können Sie einfach die gesamte Modellarchitektur des bereits vorhandenen Modells wiederverwenden.
+
+Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Sie haben hier zwei Möglichkeiten:
+
+- `transformers-cli add-new-model-like`, um ein neues Modell wie ein bestehendes hinzuzufügen
+- `transformers-cli add-new-model`, um ein neues Modell aus unserer Vorlage hinzuzufügen (sieht dann aus wie BERT oder BART, je nachdem, welche Art von Modell Sie wählen)
+
+In beiden Fällen werden Sie mit einem Fragebogen aufgefordert, die grundlegenden Informationen zu Ihrem Modell auszufüllen. Für den zweiten Befehl müssen Sie `cookiecutter` installieren, weitere Informationen dazu finden Sie [hier](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
+
+**Eröffnen Sie einen Pull Request auf dem Haupt-Repository huggingface/transformers**
+
+Bevor Sie mit der Anpassung des automatisch generierten Codes beginnen, ist es nun an der Zeit, einen "Work in progress (WIP)"-Pull-Request,
+*z.B.* "[WIP] Add *brand_new_bert*", in 🤗 Transformers zu eröffnen, damit Sie und das Hugging Face Team
+Seite an Seite an der Integration des Modells in 🤗 Transformers arbeiten können.
+
+Sie sollten Folgendes tun:
+
+1. Erstellen Sie einen Branch mit einem beschreibenden Namen, ausgehend von Ihrem main-Branch:
+
+```bash
+git checkout -b add_brand_new_bert
+```
+
+2. Committen Sie den automatisch generierten Code:
+
+```bash
+git add .
+git commit
+```
+
+3. Holen Sie den aktuellen main-Branch und rebasen Sie Ihren Branch darauf:
+
+```bash
+git fetch upstream
+git rebase upstream/main
+```
+
+4.
Übertragen Sie die Änderungen auf Ihr Konto mit:
+
+```bash
+git push -u origin add_brand_new_bert
+```
+
+5. Wenn Sie zufrieden sind, gehen Sie auf die Webseite Ihres Forks auf GitHub. Klicken Sie auf "Pull request". Stellen Sie sicher, dass Sie die
+   GitHub-Handles einiger Mitglieder des Hugging Face-Teams als Reviewer hinzufügen, damit das Hugging Face-Team über
+   zukünftige Änderungen benachrichtigt wird.
+
+6. Ändern Sie den PR in einen Entwurf, indem Sie auf der rechten Seite der GitHub-Pull-Request-Webseite auf "In Entwurf umwandeln" klicken.
+
+Vergessen Sie im Folgenden nicht, Ihre Arbeit zu committen und in Ihr Konto zu pushen, wenn Sie Fortschritte gemacht haben,
+damit sie im Pull Request angezeigt wird. Außerdem sollten Sie darauf achten, Ihre Arbeit von Zeit zu Zeit mit dem aktuellen main-Branch
+zu aktualisieren, indem Sie Folgendes ausführen:
+
+```bash
+git fetch upstream
+git merge upstream/main
+```
+
+Generell sollten Sie alle Fragen, die Sie in Bezug auf das Modell oder Ihre Implementierung haben, in Ihrem PR stellen und
+dort diskutieren/lösen. Auf diese Weise wird das Hugging Face Team immer benachrichtigt, wenn Sie neuen Code einreichen oder
+wenn Sie eine Frage haben. Es ist oft sehr hilfreich, das Hugging Face-Team auf Ihren hinzugefügten Code hinzuweisen, damit es Ihr
+Problem oder Ihre Frage besser verstehen kann.
+
+Gehen Sie dazu auf die Registerkarte "Geänderte Dateien", auf der Sie alle Ihre Änderungen sehen, gehen Sie zu der Zeile, zu der Sie
+eine Frage stellen möchten, und klicken Sie auf das "+"-Symbol, um einen Kommentar hinzuzufügen. Wenn eine Frage oder ein Problem gelöst wurde,
+können Sie auf die Schaltfläche "Lösen" des erstellten Kommentars klicken.
+
+Auf dieselbe Weise wird das Hugging Face-Team Kommentare eröffnen, wenn es Ihren Code überprüft. Wir empfehlen, die meisten Fragen
+auf GitHub in Ihrem PR zu stellen. Einige sehr allgemeine Fragen, die für die Öffentlichkeit nicht sehr nützlich sind, können Sie dem
+Hugging Face Team per Slack oder E-Mail stellen.
+
+**5. Passen Sie den Code der generierten Modelle für brand_new_bert an**
+
+Zunächst werden wir uns nur auf das Modell selbst konzentrieren und uns nicht um den Tokenizer kümmern. Den gesamten relevanten Code sollten Sie
+in den generierten Dateien `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` und
+`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py` finden.
+
+Jetzt können Sie endlich mit dem Programmieren beginnen :). Der generierte Code in
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` wird entweder die gleiche Architektur wie BERT haben, wenn
+es sich um ein reines Encoder-Modell handelt, oder wie BART, wenn es sich um ein Encoder-Decoder-Modell handelt. An diesem Punkt sollten Sie sich daran erinnern,
+was Sie am Anfang über die theoretischen Aspekte des Modells gelernt haben: *Wie unterscheidet sich das Modell von BERT oder
+BART?* Implementieren Sie diese Änderungen, was oft bedeutet, dass Sie die *Selbstaufmerksamkeitsschicht*, die Reihenfolge der Normalisierungsschicht
+usw. ändern müssen. Auch hier ist es oft nützlich, sich die ähnliche Architektur bereits bestehender Modelle in Transformers anzusehen, um
+ein besseres Gefühl dafür zu bekommen, wie Ihr Modell implementiert werden sollte.
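+
+Ein typisches Beispiel für eine solche Abweichung ist die Reihenfolge der Normalisierungsschicht. Die folgende, rein illustrative
+Skizze (sie ist nicht Teil des generierten Codes) stellt eine Post-LayerNorm-Anordnung, wie sie BERT verwendet, einer
+Pre-LayerNorm-Anordnung gegenüber, wie sie viele neuere Modelle verwenden:
+
+```python
+import torch
+from torch import nn
+
+
+class PostLayerNormBlock(nn.Module):
+    """BERT-artig: erst Residual-Addition, dann LayerNorm."""
+
+    def __init__(self, hidden_size=64, num_heads=4):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
+        self.layer_norm = nn.LayerNorm(hidden_size)
+
+    def forward(self, hidden_states):
+        attn_output, _ = self.attention(hidden_states, hidden_states, hidden_states)
+        return self.layer_norm(hidden_states + attn_output)
+
+
+class PreLayerNormBlock(nn.Module):
+    """Pre-LayerNorm: erst normalisieren, dann Attention und Residual-Addition."""
+
+    def __init__(self, hidden_size=64, num_heads=4):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
+        self.layer_norm = nn.LayerNorm(hidden_size)
+
+    def forward(self, hidden_states):
+        normed = self.layer_norm(hidden_states)
+        attn_output, _ = self.attention(normed, normed, normed)
+        return hidden_states + attn_output
+```
+
+Genau solche scheinbar kleinen Unterschiede sind die Stellen, an denen die Zwischenergebnisse später voneinander abweichen, wenn sie nicht korrekt übernommen werden.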
+
+**Beachten Sie**, dass Sie an diesem Punkt nicht sehr sicher sein müssen, dass Ihr Code völlig korrekt oder sauber ist. Vielmehr ist es
+ratsam, zunächst eine erste *unbereinigte*, kopierte Version des ursprünglichen Codes in
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` hinzuzufügen, bis Sie das Gefühl haben, dass der gesamte notwendige Code
+hinzugefügt wurde. Unserer Erfahrung nach ist es viel effizienter, schnell eine erste Version des erforderlichen Codes hinzuzufügen und
+den Code iterativ mit dem Konvertierungsskript zu verbessern/korrigieren, wie im nächsten Abschnitt beschrieben. Das Einzige, was
+zu diesem Zeitpunkt funktionieren muss, ist, dass Sie die 🤗 Transformers-Implementierung von *brand_new_bert* instanziieren können, *d.h.* der
+folgende Befehl sollte funktionieren:
+
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+
+model = BrandNewBertModel(BrandNewBertConfig())
+```
+
+Der obige Befehl erstellt ein Modell gemäß den Standardparametern, die in `BrandNewBertConfig()` definiert sind, mit
+zufälligen Gewichten und stellt damit sicher, dass die `init()`-Methoden aller Komponenten funktionieren.
+
+Beachten Sie, dass alle zufälligen Initialisierungen in der Methode `_init_weights` Ihrer `BrandnewBertPreTrainedModel`-Klasse
+erfolgen sollten. Sie sollte alle Blattmodule in Abhängigkeit von den Variablen der Konfiguration initialisieren. Hier ist ein Beispiel mit der
+`_init_weights`-Methode von BERT:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+```
+
+Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel müssen in
+`Wav2Vec2ForPreTraining` die letzten beiden linearen Schichten die Initialisierung des regulären PyTorch `nn.Linear` behalten,
+aber alle anderen sollten eine Initialisierung wie oben verwenden. Dies ist wie folgt kodiert:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, Wav2Vec2ForPreTraining):
+        module.project_hid.reset_parameters()
+        module.project_q.reset_parameters()
+        module.project_hid._is_hf_initialized = True
+        module.project_q._is_hf_initialized = True
+    elif isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+```
+
+Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Indem Sie es für
+`module.project_q` und `module.project_hid` auf `True` setzen, stellen Sie sicher, dass die benutzerdefinierte Initialisierung später nicht überschrieben wird,
+d.h. dass die Funktion `_init_weights` nicht erneut auf diese Module angewendet wird.
+
+**6. Schreiben Sie ein Konvertierungsskript**
+
+Als Nächstes sollten Sie ein Konvertierungsskript schreiben, mit dem Sie den Checkpoint, den Sie zum Debuggen von *brand_new_bert*
+im ursprünglichen Repository verwendet haben, in einen Checkpoint konvertieren können, der mit Ihrer gerade erstellten 🤗 Transformers-Implementierung von
+*brand_new_bert* kompatibel ist. Es ist nicht ratsam, das Konvertierungsskript von Grund auf neu zu schreiben; suchen Sie stattdessen in den bereits
+bestehenden Konvertierungsskripten in 🤗 Transformers nach einem Skript, das für die Konvertierung eines ähnlichen Modells verwendet wurde, das im
+selben Framework wie *brand_new_bert* geschrieben wurde. Normalerweise reicht es aus, ein bereits vorhandenes Konvertierungsskript zu kopieren und
+es für Ihren Anwendungsfall leicht anzupassen. Zögern Sie nicht, das Hugging Face Team zu bitten, Sie auf ein ähnliches, bereits vorhandenes
+Konvertierungsskript für Ihr Modell hinzuweisen.
+
+- Wenn Sie ein Modell von TensorFlow nach PyTorch portieren, ist ein guter Ausgangspunkt das Konvertierungsskript von BERT [hier](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
+- Wenn Sie ein Modell von PyTorch nach PyTorch portieren, ist ein guter Ausgangspunkt das Konvertierungsskript von BART [hier](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+
+Im Folgenden werden wir kurz erklären, wie PyTorch-Modelle Ebenengewichte speichern und Ebenennamen definieren. In PyTorch wird der
+Name einer Ebene durch den Namen des Klassenattributs definiert, das Sie der Ebene geben. Definieren wir ein Dummy-Modell in
+PyTorch, das wir `SimpleModel` nennen:
+
+```python
+from torch import nn
+
+
+class SimpleModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.dense = nn.Linear(10, 10)
+        self.intermediate = nn.Linear(10, 10)
+        self.layer_norm = nn.LayerNorm(10)
+```
+
+Jetzt können wir eine Instanz dieser Modelldefinition erstellen, die alle Gewichte `dense`, `intermediate` und
+`layer_norm` mit zufälligen Werten füllt. Wir können das Modell ausdrucken, um seine Architektur zu sehen:
+
+```python
+model = SimpleModel()
+
+print(model)
+```
+
+Dies gibt Folgendes aus:
+
+```
+SimpleModel(
+  (dense): Linear(in_features=10, out_features=10, bias=True)
+  (intermediate): Linear(in_features=10, out_features=10, bias=True)
+  (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
+)
+```
+
+Wir können sehen, dass die Ebenennamen durch den Namen des Klassenattributs in PyTorch definiert sind.
+Sie können die Gewichtswerte einer bestimmten Ebene anzeigen lassen:
+
+```python
+print(model.dense.weight.data)
+```
+
+um zu sehen, dass die Gewichte zufällig initialisiert wurden:
+
+```
+tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
+         -0.2077,  0.2157],
+        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
+          0.2166, -0.0212],
+        [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
+         -0.1023, -0.0447],
+        [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
+         -0.1876, -0.2467],
+        [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
+          0.2577,  0.0402],
+        [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
+          0.2132,  0.1680],
+        [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
+          0.2707, -0.2509],
+        [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
+          0.1829, -0.1568],
+        [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
+          0.0333, -0.0536],
+        [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
+          0.2220,  0.2358]])
+```
+
+Im Konvertierungsskript sollten Sie diese zufällig initialisierten Gewichte mit den exakten Gewichten der
+entsprechenden Ebene im Checkpoint füllen. *Z.B.*:
+
+```python
+# retrieve matching layer weights, e.g. by
+# recursive algorithm
+layer_name = "dense"
+pretrained_weight = array_of_dense_layer
+
+model_pointer = getattr(model, "dense")
+
+model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+```
+
+Dabei müssen Sie sicherstellen, dass jedes zufällig initialisierte Gewicht Ihres PyTorch-Modells und das entsprechende
+Checkpoint-Gewicht in **Form und Name** genau übereinstimmen. Zu diesem Zweck ist es **notwendig**, assert-Anweisungen
+für die Form hinzuzufügen und die Namen der Checkpoint-Gewichte auszugeben. Sie sollten z.B. Anweisungen hinzufügen wie:
+
+```python
+assert (
+    model_pointer.weight.shape == pretrained_weight.shape
+), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+```
+
+Außerdem sollten Sie die Namen der beiden Gewichte ausgeben, um sicherzustellen, dass sie übereinstimmen, *z.B.*:
+
+```python
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+```
+
+Wenn entweder die Form oder der Name nicht übereinstimmt, haben Sie wahrscheinlich das falsche Checkpoint-Gewicht einer zufällig
+initialisierten Ebene der 🤗 Transformers-Implementierung zugewiesen.
+
+Eine falsche Form ist höchstwahrscheinlich auf eine falsche Einstellung der Konfigurationsparameter in `BrandNewBertConfig()` zurückzuführen, die
+nicht genau mit denen übereinstimmen, die für den zu konvertierenden Checkpoint verwendet wurden. Es könnte aber auch sein, dass
+die PyTorch-Implementierung eines Layers erfordert, dass das Gewicht vorher transponiert wird.
+
+Schließlich sollten Sie auch überprüfen, ob **alle** erforderlichen Gewichte initialisiert sind, und alle Checkpoint-Gewichte ausgeben,
+die nicht zur Initialisierung verwendet wurden, um sicherzustellen, dass das Modell korrekt konvertiert wurde. Es ist völlig normal, dass die
+Konvertierungsversuche zunächst entweder an einer falschen Shape-Assertion oder einer falschen Namenszuweisung scheitern. Das liegt höchstwahrscheinlich
+daran, dass Sie falsche Parameter in `BrandNewBertConfig()` verwendet haben, dass die 🤗 Transformers-Implementierung eine falsche Architektur hat,
+dass ein Fehler in den `init()`-Funktionen einer der Komponenten der 🤗 Transformers-Implementierung vorliegt oder dass Sie eines der
+Checkpoint-Gewichte transponieren müssen.
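+
+Zur Veranschaulichung: Der Kern eines solchen Konvertierungsskripts könnte wie in der folgenden, stark vereinfachten und rein
+hypothetischen Skizze aussehen. Sie geht davon aus, dass die Namen im Original-Checkpoint bereits eins zu eins den Parameternamen der
+🤗 Transformers-Implementierung entsprechen; in der Praxis müssen die Namen in der Regel explizit aufeinander abgebildet werden, und der
+Checkpoint-Pfad ist ein Platzhalter:
+
+```python
+import torch
+
+from transformers import BrandNewBertConfig, BrandNewBertModel
+
+# Annahme: der Original-Checkpoint ist ein dict von Parametername -> Tensor
+original_state_dict = torch.load("/path/to/original/checkpoint.bin", map_location="cpu")
+
+model = BrandNewBertModel(BrandNewBertConfig())
+
+unused_weights = set(original_state_dict)
+for name, param in model.named_parameters():
+    assert name in original_state_dict, f"Checkpoint is missing weight {name}"
+    pretrained_weight = original_state_dict[name]
+    assert (
+        param.shape == pretrained_weight.shape
+    ), f"Shape of {name} mismatched: {param.shape} vs. {pretrained_weight.shape}"
+    print(f"Initialize PyTorch weight {name}")
+    param.data = pretrained_weight
+    unused_weights.discard(name)
+
+print("Checkpoint weights that were not used:", sorted(unused_weights))
+```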
+
+Dieser Schritt sollte mit dem vorherigen Schritt wiederholt werden, bis alle Gewichte des Checkpoints korrekt in das
+Transformers-Modell geladen sind. Nachdem Sie den Checkpoint korrekt in die 🤗 Transformers-Implementierung geladen haben, können Sie
+das Modell in einem Ordner Ihrer Wahl `/path/to/converted/checkpoint/folder` speichern, der dann sowohl eine
+Datei `pytorch_model.bin` als auch eine Datei `config.json` enthalten sollte:
+
+```python
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+**7. Implementieren Sie den Vorwärtspass**
+
+Nachdem es Ihnen gelungen ist, die vortrainierten Gewichte korrekt in die 🤗 Transformers-Implementierung zu laden, sollten Sie nun
+sicherstellen, dass der Forward Pass korrekt implementiert ist. In [Machen Sie sich mit dem ursprünglichen Repository vertraut](#34-run-a-pretrained-checkpoint-using-the-original-repository) haben Sie bereits ein Skript erstellt, das einen Forward
+Pass des Modells unter Verwendung des Original-Repositorys durchführt. Jetzt sollten Sie ein analoges Skript schreiben, das die 🤗 Transformers-
+Implementierung anstelle der Originalimplementierung verwendet. Es sollte wie folgt aussehen:
+
+```python
+model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+output = model(input_ids).last_hidden_state
+```
+
+Es ist sehr wahrscheinlich, dass die 🤗 Transformers-Implementierung und die ursprüngliche Modell-Implementierung beim ersten Mal nicht genau
+die gleiche Ausgabe liefern oder dass der Vorwärtsdurchlauf einen Fehler auslöst. Seien Sie nicht enttäuscht - das ist zu erwarten! Zuerst
+sollten Sie sicherstellen, dass der Vorwärtsdurchlauf keine Fehler auslöst. Es passiert oft, dass die falschen Dimensionen verwendet
+werden, was zu einem *Dimensionality mismatch*-Fehler führt, oder dass der falsche Datentyp verwendet wird, *z.B.* `torch.long`
+anstelle von `torch.float32`. Zögern Sie nicht, das Hugging Face Team um Hilfe zu bitten, wenn Sie
+bestimmte Fehler nicht lösen können.
+
+Um sicherzustellen, dass die 🤗 Transformers-Implementierung korrekt funktioniert, müssen Sie sicherstellen, dass die Ausgaben
+bis auf eine Genauigkeit von `1e-3` übereinstimmen. Zunächst sollten Sie sicherstellen, dass die Ausgabeformen identisch sind, *d.h.*
+`outputs.shape` sollte für das Skript der 🤗 Transformers-Implementierung und für die ursprüngliche
+Implementierung denselben Wert ergeben. Als Nächstes sollten Sie sicherstellen, dass auch die Ausgabewerte identisch sind. Dies ist einer der schwierigsten
+Teile des Hinzufügens eines neuen Modells. Häufige Gründe, warum die Ausgaben nicht identisch sind, sind:
+
+- Einige Ebenen wurden nicht hinzugefügt, *d.h.* eine *Aktivierungsschicht* wurde nicht hinzugefügt, oder die Residualverbindung wurde vergessen
+- Die Worteinbettungsmatrix wurde nicht gebunden
+- Es werden die falschen Positionseinbettungen verwendet, da die ursprüngliche Implementierung einen Offset verwendet
+- Dropout wird während des Vorwärtsdurchlaufs angewendet.
+  Um dies zu beheben, stellen Sie sicher, dass *model.training* auf *False* steht und dass keine Dropout-
+  Schicht während des Vorwärtsdurchlaufs fälschlicherweise aktiviert wird, *d.h.* übergeben Sie *self.training* an [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)
+
+Der beste Weg, das Problem zu beheben, besteht normalerweise darin, sich den Vorwärtsdurchlauf der ursprünglichen Implementierung und die 🤗
+Transformers-Implementierung nebeneinander anzusehen und zu prüfen, ob es Unterschiede gibt. Idealerweise sollten Sie die
+Zwischenergebnisse beider Implementierungen des Vorwärtsdurchlaufs debuggen/ausdrucken, um die genaue Position im Netzwerk zu finden, an der die 🤗
+Transformers-Implementierung eine andere Ausgabe zeigt als die ursprüngliche Implementierung. Stellen Sie zunächst sicher, dass die
+hartcodierten `input_ids` in beiden Skripten identisch sind. Überprüfen Sie dann, ob die Ausgaben der ersten Transformation der
+`input_ids` (normalerweise die Worteinbettungen) identisch sind. Und dann arbeiten Sie sich bis zur allerletzten Schicht des
+Netzwerks vor. Irgendwann werden Sie einen Unterschied zwischen den beiden Implementierungen feststellen, der Sie auf den Fehler
+in der 🤗 Transformers-Implementierung hinweist. Unserer Erfahrung nach ist es ein einfacher und effizienter Weg, an den gleichen Stellen im Netzwerk
+sowohl in der Original-Implementierung als auch in der 🤗 Transformers-Implementierung viele Print-Anweisungen hinzuzufügen
+und nacheinander die Print-Anweisungen zu entfernen, die dieselben Werte für die Zwischendarstellungen anzeigen.
+
+Wenn Sie sicher sind, dass beide Implementierungen die gleiche Ausgabe liefern, und Sie die Ausgaben mit
+`torch.allclose(original_output, output, atol=1e-3)` überprüft haben, haben Sie den schwierigsten Teil hinter sich! Herzlichen Glückwunsch - die
+Arbeit, die noch zu erledigen ist, sollte ein Kinderspiel sein 😊.
+
+**8. Hinzufügen aller notwendigen Modelltests**
+
+An diesem Punkt haben Sie erfolgreich ein neues Modell hinzugefügt. Es ist jedoch sehr gut möglich, dass das Modell noch nicht
+vollständig dem erforderlichen Design entspricht. Um sicherzustellen, dass die Implementierung vollständig kompatibel mit 🤗 Transformers ist, sollten alle
+gemeinsamen Tests bestehen. Der Cookiecutter sollte automatisch eine Testdatei für Ihr Modell hinzugefügt haben, wahrscheinlich unter
+`tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Führen Sie diese Testdatei aus, um zu überprüfen, ob alle gängigen
+Tests bestehen:
+
+```bash
+pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
+```
+
+Nachdem alle allgemeinen Tests bestehen, müssen Sie nun sicherstellen, dass all die schöne Arbeit, die Sie geleistet haben, gut getestet ist, damit
+
+- a) die Community Ihre Arbeit leicht nachvollziehen kann, indem sie sich spezifische Tests von *brand_new_bert* ansieht
+- b) zukünftige Änderungen an Ihrem Modell keine wichtigen Funktionen des Modells zerstören.
+
+Als Erstes sollten Sie Integrationstests hinzufügen. Diese Integrationstests tun im Wesentlichen dasselbe wie die Debugging-Skripte,
+die Sie zuvor zur Implementierung des Modells in 🤗 Transformers verwendet haben. Eine Vorlage für diese Modelltests wurde bereits vom
+Cookiecutter hinzugefügt; sie heißt `BrandNewBertModelIntegrationTests` und muss nur noch von Ihnen ausgefüllt werden.
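+
+Ein solcher ausgefüllter Integrationstest könnte - als rein hypothetische Skizze, mit einem Platzhalter-Checkpoint und beispielhaften
+Erwartungswerten (in der Praxis stammen diese aus dem Forward Pass der Original-Implementierung) - etwa so aussehen:
+
+```python
+import unittest
+
+import torch
+
+from transformers import BrandNewBertModel
+from transformers.testing_utils import slow
+
+
+class BrandNewBertModelIntegrationTests(unittest.TestCase):
+    @slow
+    def test_inference_no_head(self):
+        model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+        model.eval()
+
+        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
+        with torch.no_grad():
+            output = model(input_ids).last_hidden_state
+
+        expected_shape = torch.Size((1, 9, model.config.hidden_size))
+        self.assertEqual(output.shape, expected_shape)
+
+        # Erwartungswerte aus der Original-Implementierung (hier nur Beispielzahlen)
+        expected_slice = torch.tensor(
+            [[[-0.1465, -0.6501, 0.1993], [-0.4417, -0.5920, 0.3450], [-0.5009, -0.7122, 0.4548]]]
+        )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+```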
Um sicherzustellen, dass diese +Tests erfolgreich sind, führen Sie + +```bash +RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +``` + + + +Falls Sie Windows verwenden, sollten Sie `RUN_SLOW=1` durch `SET RUN_SLOW=1` ersetzen. + + + +Zweitens sollten alle Funktionen, die speziell für *brand_new_bert* sind, zusätzlich in einem separaten Test getestet werden unter +`BrandNewBertModelTester`/``BrandNewBertModelTest`. Dieser Teil wird oft vergessen, ist aber in zweierlei Hinsicht äußerst nützlich +Weise: + +- Er hilft dabei, das Wissen, das Sie während der Modellerweiterung erworben haben, an die Community weiterzugeben, indem er zeigt, wie die + speziellen Funktionen von *brand_new_bert* funktionieren sollten. +- Künftige Mitwirkende können Änderungen am Modell schnell testen, indem sie diese speziellen Tests ausführen. + + +**9. Implementieren Sie den Tokenizer** + +Als nächstes sollten wir den Tokenizer von *brand_new_bert* hinzufügen. Normalerweise ist der Tokenizer äquivalent oder sehr ähnlich zu einem +bereits vorhandenen Tokenizer von 🤗 Transformers. + +Es ist sehr wichtig, die ursprüngliche Tokenizer-Datei zu finden/extrahieren und es zu schaffen, diese Datei in die 🤗 +Transformers Implementierung des Tokenizers zu laden. + +Um sicherzustellen, dass der Tokenizer korrekt funktioniert, empfiehlt es sich, zunächst ein Skript im ursprünglichen Repository zu erstellen +zu erstellen, das eine Zeichenkette eingibt und die `input_ids` zurückgibt. Es könnte etwa so aussehen (in Pseudocode): + +```python +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = model.tokenize(input_str) +``` + +Möglicherweise müssen Sie noch einmal einen Blick in das ursprüngliche Repository werfen, um die richtige Tokenizer-Funktion zu finden, oder Sie müssen +Sie müssen vielleicht sogar Änderungen an Ihrem Klon des Original-Repositorys vornehmen, um nur die `input_ids` auszugeben. Nach dem Schreiben +ein funktionierendes Tokenisierungsskript geschrieben, das das ursprüngliche Repository verwendet, sollten Sie ein analoges Skript für 🤗 Transformers +erstellt werden. Es sollte ähnlich wie dieses aussehen: + +```python +from transformers import BrandNewBertTokenizer + +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." + +tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") + +input_ids = tokenizer(input_str).input_ids +``` + +Wenn beide `input_ids` die gleichen Werte ergeben, sollte als letzter Schritt auch eine Tokenizer-Testdatei hinzugefügt werden. + +Analog zu den Modellierungstestdateien von *brand_new_bert* sollten auch die Tokenisierungs-Testdateien von *brand_new_bert* +eine Reihe von fest kodierten Integrationstests enthalten. + +**10. Führen Sie End-to-End-Integrationstests aus** + +Nachdem Sie den Tokenizer hinzugefügt haben, sollten Sie auch ein paar End-to-End-Integrationstests, die sowohl das Modell als auch den +Tokenizer zu `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers. +Ein solcher Test sollte bei einem aussagekräftigen +Text-zu-Text-Beispiel zeigen, dass die Implementierung von 🤗 Transformers wie erwartet funktioniert. Ein aussagekräftiges Text-zu-Text-Beispiel kann +z.B. 
*ein Quell-zu-Ziel-Übersetzungspaar, ein Artikel-zu-Zusammenfassung-Paar, ein Frage-zu-Antwort-Paar, usw... Wenn keiner der +der portierten Prüfpunkte in einer nachgelagerten Aufgabe feinabgestimmt wurde, genügt es, sich einfach auf die Modelltests zu verlassen. In einem +letzten Schritt, um sicherzustellen, dass das Modell voll funktionsfähig ist, sollten Sie alle Tests auch auf der GPU durchführen. Es kann +Es kann vorkommen, dass Sie vergessen haben, einige `.to(self.device)` Anweisungen zu internen Tensoren des Modells hinzuzufügen, was in einem solchen +Test zu einem Fehler führen würde. Falls Sie keinen Zugang zu einem Grafikprozessor haben, kann das Hugging Face Team diese Tests für Sie durchführen. +Tests für Sie übernehmen. + +**11. Docstring hinzufügen** + +Nun sind alle notwendigen Funktionen für *brand_new_bert* hinzugefügt - Sie sind fast fertig! Das Einzige, was Sie noch hinzufügen müssen, ist +ein schöner Docstring und eine Doku-Seite. Der Cookiecutter sollte eine Vorlagendatei namens +`docs/source/model_doc/brand_new_bert.md` hinzugefügt haben, die Sie ausfüllen sollten. Die Benutzer Ihres Modells werden in der Regel zuerst einen Blick auf +diese Seite ansehen, bevor sie Ihr Modell verwenden. Daher muss die Dokumentation verständlich und prägnant sein. Es ist sehr nützlich für +die Gemeinschaft, einige *Tipps* hinzuzufügen, um zu zeigen, wie das Modell verwendet werden sollte. Zögern Sie nicht, das Hugging Face-Team anzupingen +bezüglich der Docstrings. + +Stellen Sie als nächstes sicher, dass der zu `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` hinzugefügte docstring +korrekt ist und alle erforderlichen Eingaben und Ausgaben enthält. Wir haben eine ausführliche Anleitung zum Schreiben von Dokumentationen und unserem Docstring-Format [hier](writing-documentation). Es ist immer gut, sich daran zu erinnern, dass die Dokumentation +mindestens so sorgfältig behandelt werden sollte wie der Code in 🤗 Transformers, denn die Dokumentation ist in der Regel der erste Kontaktpunkt der +Berührungspunkt der Community mit dem Modell ist. + +**Code refactor** + +Großartig, jetzt haben Sie den gesamten erforderlichen Code für *brand_new_bert* hinzugefügt. An diesem Punkt sollten Sie einige mögliche +falschen Codestil korrigieren, indem Sie ausführen: + +```bash +make style +``` + +und überprüfen Sie, ob Ihr Kodierungsstil die Qualitätsprüfung besteht: + +```bash +make quality +``` + +Es gibt noch ein paar andere sehr strenge Designtests in 🤗 Transformers, die möglicherweise noch fehlschlagen, was sich in den +den Tests Ihres Pull Requests. Dies liegt oft an fehlenden Informationen im Docstring oder an einer falschen +Benennung. Das Hugging Face Team wird Ihnen sicherlich helfen, wenn Sie hier nicht weiterkommen. + +Und schließlich ist es immer eine gute Idee, den eigenen Code zu refaktorisieren, nachdem man sichergestellt hat, dass er korrekt funktioniert. Wenn alle +Tests bestanden haben, ist es nun an der Zeit, den hinzugefügten Code noch einmal durchzugehen und einige Überarbeitungen vorzunehmen. + +Sie haben nun den Codierungsteil abgeschlossen, herzlichen Glückwunsch! 🎉 Sie sind großartig! 😎 + +**12. Laden Sie die Modelle in den Model Hub hoch** + +In diesem letzten Teil sollten Sie alle Checkpoints konvertieren und in den Modell-Hub hochladen und eine Modellkarte für jeden +hochgeladenen Modell-Kontrollpunkt. Sie können sich mit den Hub-Funktionen vertraut machen, indem Sie unsere [Model sharing and uploading Page](model_sharing) lesen. 
Hier sollten Sie mit dem Hugging Face-Team zusammenarbeiten, um einen passenden Namen für jeden +Checkpoint festzulegen und die erforderlichen Zugriffsrechte zu erhalten, um das Modell unter der Organisation des Autors *brand_new_bert* hochladen zu können. +*brand_new_bert*. Die Methode `push_to_hub`, die in allen Modellen in `transformers` vorhanden ist, ist ein schneller und effizienter Weg, Ihren Checkpoint in den Hub zu pushen. Ein kleines Snippet ist unten eingefügt: + +```python +brand_new_bert.push_to_hub("brand_new_bert") +# Uncomment the following line to push to an organization. +# brand_new_bert.push_to_hub("/brand_new_bert") +``` + +Es lohnt sich, etwas Zeit darauf zu verwenden, für jeden Kontrollpunkt passende Musterkarten zu erstellen. Die Modellkarten sollten die +spezifischen Merkmale dieses bestimmten Prüfpunkts hervorheben, * z.B.* auf welchem Datensatz wurde der Prüfpunkt +vortrainiert/abgestimmt? Für welche nachgelagerte Aufgabe sollte das Modell verwendet werden? Und fügen Sie auch etwas Code bei, wie Sie +wie das Modell korrekt verwendet wird. + +**13. (Optional) Notizbuch hinzufügen** + +Es ist sehr hilfreich, ein Notizbuch hinzuzufügen, in dem im Detail gezeigt wird, wie *brand_new_bert* für Schlussfolgerungen verwendet werden kann und/oder +bei einer nachgelagerten Aufgabe feinabgestimmt wird. Dies ist nicht zwingend erforderlich, um Ihren PR zusammenzuführen, aber sehr nützlich für die Gemeinschaft. + +**14. Reichen Sie Ihren fertigen PR ein** + +Sie sind jetzt mit der Programmierung fertig und können zum letzten Schritt übergehen, nämlich der Zusammenführung Ihres PR mit main. Normalerweise hat das +Hugging Face Team Ihnen an diesem Punkt bereits geholfen haben, aber es lohnt sich, sich etwas Zeit zu nehmen, um Ihrem fertigen +PR eine schöne Beschreibung zu geben und eventuell Kommentare zu Ihrem Code hinzuzufügen, wenn Sie Ihren Gutachter auf bestimmte Designentscheidungen hinweisen wollen. +Gutachter hinweisen wollen. + +### Teilen Sie Ihre Arbeit!! + +Jetzt ist es an der Zeit, von der Community Anerkennung für Ihre Arbeit zu bekommen! Die Fertigstellung einer Modellergänzung ist ein wichtiger +Beitrag zu Transformers und der gesamten NLP-Gemeinschaft. Ihr Code und die portierten vortrainierten Modelle werden sicherlich +von Hunderten und vielleicht sogar Tausenden von Entwicklern und Forschern genutzt werden. Sie sollten stolz auf Ihre Arbeit sein und Ihre +Ihre Leistung mit der Gemeinschaft teilen. + +**Sie haben ein weiteres Modell erstellt, das für jeden in der Community super einfach zugänglich ist! 🤯** diff --git a/docs/source/de/add_new_pipeline.md b/docs/source/de/add_new_pipeline.md new file mode 100644 index 000000000000..7615ac7bfd59 --- /dev/null +++ b/docs/source/de/add_new_pipeline.md @@ -0,0 +1,258 @@ + + +# Wie erstellt man eine benutzerdefinierte Pipeline? + +In dieser Anleitung sehen wir uns an, wie Sie eine benutzerdefinierte Pipeline erstellen und sie auf dem [Hub](hf.co/models) freigeben oder sie der +🤗 Transformers-Bibliothek hinzufügen. + +Zuallererst müssen Sie entscheiden, welche Roheingaben die Pipeline verarbeiten kann. Es kann sich um Strings, rohe Bytes, +Dictionaries oder was auch immer die wahrscheinlichste gewünschte Eingabe ist. Versuchen Sie, diese Eingaben so rein wie möglich in Python zu halten +denn das macht die Kompatibilität einfacher (auch mit anderen Sprachen über JSON). Dies werden die Eingaben der +Pipeline (`Vorverarbeitung`). + +Definieren Sie dann die `Outputs`. Dieselbe Richtlinie wie für die Eingänge. 
Je einfacher, desto besser. Dies werden die Ausgaben der Methode `postprocess` sein.
+
+Beginnen Sie damit, von der Basisklasse `Pipeline` zu erben und die 4 Methoden zu implementieren, die dafür benötigt werden:
+`preprocess`, `_forward`, `postprocess` und `_sanitize_parameters`.
+
+
+```python
+from transformers import Pipeline
+
+
+class MyPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "maybe_arg" in kwargs:
+            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, inputs, maybe_arg=2):
+        model_input = Tensor(inputs["input_ids"])
+        return {"model_input": model_input}
+
+    def _forward(self, model_inputs):
+        # model_inputs == {"model_input": model_input}
+        outputs = self.model(**model_inputs)
+        # Maybe {"logits": Tensor(...)}
+        return outputs
+
+    def postprocess(self, model_outputs):
+        best_class = model_outputs["logits"].softmax(-1)
+        return best_class
+```
+
+Die Struktur dieser Aufteilung soll eine relativ nahtlose Unterstützung für CPU/GPU ermöglichen und es gleichzeitig erlauben, die
+Vor-/Nachbearbeitung auf der CPU in verschiedenen Threads durchzuführen.
+
+`preprocess` nimmt die ursprünglich definierten Eingaben und wandelt sie in etwas um, das in das Modell eingespeist werden kann. Es kann
+mehr Informationen enthalten und ist normalerweise ein `Dict`.
+
+`_forward` ist das Implementierungsdetail und ist nicht dafür gedacht, direkt aufgerufen zu werden. `forward` ist die bevorzugte
+aufzurufende Methode, da sie Sicherheitsvorkehrungen enthält, die sicherstellen, dass alles auf dem erwarteten Gerät funktioniert. Wenn etwas
+mit einem realen Modell verknüpft ist, gehört es in die Methode `_forward`, alles andere gehört in die Methoden `preprocess`/`postprocess`.
+
+Die Methode `postprocess` nimmt die Ausgabe von `_forward` und verwandelt sie in die endgültige Ausgabe, die zuvor festgelegt wurde.
+
+Die Methode `_sanitize_parameters` ermöglicht es dem Benutzer, beliebige Parameter zu übergeben, wann immer er möchte, sei es zur Initialisierungszeit
+`pipeline(...., maybe_arg=4)` oder zur Aufrufzeit `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.
+
+Die Rückgabe von `_sanitize_parameters` sind die 3 Dicts von kwargs, die direkt an `preprocess`,
+`_forward` und `postprocess` übergeben werden. Füllen Sie nichts aus, wenn der Aufrufer keinen zusätzlichen Parameter angegeben hat. Das
+erlaubt es, die Standardargumente in der Funktionsdefinition beizubehalten, was immer "natürlicher" ist.
+
+Ein klassisches Beispiel wäre das Argument `top_k` in der Nachbearbeitung bei Klassifizierungsaufgaben.
+
+```python
+>>> pipe = pipeline("my-new-task")
+>>> pipe("This is a test")
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
+{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+
+>>> pipe("This is a test", top_k=2)
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+```
+
+Um dies zu erreichen, aktualisieren wir unsere Methode `postprocess` mit einem Standardwert von `5` und passen
+`_sanitize_parameters` so an, dass dieser neue Parameter erlaubt ist.
+ + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Versuchen Sie, die Eingaben/Ausgaben sehr einfach und idealerweise JSON-serialisierbar zu halten, da dies die Verwendung der Pipeline sehr einfach macht +ohne dass die Benutzer neue Arten von Objekten verstehen müssen. Es ist auch relativ üblich, viele verschiedene Arten von Argumenten zu unterstützen +von Argumenten zu unterstützen (Audiodateien, die Dateinamen, URLs oder reine Bytes sein können). + + + +## Hinzufügen zur Liste der unterstützten Aufgaben + +Um Ihre `neue Aufgabe` in die Liste der unterstützten Aufgaben aufzunehmen, müssen Sie sie zur `PIPELINE_REGISTRY` hinzufügen: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +Wenn Sie möchten, können Sie ein Standardmodell angeben. In diesem Fall sollte es mit einer bestimmten Revision (die der Name einer Verzweigung oder ein Commit-Hash sein kann, hier haben wir `"abcdef"` genommen) sowie mit dem Typ versehen sein: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +``` + +## Teilen Sie Ihre Pipeline auf dem Hub + +Um Ihre benutzerdefinierte Pipeline auf dem Hub freizugeben, müssen Sie lediglich den benutzerdefinierten Code Ihrer `Pipeline`-Unterklasse in einer +Python-Datei speichern. Nehmen wir zum Beispiel an, Sie möchten eine benutzerdefinierte Pipeline für die Klassifizierung von Satzpaaren wie folgt verwenden: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +Die Implementierung ist Framework-unabhängig und funktioniert für PyTorch- und TensorFlow-Modelle. 
Wenn wir dies in einer Datei +einer Datei namens `pair_classification.py` gespeichert haben, können wir sie importieren und wie folgt registrieren: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Sobald dies geschehen ist, können wir es mit einem vortrainierten Modell verwenden. Zum Beispiel wurde `sgugger/finetuned-bert-mrpc` auf den +auf den MRPC-Datensatz abgestimmt, der Satzpaare als Paraphrasen oder nicht klassifiziert. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Dann können wir sie auf dem Hub mit der Methode `save_pretrained` in einem `Repository` freigeben: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +Dadurch wird die Datei, in der Sie `PairClassificationPipeline` definiert haben, in den Ordner `"test-dynamic-pipeline"` kopiert, +und speichert das Modell und den Tokenizer der Pipeline, bevor Sie alles in das Repository verschieben +`{Ihr_Benutzername}/test-dynamic-pipeline`. Danach kann jeder die Pipeline verwenden, solange er die Option +`trust_remote_code=True` angeben: + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Hinzufügen der Pipeline zu 🤗 Transformers + +Wenn Sie Ihre Pipeline zu 🤗 Transformers beitragen möchten, müssen Sie ein neues Modul im Untermodul `pipelines` hinzufügen +mit dem Code Ihrer Pipeline hinzufügen. Fügen Sie es dann der Liste der in `pipelines/__init__.py` definierten Aufgaben hinzu. + +Dann müssen Sie noch Tests hinzufügen. Erstellen Sie eine neue Datei `tests/test_pipelines_MY_PIPELINE.py` mit Beispielen für die anderen Tests. + +Die Funktion `run_pipeline_test` ist sehr allgemein gehalten und läuft auf kleinen Zufallsmodellen auf jeder möglichen +Architektur, wie durch `model_mapping` und `tf_model_mapping` definiert. + +Dies ist sehr wichtig, um die zukünftige Kompatibilität zu testen, d.h. wenn jemand ein neues Modell für +`XXXForQuestionAnswering` hinzufügt, wird der Pipeline-Test versuchen, mit diesem Modell zu arbeiten. Da die Modelle zufällig sind, ist es +ist es unmöglich, die tatsächlichen Werte zu überprüfen. Deshalb gibt es eine Hilfsfunktion `ANY`, die einfach versucht, die +Ausgabe der Pipeline TYPE. + +Außerdem *müssen* Sie 2 (idealerweise 4) Tests implementieren. + +- test_small_model_pt` : Definieren Sie 1 kleines Modell für diese Pipeline (es spielt keine Rolle, ob die Ergebnisse keinen Sinn ergeben) + und testen Sie die Ausgaben der Pipeline. Die Ergebnisse sollten die gleichen sein wie bei `test_small_model_tf`. +- test_small_model_tf : Definieren Sie 1 kleines Modell für diese Pipeline (es spielt keine Rolle, ob die Ergebnisse keinen Sinn ergeben) + und testen Sie die Ausgaben der Pipeline. Die Ergebnisse sollten die gleichen sein wie bei `test_small_model_pt`. 
+- test_large_model_pt` (`optional`): Testet die Pipeline an einer echten Pipeline, bei der die Ergebnisse + Sinn machen. Diese Tests sind langsam und sollten als solche gekennzeichnet werden. Hier geht es darum, die Pipeline zu präsentieren und sicherzustellen + sicherzustellen, dass es in zukünftigen Versionen keine Abweichungen gibt. +- test_large_model_tf` (`optional`): Testet die Pipeline an einer echten Pipeline, bei der die Ergebnisse + Sinn machen. Diese Tests sind langsam und sollten als solche gekennzeichnet werden. Hier geht es darum, die Pipeline zu präsentieren und sicherzustellen + sicherzustellen, dass es in zukünftigen Versionen keine Abweichungen gibt. diff --git a/docs/source/de/add_tensorflow_model.md b/docs/source/de/add_tensorflow_model.md new file mode 100644 index 000000000000..cc640aeb5e64 --- /dev/null +++ b/docs/source/de/add_tensorflow_model.md @@ -0,0 +1,356 @@ + + +# Wie konvertiert man ein 🤗 Transformers-Modell in TensorFlow? + +Die Tatsache, dass mehrere Frameworks für die Verwendung mit 🤗 Transformers zur Verfügung stehen, gibt Ihnen die Flexibilität, deren Stärken beim Entwurf Ihrer Anwendung auszuspielen. +Ihre Anwendung zu entwerfen, aber das bedeutet auch, dass die Kompatibilität für jedes Modell einzeln hinzugefügt werden muss. Die gute Nachricht ist, dass +das Hinzufügen von TensorFlow-Kompatibilität zu einem bestehenden Modell einfacher ist als [das Hinzufügen eines neuen Modells von Grund auf](add_new_model)! +Ob Sie ein tieferes Verständnis für große TensorFlow-Modelle haben möchten, einen wichtigen Open-Source-Beitrag leisten oder +TensorFlow für das Modell Ihrer Wahl aktivieren wollen, dieser Leitfaden ist für Sie. + +Dieser Leitfaden befähigt Sie, ein Mitglied unserer Gemeinschaft, TensorFlow-Modellgewichte und/oder +Architekturen beizusteuern, die in 🤗 Transformers verwendet werden sollen, und zwar mit minimaler Betreuung durch das Hugging Face Team. Das Schreiben eines neuen Modells +ist keine Kleinigkeit, aber ich hoffe, dass dieser Leitfaden dazu beiträgt, dass es weniger eine Achterbahnfahrt 🎢 und mehr ein Spaziergang im Park 🚶 ist. +Die Nutzung unserer kollektiven Erfahrungen ist absolut entscheidend, um diesen Prozess immer einfacher zu machen, und deshalb möchten wir +ermutigen Sie daher, Verbesserungsvorschläge für diesen Leitfaden zu machen! + +Bevor Sie tiefer eintauchen, empfehlen wir Ihnen, die folgenden Ressourcen zu lesen, wenn Sie neu in 🤗 Transformers sind: +- [Allgemeiner Überblick über 🤗 Transformers](add_new_model#general-overview-of-transformers) +- [Die TensorFlow-Philosophie von Hugging Face](https://huggingface.co/blog/tensorflow-philosophy) + +Im Rest dieses Leitfadens werden Sie lernen, was nötig ist, um eine neue TensorFlow Modellarchitektur hinzuzufügen, die +Verfahren zur Konvertierung von PyTorch in TensorFlow-Modellgewichte und wie Sie Unstimmigkeiten zwischen ML +Frameworks. Legen Sie los! + + + +Sind Sie unsicher, ob das Modell, das Sie verwenden möchten, bereits eine entsprechende TensorFlow-Architektur hat? + +  + +Überprüfen Sie das Feld `model_type` in der `config.json` des Modells Ihrer Wahl +([Beispiel](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). Wenn der entsprechende Modellordner in +🤗 Transformers eine Datei hat, deren Name mit "modeling_tf" beginnt, bedeutet dies, dass es eine entsprechende TensorFlow +Architektur hat ([Beispiel](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). 
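+
+Eine schnelle Möglichkeit, dies programmatisch zu überprüfen, zeigt die folgende kleine Skizze (sie setzt eine installierte
+`transformers`-Version mit TensorFlow-Unterstützung sowie Internetzugang voraus):
+
+```python
+from transformers import AutoConfig, TFAutoModel
+
+config = AutoConfig.from_pretrained("bert-base-uncased")
+print(config.model_type)  # -> "bert"
+
+# Schlägt das Laden hier fehl, existiert für diese Architektur vermutlich noch keine
+# TensorFlow-Implementierung bzw. es fehlen die TensorFlow-Gewichte.
+tf_model = TFAutoModel.from_pretrained("bert-base-uncased")
+print(type(tf_model).__name__)  # -> "TFBertModel"
+```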
+ + + + +## Schritt-für-Schritt-Anleitung zum Hinzufügen von TensorFlow-Modellarchitektur-Code + +Es gibt viele Möglichkeiten, eine große Modellarchitektur zu entwerfen, und viele Möglichkeiten, diesen Entwurf zu implementieren. Wie auch immer, +Sie erinnern sich vielleicht an unseren [allgemeinen Überblick über 🤗 Transformers](add_new_model#general-overview-of-transformers) +wissen, dass wir ein meinungsfreudiger Haufen sind - die Benutzerfreundlichkeit von 🤗 Transformers hängt von konsistenten Designentscheidungen ab. Aus +Erfahrung können wir Ihnen ein paar wichtige Dinge über das Hinzufügen von TensorFlow-Modellen sagen: + +- Erfinden Sie das Rad nicht neu! In den meisten Fällen gibt es mindestens zwei Referenzimplementierungen, die Sie überprüfen sollten: das +PyTorch-Äquivalent des Modells, das Sie implementieren, und andere TensorFlow-Modelle für dieselbe Klasse von Problemen. +- Gute Modellimplementierungen überleben den Test der Zeit. Dies geschieht nicht, weil der Code hübsch ist, sondern eher +sondern weil der Code klar, einfach zu debuggen und darauf aufzubauen ist. Wenn Sie den Maintainern das Leben mit Ihrer +TensorFlow-Implementierung leicht machen, indem Sie die gleichen Muster wie in anderen TensorFlow-Modellen nachbilden und die Abweichung +zur PyTorch-Implementierung minimieren, stellen Sie sicher, dass Ihr Beitrag lange Bestand haben wird. +- Bitten Sie um Hilfe, wenn Sie nicht weiterkommen! Das 🤗 Transformers-Team ist da, um zu helfen, und wir haben wahrscheinlich Lösungen für die gleichen +Probleme gefunden, vor denen Sie stehen. + +Hier finden Sie einen Überblick über die Schritte, die zum Hinzufügen einer TensorFlow-Modellarchitektur erforderlich sind: +1. Wählen Sie das Modell, das Sie konvertieren möchten +2. Bereiten Sie die Transformers-Entwicklungsumgebung vor. +3. (Optional) Verstehen Sie die theoretischen Aspekte und die bestehende Implementierung +4. Implementieren Sie die Modellarchitektur +5. Implementieren Sie Modelltests +6. Reichen Sie den Pull-Antrag ein +7. (Optional) Erstellen Sie Demos und teilen Sie diese mit der Welt + +### 1.-3. Bereiten Sie Ihren Modellbeitrag vor + +**1. Wählen Sie das Modell, das Sie konvertieren möchten** + +Beginnen wir mit den Grundlagen: Als erstes müssen Sie die Architektur kennen, die Sie konvertieren möchten. Wenn Sie +Sie sich nicht auf eine bestimmte Architektur festgelegt haben, ist es eine gute Möglichkeit, das 🤗 Transformers-Team um Vorschläge zu bitten. +Wir werden Sie zu den wichtigsten Architekturen führen, die auf der TensorFlow-Seite noch fehlen. +Seite fehlen. Wenn das spezifische Modell, das Sie mit TensorFlow verwenden möchten, bereits eine Implementierung der TensorFlow-Architektur in +🤗 Transformers, aber es fehlen Gewichte, können Sie direkt in den +Abschnitt [Gewichtskonvertierung](#adding-tensorflow-weights-to-hub) +auf dieser Seite. + +Der Einfachheit halber wird im Rest dieser Anleitung davon ausgegangen, dass Sie sich entschieden haben, mit der TensorFlow-Version von +*BrandNewBert* (dasselbe Beispiel wie in der [Anleitung](add_new_model), um ein neues Modell von Grund auf hinzuzufügen). + + + +Bevor Sie mit der Arbeit an einer TensorFlow-Modellarchitektur beginnen, sollten Sie sich vergewissern, dass es keine laufenden Bemühungen in dieser Richtung gibt. +Sie können nach `BrandNewBert` auf der +[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr), um zu bestätigen, dass es keine +TensorFlow-bezogene Pull-Anfrage gibt. + + + + +**2. 
Transformers-Entwicklungsumgebung vorbereiten** + +Nachdem Sie die Modellarchitektur ausgewählt haben, öffnen Sie einen PR-Entwurf, um Ihre Absicht zu signalisieren, daran zu arbeiten. Folgen Sie den +Anweisungen, um Ihre Umgebung einzurichten und einen PR-Entwurf zu öffnen. + +1. Forken Sie das [repository](https://github.com/huggingface/transformers), indem Sie auf der Seite des Repositorys auf die Schaltfläche 'Fork' klicken. + Dadurch wird eine Kopie des Codes unter Ihrem GitHub-Benutzerkonto erstellt. + +2. Klonen Sie Ihren `transformers` Fork auf Ihre lokale Festplatte und fügen Sie das Basis-Repository als Remote hinzu: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Richten Sie eine Entwicklungsumgebung ein, indem Sie z.B. den folgenden Befehl ausführen: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Abhängig von Ihrem Betriebssystem und da die Anzahl der optionalen Abhängigkeiten von Transformers wächst, kann es sein, dass Sie bei diesem Befehl einen +Fehler erhalten. Wenn das der Fall ist, stellen Sie sicher, dass Sie TensorFlow installieren, und führen Sie dann aus: + +```bash +pip install -e ".[quality]" +``` + +**Hinweis:** Sie müssen CUDA nicht installiert haben. Es reicht aus, das neue Modell auf der CPU laufen zu lassen. + +4. Erstellen Sie ausgehend von Ihrem Hauptbranch einen Branch mit einem beschreibenden Namen + +```bash +git checkout -b add_tf_brand_new_bert +``` + +5. Holen Sie sich den aktuellen main-Branch und rebasen Sie Ihren Branch darauf + +```bash +git fetch upstream +git rebase upstream/main +``` + +6. Fügen Sie eine leere `.py` Datei in `transformers/src/models/brandnewbert/` mit dem Namen `modeling_tf_brandnewbert.py` hinzu. Dies wird +Ihre TensorFlow-Modelldatei sein. + +7. Übertragen Sie die Änderungen auf Ihr Konto mit: + +```bash +git add . +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert +``` + +8. Wenn Sie zufrieden sind, gehen Sie auf die Webseite Ihres Forks auf GitHub. Klicken Sie auf "Pull request". Stellen Sie sicher, dass Sie die + GitHub-Handles einiger Mitglieder des Hugging Face-Teams als Reviewer hinzufügen, damit das Hugging Face-Team über + zukünftige Änderungen benachrichtigt wird. + +9. Ändern Sie den PR in einen Entwurf, indem Sie auf der rechten Seite der GitHub-Pull-Request-Webseite auf "In Entwurf umwandeln" klicken. + + +Jetzt haben Sie eine Entwicklungsumgebung eingerichtet, um *BrandNewBert* nach TensorFlow in 🤗 Transformers zu portieren. + + +**3. (Optional) Verstehen Sie die theoretischen Aspekte und die bestehende Implementierung** + +Sie sollten sich etwas Zeit nehmen, um die Arbeit zu *BrandNewBert* zu lesen, falls eine solche Beschreibung existiert. Möglicherweise gibt es große +Abschnitte des Papiers, die schwer zu verstehen sind. Wenn das der Fall ist, ist das in Ordnung - machen Sie sich keine Sorgen! Das Ziel ist +es nicht, ein tiefes theoretisches Verständnis des Papiers zu erlangen, sondern die notwendigen Informationen zu extrahieren, um +das Modell mit Hilfe von TensorFlow effektiv in 🤗 Transformers neu zu implementieren. Das heißt, Sie müssen nicht zu +viel Zeit auf die theoretischen Aspekte verwenden, sondern sich lieber auf die praktischen Aspekte konzentrieren, nämlich auf die bestehende Modelldokumentationsseite +(z.B. 
[model docs for BERT](model_doc/bert)). + +Nachdem Sie die Grundlagen des Modells, das Sie implementieren wollen, verstanden haben, ist es wichtig, die bestehende +Implementierung zu verstehen. Dies ist eine gute Gelegenheit, sich zu vergewissern, dass eine funktionierende Implementierung Ihren Erwartungen an das +Modell entspricht, und um technische Herausforderungen auf der TensorFlow-Seite vorauszusehen. + +Es ist ganz natürlich, dass Sie sich von der Menge an Informationen, die Sie gerade aufgesogen haben, überwältigt fühlen. Es +ist definitiv nicht erforderlich, dass Sie in dieser Phase alle Facetten des Modells verstehen. Dennoch ermutigen wir Sie dringend, +alle dringenden Fragen in unserem [Forum](https://discuss.huggingface.co/) zu klären. + + +### 4. Implementierung des Modells + +Jetzt ist es an der Zeit, endlich mit dem Programmieren zu beginnen. Als Ausgangspunkt empfehlen wir die PyTorch-Datei selbst: Kopieren Sie den Inhalt von +`modeling_brand_new_bert.py` in `src/transformers/models/brand_new_bert/` nach +`modeling_tf_brand_new_bert.py`. Das Ziel dieses Abschnitts ist es, die Datei zu ändern und die Importstruktur von +🤗 Transformers zu aktualisieren, so dass Sie `TFBrandNewBert` importieren können und +`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` erfolgreich ein funktionierendes TensorFlow-*BrandNewBert*-Modell lädt. + +Leider gibt es kein Rezept, um ein PyTorch-Modell in TensorFlow zu konvertieren. Sie können jedoch unsere Auswahl an +Tipps befolgen, um den Prozess so reibungslos wie möglich zu gestalten: +- Stellen Sie `TF` dem Namen aller Klassen voran (z.B. wird `BrandNewBert` zu `TFBrandNewBert`). +- Die meisten PyTorch-Operationen haben einen direkten TensorFlow-Ersatz. Zum Beispiel entspricht `torch.nn.Linear` der Klasse + `tf.keras.layers.Dense`, `torch.nn.Dropout` entspricht `tf.keras.layers.Dropout`, usw. Wenn Sie sich bei + einer bestimmten Operation nicht sicher sind, können Sie die [TensorFlow-Dokumentation](https://www.tensorflow.org/api_docs/python/tf) + oder die [PyTorch-Dokumentation](https://pytorch.org/docs/stable/) zu Rate ziehen. +- Suchen Sie nach Mustern in der Codebasis von 🤗 Transformers. Wenn Sie auf eine bestimmte Operation stoßen, für die es keinen direkten + Ersatz gibt, stehen die Chancen gut, dass jemand anderes bereits das gleiche Problem hatte. +- Behalten Sie standardmäßig die gleichen Variablennamen und die gleiche Struktur wie in PyTorch bei. Dies erleichtert die Fehlersuche, das Nachverfolgen von + Problemen und das Einarbeiten späterer Korrekturen. +- Einige Ebenen haben in jedem Framework unterschiedliche Standardwerte. Ein bemerkenswertes Beispiel ist das Epsilon der Batch-Normalisierungsschicht + (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) + und `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)). + Prüfen Sie die Dokumentation genau! +- Die `nn.Parameter`-Variablen von PyTorch müssen in der Regel innerhalb der `build()`-Methode der TF-Layer initialisiert werden. 
Siehe das folgende + Beispiel: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / + [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) +- Wenn das PyTorch-Modell ein `#copied from ...` am Anfang einer Funktion hat, stehen die Chancen gut, dass Ihr TensorFlow-Modell + diese Funktion ebenfalls von der Architektur ausleihen kann, von der sie kopiert wurde, vorausgesetzt, diese hat eine TensorFlow-Architektur. +- Die korrekte Zuweisung des Attributs `name` in TensorFlow-Funktionen ist entscheidend für das Cross-Loading der Gewichte + mit `from_pt=True`. `name` ist fast immer der Name der entsprechenden Variablen im PyTorch-Code. Wenn `name` + nicht richtig gesetzt ist, sehen Sie dies in der Fehlermeldung beim Laden der Modellgewichte. +- Die Logik der Basismodellklasse, `BrandNewBertModel`, befindet sich in `TFBrandNewBertMainLayer`, einer Keras- + Schicht-Unterklasse ([Beispiel](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). + `TFBrandNewBertModel` ist lediglich ein Wrapper für diese Schicht. +- Keras-Modelle müssen gebaut werden, bevor die vortrainierten Gewichte geladen werden können. Aus diesem Grund muss `TFBrandNewBertPreTrainedModel` + ein Beispiel für die Eingaben in das Modell enthalten, die `dummy_inputs` + ([Beispiel](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)). +- Wenn Sie nicht weiterkommen, fragen Sie nach Hilfe - wir sind für Sie da! 🤗 + +Neben der Modelldatei selbst müssen Sie auch die Verweise auf die Modellklassen und die zugehörigen +Dokumentationsseiten hinzufügen. Sie können diesen Teil ganz nach den Mustern in anderen PRs erledigen +([Beispiel](https://github.com/huggingface/transformers/pull/18020/files)). Hier ist eine Liste der erforderlichen manuellen +Änderungen: +- Fügen Sie alle öffentlichen Klassen von *BrandNewBert* in `src/transformers/__init__.py` ein. +- Fügen Sie die *BrandNewBert*-Klassen zu den entsprechenden Auto-Klassen in `src/transformers/models/auto/modeling_tf_auto.py` hinzu. +- Fügen Sie die zu *BrandNewBert* gehörigen Klassen für träges Laden in `src/transformers/utils/dummy_tf_objects.py` hinzu. +- Aktualisieren Sie die Importstrukturen für die öffentlichen Klassen in `src/transformers/models/brand_new_bert/__init__.py`. +- Fügen Sie die Dokumentationszeiger auf die öffentlichen Methoden von *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md` hinzu. +- Fügen Sie sich selbst zur Liste der Mitwirkenden an *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md` hinzu. +- Fügen Sie schließlich ein grünes Häkchen ✅ in der TensorFlow-Spalte von *BrandNewBert* in `docs/source/de/index.md` hinzu. + +Wenn Sie mit Ihrer Implementierung zufrieden sind, führen Sie die folgende Checkliste aus, um zu bestätigen, dass Ihre Modellarchitektur +fertig ist: +1. Alle Schichten, die sich zur Trainingszeit anders verhalten (z.B. Dropout), werden mit einem `training`-Argument aufgerufen, das +von den Top-Level-Klassen weitergegeben wird +2. Sie haben `#copied from ...` verwendet, wann immer es möglich war. +3. 
Die Klasse `TFBrandNewBertMainLayer` und alle Klassen, die sie verwenden, haben ihre `call`-Funktion mit `@unpack_inputs` dekoriert +4. `TFBrandNewBertMainLayer` ist mit `@keras_serializable` dekoriert +5. Ein TensorFlow-Modell kann aus PyTorch-Gewichten mit `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` geladen werden. +6. Sie können das TensorFlow-Modell mit dem erwarteten Eingabeformat aufrufen + + +### 5. Modell-Tests hinzufügen + +Hurra, Sie haben ein TensorFlow-Modell implementiert! Jetzt ist es an der Zeit, Tests hinzuzufügen, um sicherzustellen, dass sich Ihr Modell +wie erwartet verhält. Wie im vorigen Abschnitt schlagen wir vor, zunächst die Datei `test_modeling_brand_new_bert.py` in +`tests/models/brand_new_bert/` in die Datei `test_modeling_tf_brand_new_bert.py` zu kopieren und dann die notwendigen +TensorFlow-Ersetzungen vorzunehmen. Für den Moment sollten Sie in allen Aufrufen von `.from_pretrained()` das Flag `from_pt=True` verwenden, um +die vorhandenen PyTorch-Gewichte zu laden. + +Wenn Sie damit fertig sind, kommt der Moment der Wahrheit: Führen Sie die Tests durch! 😬 + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +Das wahrscheinlichste Ergebnis ist, dass Sie eine Reihe von Fehlern sehen werden. Machen Sie sich keine Sorgen, das ist zu erwarten! Das Debuggen von ML-Modellen ist +notorisch schwierig, und der Schlüssel zum Erfolg ist Geduld (und `breakpoint()`). Nach unserer Erfahrung entstehen die schwierigsten +Probleme aus subtilen Unstimmigkeiten zwischen den ML-Frameworks, zu denen wir am Ende dieses Leitfadens ein paar Hinweise geben. +In anderen Fällen kann es sein, dass ein allgemeiner Test nicht direkt auf Ihr Modell anwendbar ist; in diesem Fall empfehlen wir eine Überschreibung +auf der Ebene der Modelltestklasse. Zögern Sie nicht, in Ihrem Entwurf einer Pull-Anfrage um Hilfe zu bitten, wenn +Sie nicht weiterkommen. + +Wenn alle Tests erfolgreich waren, können Sie Ihr Modell in die 🤗 Transformers-Bibliothek aufnehmen! 🎉 + +### 6.-7. Stellen Sie sicher, dass jeder Ihr Modell verwenden kann + +**6. Reichen Sie den Pull Request ein** + +Sobald Sie mit der Implementierung und den Tests fertig sind, ist es an der Zeit, eine Pull-Anfrage einzureichen. Bevor Sie Ihren Code einreichen, +führen Sie unser Dienstprogramm zur Codeformatierung, `make fixup` 🪄, aus. Damit werden automatisch alle Formatierungsfehler behoben, die dazu führen würden, dass +unsere automatischen Prüfungen fehlschlagen. + +Nun ist es an der Zeit, Ihren Entwurf einer Pull-Anfrage in eine echte Pull-Anfrage umzuwandeln. Klicken Sie dazu auf die Schaltfläche "Bereit für +Review" und fügen Sie Joao (`@gante`) und Matt (`@Rocketknight1`) als Reviewer hinzu. Eine Modell-Pull-Anfrage benötigt +mindestens 3 Reviewer, aber sie werden sich darum kümmern, geeignete zusätzliche Reviewer für Ihr Modell zu finden. + +Nachdem alle Gutachter mit dem Stand Ihres PR zufrieden sind, entfernen Sie als letzten Aktionspunkt das Flag `from_pt=True` in den +`.from_pretrained()`-Aufrufen. Da es noch keine TensorFlow-Gewichte gibt, müssen Sie sie hinzufügen! Lesen Sie den Abschnitt +unten, um zu erfahren, wie Sie dies tun können. 
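+Zur Verdeutlichung eine kleine Skizze dieses letzten Schritts (die Klasse `TFBrandNewBert` und das Repository `model_repo` sind die Platzhalter aus diesem Leitfaden):
+
+```py
+from transformers import TFBrandNewBert  # Platzhalter-Klasse aus diesem Leitfaden
+
+# Während der Entwicklung: TensorFlow-Modell direkt aus den PyTorch-Gewichten laden
+model = TFBrandNewBert.from_pretrained("model_repo", from_pt=True)
+
+# Sobald die TensorFlow-Gewichte auf dem Hub liegen: das Flag entfernen
+model = TFBrandNewBert.from_pretrained("model_repo")
+```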
+ +Wenn schließlich die TensorFlow-Gewichte zusammengeführt wurden, Sie mindestens 3 Genehmigungen von Prüfern haben und alle CI-Checks +grün sind, überprüfen Sie die Tests ein letztes Mal lokal + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +und wir werden Ihren PR zusammenführen! Herzlichen Glückwunsch zu dem Meilenstein 🎉. + +**7. (Optional) Erstellen Sie Demos und teilen Sie sie mit der Welt** + +Eine der schwierigsten Aufgaben bei Open-Source ist die Auffindbarkeit. Wie können die anderen Benutzer von der Existenz Ihres +fabelhaften TensorFlow-Beitrags erfahren? Mit der richtigen Kommunikation, natürlich! 📣 + +Es gibt vor allem zwei Möglichkeiten, Ihr Modell mit der Community zu teilen: +- Erstellen Sie Demos. Dazu gehören Gradio-Demos, Notebooks und andere unterhaltsame Möglichkeiten, Ihr Modell vorzuführen. Wir + ermutigen Sie, ein Notebook zu unseren [community-driven demos](https://huggingface.co/docs/transformers/community) hinzuzufügen. +- Teilen Sie Geschichten in sozialen Medien wie Twitter und LinkedIn. Sie sollten stolz auf Ihre Arbeit sein und + Ihre Leistung mit der Community teilen - Ihr Modell kann nun von Tausenden von Ingenieuren und Forschern auf der ganzen + Welt genutzt werden 🌍! Wir werden Ihre Beiträge gerne retweeten und Ihnen helfen, Ihre Arbeit mit der Community zu teilen. + + +## Hinzufügen von TensorFlow-Gewichten zum 🤗 Hub + +Unter der Annahme, dass die TensorFlow-Modellarchitektur in 🤗 Transformers verfügbar ist, ist die Umwandlung von PyTorch-Gewichten in +TensorFlow-Gewichte ein Kinderspiel! + +Hier sehen Sie, wie es geht: +1. Stellen Sie sicher, dass Sie in Ihrem Terminal bei Ihrem Hugging Face Konto angemeldet sind. Sie können sich mit dem Befehl + `huggingface-cli login` anmelden (Ihre Zugangstoken finden Sie [hier](https://huggingface.co/settings/tokens)) +2. Führen Sie `transformers-cli pt-to-tf --model-name foo/bar` aus, wobei `foo/bar` der Name des Modell-Repositorys + ist, das die PyTorch-Gewichte enthält, die Sie konvertieren möchten. +3. Markieren Sie `@joaogante` und `@Rocketknight1` in dem 🤗 Hub PR, den der obige Befehl gerade erstellt hat + +Das war's! 🎉 + + +## Fehlersuche in verschiedenen ML-Frameworks 🐛 + +Irgendwann, wenn Sie eine neue Architektur hinzufügen oder TensorFlow-Gewichte für eine bestehende Architektur erstellen, +stoßen Sie vielleicht auf Fehler, die sich über Unstimmigkeiten zwischen PyTorch und TensorFlow beschweren. Sie könnten sich sogar dazu entschließen, den +Modellarchitektur-Code für die beiden Frameworks zu öffnen, und feststellen, dass beide identisch aussehen. Was ist denn da los? 🤔 + +Lassen Sie uns zunächst darüber sprechen, warum es wichtig ist, diese Diskrepanzen zu verstehen. Viele Community-Mitglieder werden 🤗 +Transformers-Modelle direkt verwenden und vertrauen darauf, dass sich unsere Modelle wie erwartet verhalten. Wenn eine große Diskrepanz +zwischen den beiden Frameworks auftritt, bedeutet dies, dass das Modell für mindestens eines der Frameworks nicht der +Referenzimplementierung folgt. Dies kann zu stillen Fehlern führen, bei denen das Modell zwar läuft, aber eine schlechte Leistung aufweist. Dies ist +wohl schlimmer als ein Modell, das überhaupt nicht läuft! Aus diesem Grund streben wir an, dass die Abweichung zwischen den Frameworks in +allen Phasen des Modells kleiner als `1e-5` ist. 
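+Eine minimale Skizze, wie sich eine solche Abweichung messen lässt (hier mit dem bereits portierten Checkpoint `bert-base-uncased` als Beispiel; für Ihr Modell ersetzen Sie die Klassen entsprechend):
+
+```py
+import numpy as np
+from transformers import AutoTokenizer, BertModel, TFBertModel
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+pt_model = BertModel.from_pretrained("bert-base-uncased")
+tf_model = TFBertModel.from_pretrained("bert-base-uncased")
+
+text = "Hello, TensorFlow!"
+pt_out = pt_model(**tokenizer(text, return_tensors="pt")).last_hidden_state
+tf_out = tf_model(**tokenizer(text, return_tensors="tf")).last_hidden_state
+
+# Maximale absolute Abweichung zwischen den beiden Frameworks - Ziel: < 1e-5
+max_diff = np.max(np.abs(pt_out.detach().numpy() - tf_out.numpy()))
+print(f"Maximale Abweichung: {max_diff:.2e}")
+```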
+ +Wie bei anderen numerischen Problemen auch, steckt der Teufel im Detail. Und wie bei jedem detailorientierten Handwerk ist die geheime +Zutat hier Geduld. Hier ist unser Vorschlag für den Arbeitsablauf, wenn Sie auf diese Art von Problemen stoßen: +1. Lokalisieren Sie die Quelle der Abweichungen. Das Modell, das Sie konvertieren, hat wahrscheinlich bis zu einem gewissen Punkt nahezu identische innere Variablen. + Platzieren Sie `breakpoint()`-Anweisungen in den Architekturen der beiden Frameworks und vergleichen Sie die Werte der + numerischen Variablen von oben nach unten, bis Sie die Quelle der Probleme gefunden haben. +2. Nachdem Sie nun die Ursache des Problems gefunden haben, setzen Sie sich mit dem 🤗 Transformers-Team in Verbindung. Es ist möglich, + dass wir ein ähnliches Problem schon einmal gesehen haben und umgehend eine Lösung anbieten können. Als Ausweichmöglichkeit können Sie beliebte Seiten + wie StackOverflow und GitHub Issues durchsuchen. +3. Wenn keine Lösung in Sicht ist, bedeutet das, dass Sie tiefer gehen müssen. Die gute Nachricht ist, dass Sie die Ursache des + Problems ausfindig gemacht haben, so dass Sie sich auf die problematische Anweisung konzentrieren und den Rest des Modells ausblenden können! Die schlechte Nachricht ist, + dass Sie sich in die Quellimplementierung der besagten Anweisung einarbeiten müssen. In manchen Fällen finden Sie vielleicht ein + Problem mit einer Referenzimplementierung - zögern Sie dann nicht, ein Issue im Upstream-Repository zu öffnen. + +In einigen Fällen können wir nach Rücksprache mit dem 🤗 Transformers-Team zu dem Schluss kommen, dass die Behebung der Abweichung nicht machbar ist. +Wenn die Abweichung in den Ausgabeschichten des Modells sehr klein ist (aber möglicherweise groß in den versteckten Zuständen), +könnten wir beschließen, sie zu ignorieren und das Modell zu verteilen. Die oben erwähnte CLI `pt-to-tf` hat ein `--max-error`- +Flag, um die Fehlermeldung bei der Gewichtskonvertierung zu überschreiben. diff --git a/docs/source/de/autoclass_tutorial.md b/docs/source/de/autoclass_tutorial.md new file mode 100644 index 000000000000..7707f7b39b49 --- /dev/null +++ b/docs/source/de/autoclass_tutorial.md @@ -0,0 +1,131 @@ + + +# Vortrainierte Instanzen mit einer AutoClass laden + +Bei so vielen verschiedenen Transformator-Architekturen kann es eine Herausforderung sein, eine für Ihren Checkpoint zu erstellen. Als Teil der 🤗 Transformers Kernphilosophie, die Bibliothek leicht, einfach und flexibel nutzbar zu machen, leitet eine `AutoClass` automatisch die richtige Architektur aus einem gegebenen Checkpoint ab und lädt sie. Mit der Methode `from_pretrained()` kann man schnell ein vortrainiertes Modell für eine beliebige Architektur laden, so dass man keine Zeit und Ressourcen aufwenden muss, um ein Modell von Grund auf zu trainieren. Die Erstellung dieser Art von Checkpoint-agnostischem Code bedeutet, dass Ihr Code, wenn er für einen Checkpoint funktioniert, auch mit einem anderen Checkpoint funktionieren wird - solange er für eine ähnliche Aufgabe trainiert wurde - selbst wenn die Architektur unterschiedlich ist. + + + +Denken Sie daran, dass sich die Architektur auf das Skelett des Modells bezieht und die Checkpoints die Gewichte für eine bestimmte Architektur sind. Zum Beispiel ist [BERT](https://huggingface.co/bert-base-uncased) eine Architektur, während `bert-base-uncased` ein Checkpoint ist. Modell ist ein allgemeiner Begriff, der entweder Architektur oder Prüfpunkt bedeuten kann. 
+ + + +In dieser Anleitung lernen Sie, wie man: + +* Einen vortrainierten Tokenizer lädt. +* Einen vortrainierten Merkmalsextraktor lädt. +* Einen vortrainierten Prozessor lädt. +* Ein vortrainiertes Modell lädt. + +## AutoTokenizer + +Nahezu jede NLP-Aufgabe beginnt mit einem Tokenizer. Ein Tokenizer wandelt Ihre Eingabe in ein Format um, das vom Modell verarbeitet werden kann. + +Laden Sie einen Tokenizer mit [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +Dann tokenisieren Sie Ihre Eingabe wie unten gezeigt: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoFeatureExtractor + +Für Audio- und Bildverarbeitungsaufgaben verarbeitet ein Merkmalsextraktor das Audiosignal oder Bild in das richtige Eingabeformat. + +Laden Sie einen Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor + +Multimodale Aufgaben erfordern einen Prozessor, der zwei Arten von Vorverarbeitungswerkzeugen kombiniert. Das Modell [LayoutLMV2](model_doc/layoutlmv2) beispielsweise benötigt einen Feature-Extraktor für Bilder und einen Tokenizer für Text; ein Prozessor kombiniert beide. + +Laden Sie einen Prozessor mit [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel + + + +Mit den `AutoModelFor`-Klassen können Sie schließlich ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`AutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + + + +Für PyTorch-Modelle verwendet die Methode `from_pretrained()` `torch.load()`, die intern `pickle` verwendet und als unsicher bekannt ist. Generell sollte man niemals ein Modell laden, das aus einer nicht vertrauenswürdigen Quelle stammen könnte, oder das manipuliert worden sein könnte. Dieses Sicherheitsrisiko wird für öffentliche Modelle, die auf dem Hugging Face Hub gehostet werden, teilweise gemildert, da diese bei jeder Übertragung [auf Malware](https://huggingface.co/docs/hub/security-malware) gescannt werden. Siehe die [Hub-Dokumentation](https://huggingface.co/docs/hub/security) für Best Practices wie [signierte Commit-Verifizierung](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) mit GPG. 
TensorFlow- und Flax-Checkpoints sind nicht betroffen und können in PyTorch-Architekturen mit den Kwargs `from_tf` und `from_flax` für die Methode `from_pretrained` geladen werden, um dieses Problem zu umgehen. + + + +Im Allgemeinen empfehlen wir die Verwendung der Klasse `AutoTokenizer` und der Klasse `AutoModelFor`, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial](preprocessing) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. + + +Mit den Klassen `TFAutoModelFor` schließlich können Sie ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Im Allgemeinen empfehlen wir, die Klasse `AutoTokenizer` und die Klasse `TFAutoModelFor` zu verwenden, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial](preprocessing) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. + + diff --git a/docs/source/de/autoclass_tutorial.mdx b/docs/source/de/autoclass_tutorial.mdx deleted file mode 100644 index 95247cd04ba0..000000000000 --- a/docs/source/de/autoclass_tutorial.mdx +++ /dev/null @@ -1,127 +0,0 @@ - - -# Vortrainierte Instanzen mit einer AutoClass laden - -Bei so vielen verschiedenen Transformator-Architekturen kann es eine Herausforderung sein, eine für Ihren Checkpoint zu erstellen. Als Teil der 🤗 Transformers Kernphilosophie, die Bibliothek leicht, einfach und flexibel nutzbar zu machen, leitet eine `AutoClass` automatisch die richtige Architektur aus einem gegebenen Checkpoint ab und lädt sie. Mit der Methode `from_pretrained()` kann man schnell ein vortrainiertes Modell für eine beliebige Architektur laden, so dass man keine Zeit und Ressourcen aufwenden muss, um ein Modell von Grund auf zu trainieren. Die Erstellung dieser Art von Checkpoint-agnostischem Code bedeutet, dass Ihr Code, wenn er für einen Checkpoint funktioniert, auch mit einem anderen Checkpoint funktionieren wird - solange er für eine ähnliche Aufgabe trainiert wurde - selbst wenn die Architektur unterschiedlich ist. - - - -Denken Sie daran, dass sich die Architektur auf das Skelett des Modells bezieht und die Checkpoints die Gewichte für eine bestimmte Architektur sind. Zum Beispiel ist [BERT](https://huggingface.co/bert-base-uncased) eine Architektur, während `bert-base-uncased` ein Checkpoint ist. Modell ist ein allgemeiner Begriff, der entweder Architektur oder Prüfpunkt bedeuten kann. - - - -In dieser Anleitung lernen Sie, wie man: - -* Einen vortrainierten Tokenizer lädt. -* Einen vortrainierten Merkmalsextraktor lädt. 
-* Einen vortrainierten Prozessor lädt. -* Ein vortrainiertes Modell lädt. - -## AutoTokenizer - -Nahezu jede NLP-Aufgabe beginnt mit einem Tokenizer. Ein Tokenizer wandelt Ihre Eingabe in ein Format um, das vom Modell verarbeitet werden kann. - -Laden Sie einen Tokenizer mit [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -Dann tokenisieren Sie Ihre Eingabe wie unten gezeigt: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoFeatureExtractor - -Für Audio- und Bildverarbeitungsaufgaben verarbeitet ein Merkmalsextraktor das Audiosignal oder Bild in das richtige Eingabeformat. - -Laden Sie einen Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Multimodale Aufgaben erfordern einen Prozessor, der zwei Arten von Vorverarbeitungswerkzeugen kombiniert. Das Modell [LayoutLMV2](model_doc/layoutlmv2) beispielsweise benötigt einen Feature-Extraktor für Bilder und einen Tokenizer für Text; ein Prozessor kombiniert beide. - -Laden Sie einen Prozessor mit [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Mit den `AutoModelFor`-Klassen können Sie schließlich ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - - - -Für PyTorch-Modelle verwendet die Methode `from_pretrained()` `torch.load()`, die intern `pickle` verwendet und als unsicher bekannt ist. Generell sollte man niemals ein Modell laden, das aus einer nicht vertrauenswürdigen Quelle stammen könnte, oder das manipuliert worden sein könnte. Dieses Sicherheitsrisiko wird für öffentliche Modelle, die auf dem Hugging Face Hub gehostet werden, teilweise gemildert, da diese bei jeder Übertragung [auf Malware](https://huggingface.co/docs/hub/security-malware) gescannt werden. Siehe die [Hub-Dokumentation](https://huggingface.co/docs/hub/security) für Best Practices wie [signierte Commit-Verifizierung](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) mit GPG. - -TensorFlow- und Flax-Checkpoints sind nicht betroffen und können in PyTorch-Architekturen mit den Kwargs `from_tf` und `from_flax` für die Methode `from_pretrained` geladen werden, um dieses Problem zu umgehen. 
- - - -Im Allgemeinen empfehlen wir die Verwendung der Klasse "AutoTokenizer" und der Klasse "AutoModelFor", um trainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial] (Vorverarbeitung) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. - - -Mit den Klassen `TFAutoModelFor` schließlich können Sie ein vortrainiertes Modell für eine bestimmte Aufgabe laden (siehe [hier](model_doc/auto) für eine vollständige Liste der verfügbaren Aufgaben). Laden Sie zum Beispiel ein Modell für die Sequenzklassifikation mit [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Im Allgemeinen empfehlen wir, die Klasse "AutoTokenizer" und die Klasse "TFAutoModelFor" zu verwenden, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial] (Vorverarbeitung) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. - - diff --git a/docs/source/de/index.md b/docs/source/de/index.md new file mode 100644 index 000000000000..4742a99f643c --- /dev/null +++ b/docs/source/de/index.md @@ -0,0 +1,334 @@ + + +# 🤗 Transformers + +Maschinelles Lernen auf dem neuesten Stand der Technik für PyTorch, TensorFlow und JAX. + +🤗 Transformers bietet APIs zum einfachen Herunterladen und Trainieren von vortrainierten Modellen auf dem neuesten Stand der Technik. Die Verwendung von vortrainierten Modellen kann Rechenkosten sparen und den CO2-Fußabdruck reduzieren und Zeit sparen, die für das Training eines Modells von Grund auf benötigt wird. Die Modelle können für verschiedene Modalitäten verwendet werden, wie z. B.: + +* 📝 Text: Textklassifizierung, Informationsextrahierung, Beantwortung von Fragen, Zusammenfassung, Übersetzung und Texterstellung in über 100 Sprachen. +* 🖼️ Bilder: Bildklassifizierung, Objekterkennung und Segmentierung. +* 🗣️ Audio: Spracherkennung und Audioklassifizierung. +* 🐙 Multimodal: Beantwortung von Tabellenfragen, optische Zeichenerkennung, Informationsextraktion aus gescannten Dokumenten, Videoklassifizierung und Beantwortung visueller Fragen. + +Unsere Bibliothek unterstützt die nahtlose Integration von drei der beliebtesten Deep-Learning-Bibliotheken: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) und [JAX](https://jax.readthedocs.io/en/latest/). Trainieren Sie Ihr Modell in drei Codezeilen in einem Framework und laden Sie es zur Inferenz mit einem anderen. + +Jede 🤗 Transformers-Architektur ist in einem eigenständigen Python-Modul definiert, so dass sie leicht für Forschung und Experimente angepasst werden kann. 
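+Ein kurzes Beispiel dafür, wie sich ein in einem Framework trainiertes Modell in einem anderen zur Inferenz laden lässt (eine Skizze; der Checkpoint `distilbert-base-uncased` und der lokale Pfad `./mein-modell` dienen nur als Beispiel):
+
+```py
+from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
+
+# Ein in PyTorch (vor-)trainiertes Modell laden und lokal speichern ...
+pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+pt_model.save_pretrained("./mein-modell")
+
+# ... und dieselben Gewichte zur Inferenz in TensorFlow laden
+tf_model = TFAutoModelForSequenceClassification.from_pretrained("./mein-modell", from_pt=True)
+```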
+ +## Wenn Sie auf der Suche nach individueller Unterstützung durch das Hugging Face-Team sind + + HuggingFace Expert Acceleration Program + +## Inhalt + +Die Dokumentation ist in fünf Teile gegliedert: + +- **GET STARTED** enthält eine kurze Tour und Installationsanweisungen, um mit 🤗 Transformers loszulegen. +- **TUTORIALS** sind ein hervorragender Ausgangspunkt, wenn Sie neu in unserer Bibliothek sind. Dieser Abschnitt hilft Ihnen, die grundlegenden Fähigkeiten zu erlangen, die Sie benötigen, um mit 🤗 Transformers zu arbeiten. +- **HOW-TO GUIDES** zeigen Ihnen, wie Sie ein bestimmtes Ziel erreichen können, z. B. die Feinabstimmung eines vortrainierten Modells für die Sprachmodellierung oder die Erstellung eines benutzerdefinierten Modellkopfs. +- **KONZEPTUELLE ANLEITUNGEN** bieten weitere Diskussionen und Erklärungen zu den zugrunde liegenden Konzepten und Ideen hinter Modellen, Aufgaben und der Designphilosophie von 🤗 Transformers. +- **API** beschreibt jede Klasse und Funktion, gruppiert in: + + - **MAIN CLASSES** für die Hauptklassen, die die wichtigsten APIs der Bibliothek darstellen. + - **MODELLE** für die Klassen und Funktionen, die zu jedem in der Bibliothek implementierten Modell gehören. + - **INTERNAL HELPERS** für die Klassen und Funktionen, die wir intern verwenden. + +Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, vortrainierte Modellgewichte, Nutzungsskripte und Konvertierungsprogramme für die folgenden Modelle. + +### Unterstützte Modelle + + + +1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. 
**[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. 
**[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. 
**[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientNet](model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. 
**[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). +1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. 
**[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. 
The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[Mask2Former](model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. 
**[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. 
**[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. 
**[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. 
**[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UMT5](model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. +1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. 
**[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLM-V](model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa. +1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. 
**[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Unterstützte Frameworks + +Die folgende Tabelle zeigt die derzeitige Unterstützung in der Bibliothek für jedes dieser Modelle, unabhängig davon, ob sie einen Python +Tokenizer haben (als "langsam" bezeichnet), ein "schneller" Tokenizer, der von der 🤗 Tokenizers Bibliothek unterstützt wird, ob sie Unterstützung in Jax (via +Flax), PyTorch, und/oder TensorFlow haben. + + + +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| BLOOM | ❌ | ✅ | ✅ | ❌ | ✅ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| GroupViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| 
MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MobileViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | +| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | +| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/de/index.mdx b/docs/source/de/index.mdx deleted file mode 100644 index c7d6511053ec..000000000000 --- a/docs/source/de/index.mdx +++ /dev/null @@ -1,322 +0,0 @@ - - -# 🤗 Transformers - -Maschinelles Lernen auf dem neuesten Stand der Technik für PyTorch, TensorFlow und JAX. - -🤗 Transformers bietet APIs zum einfachen Herunterladen und Trainieren von vortrainierten Modellen auf dem neuesten Stand der Technik. Die Verwendung von vortrainierten Modellen kann Rechenkosten sparen und den CO2-Fußabdruck reduzieren und Zeit sparen, die für das Training eines Modells von Grund auf benötigt wird. Die Modelle können für verschiedene Modalitäten verwendet werden, wie z. B.: - -* 📝 Text: Textklassifizierung, Informationsextrahierung, Beantwortung von Fragen, Zusammenfassung, Übersetzung und Texterstellung in über 100 Sprachen. -* 🖼️ Bilder: Bildklassifizierung, Objekterkennung und Segmentierung. -* 🗣️ Audio: Spracherkennung und Audioklassifizierung. -* 🐙 Multimodal: Beantwortung von Tabellenfragen, optische Zeichenerkennung, Informationsextraktion aus gescannten Dokumenten, Videoklassifizierung und Beantwortung visueller Fragen. - -Unsere Bibliothek unterstützt die nahtlose Integration von drei der beliebtesten Deep-Learning-Bibliotheken: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) und [JAX](https://jax.readthedocs.io/en/latest/). 
Trainieren Sie Ihr Modell in drei Codezeilen in einem Framework und laden Sie es zur Inferenz mit einem anderen. - -Jede 🤗 Transformers-Architektur ist in einem eigenständigen Python-Modul definiert, so dass sie leicht für Forschung und Experimente angepasst werden kann. - -## Wenn Sie auf der Suche nach individueller Unterstützung durch das Hugging Face-Team sind - - - HuggingFace Expert Acceleration Program - - -## Inhalt - -Die Dokumentation ist in fünf Teile gegliedert: - -- **GET STARTED** enthält eine kurze Tour und Installationsanweisungen, um mit 🤗 Transformers loszulegen. -- **TUTORIALS** sind ein hervorragender Ausgangspunkt, wenn Sie neu in unserer Bibliothek sind. Dieser Abschnitt hilft Ihnen, die grundlegenden Fähigkeiten zu erlangen, die Sie benötigen, um mit 🤗 Transformers zu arbeiten. -- **HOW-TO GUIDES** zeigen Ihnen, wie Sie ein bestimmtes Ziel erreichen können, z. B. die Feinabstimmung eines vortrainierten Modells für die Sprachmodellierung oder die Erstellung eines benutzerdefinierten Modellkopfs. -- **KONZEPTUELLE ANLEITUNGEN** bietet weitere Diskussionen und Erklärungen zu den zugrunde liegenden Konzepten und Ideen hinter Modellen, Aufgaben und der Designphilosophie von 🤗 Transformers. -- **API** beschreibt jede Klasse und Funktion, gruppiert in: - - - **MAIN CLASSES** für die Hauptklassen, die die wichtigsten APIs der Bibliothek darstellen. - - MODELLE** für die Klassen und Funktionen, die zu jedem in der Bibliothek implementierten Modell gehören. - - **INTERNAL HELPERS** für die Klassen und Funktionen, die wir intern verwenden. - -Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, vortrainierte Modellgewichte, Nutzungsskripte und Konvertierungsprogramme für die folgenden Modelle. - -### Unterstütze Modelle - - - -1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. -1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. -1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. -1. 
**[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. -1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). -1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. -1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. -1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. 
**[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. -1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. -1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. -1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. 
**[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. -1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. -1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. -1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. 
**[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach -1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. -1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. -1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. 
**[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. -1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. -1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. -1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. -1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. -1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. -1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. 
**[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. -1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. -1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. -1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. -1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. -1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 
-1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. -1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. -1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. -1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. -1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. -1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. -1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. -1. 
**[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. -1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. -1. 
**[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. -1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine -1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler -1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. -1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. 
**[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. -1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. -1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. -1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. -1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. -1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. -1. 
**[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. -1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. -1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. - - -### Unterstützte Frameworks - -Die folgende Tabelle zeigt die derzeitige Unterstützung in der Bibliothek für jedes dieser Modelle, unabhängig davon, ob sie einen Python -Tokenizer haben (als "langsam" bezeichnet), ein "schneller" Tokenizer, der von der 🤗 Tokenizers Bibliothek unterstützt wird, ob sie Unterstützung in Jax (via -Flax), PyTorch, und/oder TensorFlow haben. 
- - - -| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | -| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | -| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | -| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | -| GroupViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | -| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| MobileViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | -| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | -| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | -| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | -| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | -| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -| RegNet | ❌ | ❌ | ✅ | ✅ | ❌ | -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ResNet | ❌ | ❌ | ✅ | ✅ | ❌ | -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | -| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | -| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | -| Trajectory 
Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | -| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | -| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | -| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - - diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md new file mode 100644 index 000000000000..295c9cad97bc --- /dev/null +++ b/docs/source/de/installation.md @@ -0,0 +1,250 @@ + + +# Installation + +Installieren Sie 🤗 Transformers für die Deep-Learning-Bibliothek, mit der Sie arbeiten, richten Sie Ihren Cache ein und konfigurieren Sie 🤗 Transformers optional für den Offline-Betrieb. + +🤗 Transformers wurde unter Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, und Flax getestet. Folgen Sie den Installationsanweisungen unten für die von Ihnen verwendete Deep-Learning-Bibliothek: + +* [PyTorch](https://pytorch.org/get-started/locally/) installation instructions. +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) installation instructions. +* [Flax](https://flax.readthedocs.io/en/latest/) installation instructions. + +## Installation mit pip + +Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, werfen Sie einen Blick auf diese [Anleitung](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Eine virtuelle Umgebung macht es einfacher, verschiedene Projekte zu verwalten und Kompatibilitätsprobleme zwischen Abhängigkeiten zu vermeiden. + +Beginnen wir mit der Erstellung einer virtuellen Umgebung in Ihrem Projektverzeichnis: + + +```bash +python -m venv .env +``` + +Aktivieren wir die virtuelle Umgebung. Unter Linux und MacOs: + +```bash +source .env/bin/activate +``` +Aktivieren wir die virtuelle Umgebung unter Windows + +```bash +.env/Scripts/activate +``` + +Jetzt können wir die 🤗 Transformers mit dem folgenden Befehl installieren: + +```bash +pip install transformers +``` + +Bei reiner CPU-Unterstützung können wir 🤗 Transformers und eine Deep-Learning-Bibliothek bequem in einer Zeile installieren. Installieren wir zum Beispiel 🤗 Transformers und PyTorch mit: + +```bash +pip install transformers[torch] +``` + +🤗 Transformers und TensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 Transformers und Flax: + +```bash +pip install transformers[flax] +``` + +Überprüfen wir abschließend, ob 🤗 Transformers ordnungsgemäß installiert wurde, indem wir den folgenden Befehl ausführen. 
Es wird ein vortrainiertes Modell heruntergeladen: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` + +Dann wird die Kategorie und die Wahrscheinlichkeit ausgegeben: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## Installation aus dem Code + +Installieren wir 🤗 Transformers aus dem Quellcode mit dem folgenden Befehl: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +Dieser Befehl installiert die aktuelle `main` Version und nicht die neueste `stable` Version. Die `main`-Version ist nützlich, um mit den neuesten Entwicklungen Schritt zu halten. Zum Beispiel, wenn ein Fehler seit der letzten offiziellen Version behoben wurde, aber eine neue Version noch nicht veröffentlicht wurde. Das bedeutet jedoch, dass die "Hauptversion" nicht immer stabil ist. Wir bemühen uns, die Hauptversion einsatzbereit zu halten, und die meisten Probleme werden normalerweise innerhalb weniger Stunden oder eines Tages behoben. Wenn Sie auf ein Problem stoßen, öffnen Sie bitte ein [Issue] (https://github.com/huggingface/transformers/issues), damit wir es noch schneller beheben können! + +Überprüfen wir, ob 🤗 Transformers richtig installiert wurde, indem Sie den folgenden Befehl ausführen: + + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## Editierbare Installation + +Sie benötigen eine bearbeitbare Installation, wenn Sie: + +* die "Haupt"-Version des Quellcodes verwenden möchten. +* Zu 🤗 Transformers beitragen und Änderungen am Code testen wollen. + +Klonen Sie das Repository und installieren 🤗 Transformers mit den folgenden Befehlen: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit den Pfaden Ihrer Python-Bibliotheken. Python wird nun in dem Ordner suchen, in den Sie geklont haben, zusätzlich zu den normalen Bibliothekspfaden. Wenn zum Beispiel Ihre Python-Pakete normalerweise in `~/anaconda3/envs/main/lib/python3.7/site-packages/` installiert sind, wird Python auch den Ordner durchsuchen, in den Sie geklont haben: `~/transformers/`. + + + + +Sie müssen den Ordner `transformers` behalten, wenn Sie die Bibliothek weiter verwenden wollen. + + + +Jetzt können Sie Ihren Klon mit dem folgenden Befehl ganz einfach auf die neueste Version von 🤗 Transformers aktualisieren: + + +```bash +cd ~/transformers/ +git pull +``` + +Ihre Python-Umgebung wird beim nächsten Ausführen die `main`-Version von 🤗 Transformers finden. + +## Installation mit conda + +Installation von dem conda Kanal `huggingface`: + +```bash +conda install -c huggingface transformers +``` + +## Cache Einrichtung + +Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: + +1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. +2. Shell-Umgebungsvariable: `HF_HOME`. +3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. 
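Zur Veranschaulichung eine kleine Skizze, wie sich das Cache-Verzeichnis aus Python heraus umlenken lässt. Der Pfad ist hier nur ein angenommenes Beispiel, und die Variable muss gesetzt werden, bevor 🤗 Transformers importiert wird:

```py
>>> import os

>>> # Angenommener Beispielpfad -- bitte an das eigene System anpassen
>>> os.environ["TRANSFORMERS_CACHE"] = "/pfad/zu/meinem/cache"

>>> from transformers import AutoModel  # nachfolgende Downloads landen nun im angegebenen Verzeichnis
```

Alternativ können Sie dieselbe Variable auch direkt in Ihrer Shell exportieren.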
+ + + + +Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE` oder `PYTORCH_PRETRAINED_BERT_CACHE`, wenn Sie von einer früheren Iteration dieser Bibliothek kommen und diese Umgebungsvariablen gesetzt haben, sofern Sie nicht die Shell-Umgebungsvariable `TRANSFORMERS_CACHE` angeben. + + + +## Offline Modus + +Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren. + + + +Fügen sie [🤗 Datasets](https://huggingface.co/docs/datasets/) zu Ihrem Offline-Trainingsworkflow hinzufügen, indem Sie die Umgebungsvariable `HF_DATASETS_OFFLINE=1` setzen. + + + +So würden Sie beispielsweise ein Programm in einem normalen Netzwerk mit einer Firewall für externe Instanzen mit dem folgenden Befehl ausführen: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +Führen Sie das gleiche Programm in einer Offline-Instanz mit aus: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +Das Skript sollte nun laufen, ohne sich aufzuhängen oder eine Zeitüberschreitung abzuwarten, da es weiß, dass es nur nach lokalen Dateien suchen soll. + + +### Abrufen von Modellen und Tokenizern zur Offline-Verwendung + +Eine andere Möglichkeit, 🤗 Transformers offline zu verwenden, besteht darin, die Dateien im Voraus herunterzuladen und dann auf ihren lokalen Pfad zu verweisen, wenn Sie sie offline verwenden müssen. Es gibt drei Möglichkeiten, dies zu tun: + +* Laden Sie eine Datei über die Benutzeroberfläche des [Model Hub](https://huggingface.co/models) herunter, indem Sie auf das ↓-Symbol klicken. + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + +* Verwenden Sie den [PreTrainedModel.from_pretrained] und [PreTrainedModel.save_pretrained] Workflow: + + 1. Laden Sie Ihre Dateien im Voraus mit [`PreTrainedModel.from_pretrained`] herunter: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + 2. Speichern Sie Ihre Dateien in einem bestimmten Verzeichnis mit [`PreTrainedModel.save_pretrained`]: + + ```py + >>> tokenizer.save_pretrained("./your/path/bigscience_t0") + >>> model.save_pretrained("./your/path/bigscience_t0") + ``` + + 3. Wenn Sie nun offline sind, laden Sie Ihre Dateien mit [`PreTrainedModel.from_pretrained`] aus dem bestimmten Verzeichnis: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") + ``` + +* Programmatisches Herunterladen von Dateien mit der [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) Bibliothek: + + 1. Installieren Sie die "huggingface_hub"-Bibliothek in Ihrer virtuellen Umgebung: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. Verwenden Sie die Funktion [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub), um eine Datei in einen bestimmten Pfad herunterzuladen. 
Der folgende Befehl lädt zum Beispiel die Datei "config.json" aus dem Modell [T0](https://huggingface.co/bigscience/T0_3B) in den gewünschten Pfad herunter: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") + ``` + +Sobald Ihre Datei heruntergeladen und lokal zwischengespeichert ist, geben Sie den lokalen Pfad an, um sie zu laden und zu verwenden: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") +``` + + + +Weitere Informationen zum Herunterladen von Dateien, die auf dem Hub gespeichert sind, finden Sie im Abschnitt [Wie man Dateien vom Hub herunterlädt] (https://huggingface.co/docs/hub/how-to-downstream). + + diff --git a/docs/source/de/installation.mdx b/docs/source/de/installation.mdx deleted file mode 100644 index 3103830ee7fd..000000000000 --- a/docs/source/de/installation.mdx +++ /dev/null @@ -1,246 +0,0 @@ - - -# Installation - -Installieren Sie 🤗 Transformers für die Deep-Learning-Bibliothek, mit der Sie arbeiten, richten Sie Ihren Cache ein und konfigurieren Sie 🤗 Transformers optional für den Offline-Betrieb. - -🤗 Transformers wurde unter Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, und Flax getestet. Folgen Sie den Installationsanweisungen unten für die von Ihnen verwendete Deep-Learning-Bibliothek: - -* [PyTorch](https://pytorch.org/get-started/locally/) installation instructions. -* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) installation instructions. -* [Flax](https://flax.readthedocs.io/en/latest/) installation instructions. - -## Installation mit pip - -Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, werfen Sie einen Blick auf diese [Anleitung](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Eine virtuelle Umgebung macht es einfacher, verschiedene Projekte zu verwalten und Kompatibilitätsprobleme zwischen Abhängigkeiten zu vermeiden. - -Beginnen wir mit der Erstellung einer virtuellen Umgebung in Ihrem Projektverzeichnis: - - -```bash -python -m venv .env -``` - -Aktivieren wir die virtuelle Umgebung. Unter Linux und MacOs: - -```bash -source .env/bin/activate -``` -Aktivieren wir die virtuelle Umgebung unter Windows - -```bash -.env/Scripts/activate -``` - -Jetzt können wir die 🤗 Transformers mit dem folgenden Befehl installieren: - -```bash -pip install transformers -``` - -Bei reiner CPU-Unterstützung können wir 🤗 Transformers und eine Deep-Learning-Bibliothek bequem in einer Zeile installieren. Installieren wir zum Beispiel 🤗 Transformers und PyTorch mit: - -```bash -pip install transformers[torch] -``` - -🤗 Transformers und TensorFlow 2.0: - -```bash -pip install transformers[tf-cpu] -``` - -🤗 Transformers und Flax: - -```bash -pip install transformers[flax] -``` - -Überprüfen wir abschließend, ob 🤗 Transformers ordnungsgemäß installiert wurde, indem wir den folgenden Befehl ausführen. 
Es wird ein vortrainiertes Modell heruntergeladen: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" -``` - -Dann wird die Kategorie und die Wahrscheinlichkeit ausgegeben: - -```bash -[{'label': 'POSITIVE', 'score': 0.9998704791069031}] -``` - -## Installation aus dem Code - -Installieren wir 🤗 Transformers aus dem Quellcode mit dem folgenden Befehl: - -```bash -pip install git+https://github.com/huggingface/transformers -``` - -Dieser Befehl installiert die aktuelle `main` Version und nicht die neueste `stable` Version. Die `main`-Version ist nützlich, um mit den neuesten Entwicklungen Schritt zu halten. Zum Beispiel, wenn ein Fehler seit der letzten offiziellen Version behoben wurde, aber eine neue Version noch nicht veröffentlicht wurde. Das bedeutet jedoch, dass die "Hauptversion" nicht immer stabil ist. Wir bemühen uns, die Hauptversion einsatzbereit zu halten, und die meisten Probleme werden normalerweise innerhalb weniger Stunden oder eines Tages behoben. Wenn Sie auf ein Problem stoßen, öffnen Sie bitte ein [Issue] (https://github.com/huggingface/transformers/issues), damit wir es noch schneller beheben können! - -Überprüfen wir, ob 🤗 Transformers richtig installiert wurde, indem Sie den folgenden Befehl ausführen: - - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" -``` - -## Editierbare Installation - -Sie benötigen eine bearbeitbare Installation, wenn Sie: - -* die "Haupt"-Version des Quellcodes verwenden möchten. -* Zu 🤗 Transformers beitragen und Änderungen am Code testen wollen. - -Klonen Sie das Repository und installieren 🤗 Transformers mit den folgenden Befehlen: - -```bash -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install -e . -``` - -Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit den Pfaden Ihrer Python-Bibliotheken. Python wird nun in dem Ordner suchen, in den Sie geklont haben, zusätzlich zu den normalen Bibliothekspfaden. Wenn zum Beispiel Ihre Python-Pakete normalerweise in `~/anaconda3/envs/main/lib/python3.7/site-packages/` installiert sind, wird Python auch den Ordner durchsuchen, in den Sie geklont haben: `~/transformers/`. - - - - -Sie müssen den Ordner `transformers` behalten, wenn Sie die Bibliothek weiter verwenden wollen. - - - -Jetzt können Sie Ihren Klon mit dem folgenden Befehl ganz einfach auf die neueste Version von 🤗 Transformers aktualisieren: - - -```bash -cd ~/transformers/ -git pull -``` - -Ihre Python-Umgebung wird beim nächsten Ausführen die `main`-Version von 🤗 Transformers finden. - -## Installation mit conda - -Installation von dem conda Kanal `huggingface`: - -```bash -conda install -c huggingface transformers -``` - -## Cache Einrichtung - -Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: - -1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. -2. Shell-Umgebungsvariable: `HF_HOME`. -3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. 
- - - - -Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE` oder `PYTORCH_PRETRAINED_BERT_CACHE`, wenn Sie von einer früheren Iteration dieser Bibliothek kommen und diese Umgebungsvariablen gesetzt haben, sofern Sie nicht die Shell-Umgebungsvariable `TRANSFORMERS_CACHE` angeben. - - - -## Offline Modus - -Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren. - - - -Fügen sie [🤗 Datasets](https://huggingface.co/docs/datasets/) zu Ihrem Offline-Trainingsworkflow hinzufügen, indem Sie die Umgebungsvariable `HF_DATASETS_OFFLINE=1` setzen. - - - -So würden Sie beispielsweise ein Programm in einem normalen Netzwerk mit einer Firewall für externe Instanzen mit dem folgenden Befehl ausführen: - -```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Führen Sie das gleiche Programm in einer Offline-Instanz mit aus: - -```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Das Skript sollte nun laufen, ohne sich aufzuhängen oder eine Zeitüberschreitung abzuwarten, da es weiß, dass es nur nach lokalen Dateien suchen soll. - - -### Abrufen von Modellen und Tokenizern zur Offline-Verwendung - -Eine andere Möglichkeit, 🤗 Transformers offline zu verwenden, besteht darin, die Dateien im Voraus herunterzuladen und dann auf ihren lokalen Pfad zu verweisen, wenn Sie sie offline verwenden müssen. Es gibt drei Möglichkeiten, dies zu tun: - -* Laden Sie eine Datei über die Benutzeroberfläche des [Model Hub](https://huggingface.co/models) herunter, indem Sie auf das ↓-Symbol klicken. - - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) - -* Verwenden Sie den [PreTrainedModel.from_pretrained] und [PreTrainedModel.save_pretrained] Workflow: - - 1. Laden Sie Ihre Dateien im Voraus mit [`PreTrainedModel.from_pretrained`] herunter: - - ```py - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") - ``` - - 2. Speichern Sie Ihre Dateien in einem bestimmten Verzeichnis mit [`PreTrainedModel.save_pretrained`]: - - ```py - >>> tokenizer.save_pretrained("./your/path/bigscience_t0") - >>> model.save_pretrained("./your/path/bigscience_t0") - ``` - - 3. Wenn Sie nun offline sind, laden Sie Ihre Dateien mit [`PreTrainedModel.from_pretrained`] aus dem bestimmten Verzeichnis: - - ```py - >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") - >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") - ``` - -* Programmatisches Herunterladen von Dateien mit der [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) Bibliothek: - - 1. Installieren Sie die "huggingface_hub"-Bibliothek in Ihrer virtuellen Umgebung: - - ```bash - python -m pip install huggingface_hub - ``` - - 2. Verwenden Sie die Funktion [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub), um eine Datei in einen bestimmten Pfad herunterzuladen. 
Der folgende Befehl lädt zum Beispiel die Datei "config.json" aus dem Modell [T0](https://huggingface.co/bigscience/T0_3B) in den gewünschten Pfad herunter: - - ```py - >>> from huggingface_hub import hf_hub_download - - >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") - ``` - -Sobald Ihre Datei heruntergeladen und lokal zwischengespeichert ist, geben Sie den lokalen Pfad an, um sie zu laden und zu verwenden: - -```py ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") -``` - - - -Weitere Informationen zum Herunterladen von Dateien, die auf dem Hub gespeichert sind, finden Sie im Abschnitt [Wie man Dateien vom Hub herunterlädt] (https://huggingface.co/docs/hub/how-to-downstream). - - diff --git a/docs/source/de/llm_tutorial.md b/docs/source/de/llm_tutorial.md new file mode 100644 index 000000000000..1c5da4103283 --- /dev/null +++ b/docs/source/de/llm_tutorial.md @@ -0,0 +1,221 @@ + + + +# Generation with LLMs + +[[open-in-colab]] + +LLMs (Large Language Models) sind die Schlüsselkomponente bei der Texterstellung. Kurz gesagt, bestehen sie aus großen, vortrainierten Transformationsmodellen, die darauf trainiert sind, das nächste Wort (oder genauer gesagt Token) aus einem Eingabetext vorherzusagen. Da sie jeweils ein Token vorhersagen, müssen Sie etwas Aufwändigeres tun, um neue Sätze zu generieren, als nur das Modell aufzurufen - Sie müssen eine autoregressive Generierung durchführen. + +Die autoregressive Generierung ist ein Verfahren zur Inferenzzeit, bei dem ein Modell mit seinen eigenen generierten Ausgaben iterativ aufgerufen wird, wenn einige anfängliche Eingaben vorliegen. In 🤗 Transformers wird dies von der Methode [`~generation.GenerationMixin.generate`] übernommen, die allen Modellen mit generativen Fähigkeiten zur Verfügung steht. + +Dieses Tutorial zeigt Ihnen, wie Sie: + +* Text mit einem LLM generieren +* Vermeiden Sie häufige Fallstricke +* Nächste Schritte, damit Sie das Beste aus Ihrem LLM herausholen können + +Bevor Sie beginnen, stellen Sie sicher, dass Sie alle erforderlichen Bibliotheken installiert haben: + +```bash +pip install transformers bitsandbytes>=0.39.0 -q +``` + + +## Text generieren + +Ein Sprachmodell, das für [causal language modeling](tasks/language_modeling) trainiert wurde, nimmt eine Folge von Text-Token als Eingabe und gibt die Wahrscheinlichkeitsverteilung für das nächste Token zurück. + + +
+<!-- Abbildung: "Forward pass of an LLM" -->
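Der folgende Ausschnitt skizziert einen solchen einzelnen Vorwärtsdurchlauf. Als Annahme dient hier das kleine Modell `gpt2`; jedes andere kausale Sprachmodell funktioniert analog:

```py
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")

>>> inputs = tokenizer("A list of colors: red, blue", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # Die Logits der letzten Position ergeben die Verteilung über das nächste Token
>>> next_token_probs = torch.softmax(logits[:, -1, :], dim=-1)
>>> next_token_probs.shape  # (Batchgröße, Vokabulargröße)
torch.Size([1, 50257])
```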
+ +Ein wichtiger Aspekt der autoregressiven Generierung mit LLMs ist die Auswahl des nächsten Tokens aus dieser Wahrscheinlichkeitsverteilung. In diesem Schritt ist alles möglich, solange Sie am Ende ein Token für die nächste Iteration haben. Das heißt, es kann so einfach sein wie die Auswahl des wahrscheinlichsten Tokens aus der Wahrscheinlichkeitsverteilung oder so komplex wie die Anwendung von einem Dutzend Transformationen vor der Stichprobenziehung aus der resultierenden Verteilung. + + +
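Zur Veranschaulichung eine stark vereinfachte, gierige Fassung dieser iterativen Auswahl, aufbauend auf dem Ausschnitt oben (nur eine Skizze ohne Abbruchbedingung; in der Praxis übernimmt das die weiter unten verwendete Methode [`~generation.GenerationMixin.generate`]):

```py
>>> input_ids = inputs["input_ids"]
>>> for _ in range(5):  # fünf neue Token erzeugen
...     with torch.no_grad():
...         logits = model(input_ids).logits
...     # Gierige Auswahl des wahrscheinlichsten Tokens; für Sampling ließe sich z. B. torch.multinomial verwenden
...     next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
...     input_ids = torch.cat([input_ids, next_token], dim=-1)

>>> print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
```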
+<!-- Abbildung: "Die autoregressive Generierung wählt iterativ das nächste Token aus einer Wahrscheinlichkeitsverteilung aus, um Text zu erzeugen" -->
+ +Der oben dargestellte Prozess wird iterativ wiederholt, bis eine bestimmte Abbruchbedingung erreicht ist. Im Idealfall wird die Abbruchbedingung vom Modell vorgegeben, das lernen sollte, wann es ein Ende-der-Sequenz-Token (EOS) ausgeben muss. Ist dies nicht der Fall, stoppt die Generierung, wenn eine vordefinierte Maximallänge erreicht ist. + +Damit sich Ihr Modell so verhält, wie Sie es für Ihre Aufgabe erwarten, müssen Sie den Schritt der Token-Auswahl und die Abbruchbedingung richtig einstellen. Aus diesem Grund haben wir zu jedem Modell eine [`~generation.GenerationConfig`]-Datei, die eine gute generative Standardparametrisierung enthält und zusammen mit Ihrem Modell geladen wird. + +Lassen Sie uns über Code sprechen! + + + +Wenn Sie an der grundlegenden Verwendung von LLMs interessiert sind, ist unsere High-Level-Schnittstelle [`Pipeline`](pipeline_tutorial) ein guter Ausgangspunkt. LLMs erfordern jedoch oft fortgeschrittene Funktionen wie Quantisierung und Feinsteuerung des Token-Auswahlschritts, was am besten über [`~generation.GenerationMixin.generate`] erfolgt. Die autoregressive Generierung mit LLMs ist ebenfalls ressourcenintensiv und sollte für einen angemessenen Durchsatz auf einer GPU ausgeführt werden. + + + + +Zunächst müssen Sie das Modell laden. + +```py +>>> from transformers import AutoModelForCausalLM + +>>> model = AutoModelForCausalLM.from_pretrained( +... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True +... ) +``` + +Sie werden zwei Flags in dem Aufruf `from_pretrained` bemerken: + + - `device_map` stellt sicher, dass das Modell auf Ihre GPU(s) übertragen wird + - `load_in_4bit` wendet [dynamische 4-Bit-Quantisierung](main_classes/quantization) an, um die Ressourcenanforderungen massiv zu reduzieren + +Es gibt noch andere Möglichkeiten, ein Modell zu initialisieren, aber dies ist eine gute Grundlage, um mit einem LLM zu beginnen. + +Als nächstes müssen Sie Ihre Texteingabe mit einem [tokenizer](tokenizer_summary) vorverarbeiten. + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b") +>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda") +``` + +Die Variable `model_inputs` enthält die tokenisierte Texteingabe sowie die Aufmerksamkeitsmaske. Obwohl [`~generation.GenerationMixin.generate`] sein Bestes tut, um die Aufmerksamkeitsmaske abzuleiten, wenn sie nicht übergeben wird, empfehlen wir, sie für optimale Ergebnisse wann immer möglich zu übergeben. + +Rufen Sie schließlich die Methode [~generation.GenerationMixin.generate] auf, um die generierten Token zurückzugeben, die vor dem Drucken in Text umgewandelt werden sollten. + +```py +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A list of colors: red, blue, green, yellow, black, white, and brown' +``` + +Und das war's! Mit ein paar Zeilen Code können Sie sich die Macht eines LLM zunutze machen. + + +## Häufige Fallstricke + +Es gibt viele [Generierungsstrategien](generation_strategies), und manchmal sind die Standardwerte für Ihren Anwendungsfall vielleicht nicht geeignet. Wenn Ihre Ausgaben nicht mit dem übereinstimmen, was Sie erwarten, haben wir eine Liste der häufigsten Fallstricke erstellt und wie Sie diese vermeiden können. 
+ +```py +>>> from transformers import AutoModelForCausalLM, AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b") +>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default +>>> model = AutoModelForCausalLM.from_pretrained( +... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True +... ) +``` + +### Generierte Ausgabe ist zu kurz/lang + +Wenn in der Datei [~generation.GenerationConfig`] nichts angegeben ist, gibt `generate` standardmäßig bis zu 20 Token zurück. Wir empfehlen dringend, `max_new_tokens` in Ihrem `generate`-Aufruf manuell zu setzen, um die maximale Anzahl neuer Token zu kontrollieren, die zurückgegeben werden können. Beachten Sie, dass LLMs (genauer gesagt, [decoder-only models](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt)) auch die Eingabeaufforderung als Teil der Ausgabe zurückgeben. + + +```py +>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda") + +>>> # By default, the output will contain up to 20 tokens +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A sequence of numbers: 1, 2, 3, 4, 5' + +>>> # Setting `max_new_tokens` allows you to control the maximum length +>>> generated_ids = model.generate(**model_inputs, max_new_tokens=50) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,' +``` + +### Falscher Generierungsmodus + +Standardmäßig und sofern nicht in der Datei [~generation.GenerationConfig`] angegeben, wählt `generate` bei jeder Iteration das wahrscheinlichste Token aus (gierige Dekodierung). Je nach Aufgabe kann dies unerwünscht sein; kreative Aufgaben wie Chatbots oder das Schreiben eines Aufsatzes profitieren vom Sampling. Andererseits profitieren Aufgaben, bei denen es auf die Eingabe ankommt, wie z.B. Audiotranskription oder Übersetzung, von der gierigen Dekodierung. Aktivieren Sie das Sampling mit `do_sample=True`. Mehr zu diesem Thema erfahren Sie in diesem [Blogbeitrag] (https://huggingface.co/blog/how-to-generate). + +```py +>>> # Set seed or reproducibility -- you don't need this unless you want full reproducibility +>>> from transformers import set_seed +>>> set_seed(0) + +>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda") + +>>> # LLM + greedy decoding = repetitive, boring output +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'I am a cat. I am a cat. I am a cat. I am a cat' + +>>> # With sampling, the output becomes more creative! +>>> generated_ids = model.generate(**model_inputs, do_sample=True) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'I am a cat.\nI just need to be. I am always.\nEvery time' +``` + +### Falsche Auffüllseite + +LLMs sind [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt)-Architekturen, d.h. sie iterieren weiter über Ihre Eingabeaufforderung. Wenn Ihre Eingaben nicht die gleiche Länge haben, müssen sie aufgefüllt werden. Da LLMs nicht darauf trainiert sind, mit aufgefüllten Token fortzufahren, muss Ihre Eingabe links aufgefüllt werden. Vergessen Sie auch nicht, die Aufmerksamkeitsmaske an generate zu übergeben! + +```py +>>> # The tokenizer initialized above has right-padding active by default: the 1st sequence, +>>> # which is shorter, has padding on the right side. 
Generation fails. +>>> model_inputs = tokenizer( +... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" +... ).to("cuda") +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)[0] +'' + +>>> # With left-padding, it works as expected! +>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", padding_side="left") +>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default +>>> model_inputs = tokenizer( +... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" +... ).to("cuda") +>>> generated_ids = model.generate(**model_inputs) +>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'1, 2, 3, 4, 5, 6,' +``` + + + +## Weitere Ressourcen + +Während der Prozess der autoregressiven Generierung relativ einfach ist, kann die optimale Nutzung Ihres LLM ein schwieriges Unterfangen sein, da es viele bewegliche Teile gibt. Für Ihre nächsten Schritte, die Ihnen helfen, tiefer in die LLM-Nutzung und das Verständnis einzutauchen: + + +### Fortgeschrittene Nutzung generieren + +1. [Leitfaden](generation_strategies) zur Steuerung verschiedener Generierungsmethoden, zur Einrichtung der Generierungskonfigurationsdatei und zum Streaming der Ausgabe; +2. API-Referenz zu [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`] und [generate-bezogene Klassen](internal/generation_utils). + +### LLM-Ranglisten + +1. [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), das sich auf die Qualität der Open-Source-Modelle konzentriert; +2. [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard), das sich auf den LLM-Durchsatz konzentriert. + +### Latenz und Durchsatz + +1. [Leitfaden](main_classes/quantization) zur dynamischen Quantisierung, der Ihnen zeigt, wie Sie Ihren Speicherbedarf drastisch reduzieren können. + +### Verwandte Bibliotheken + +1. [text-generation-inference](https://github.com/huggingface/text-generation-inference), ein produktionsreifer Server für LLMs; +2. [`optimum`](https://github.com/huggingface/optimum), eine Erweiterung von 🤗 Transformers, die für bestimmte Hardware-Geräte optimiert. diff --git a/docs/source/de/model_sharing.md b/docs/source/de/model_sharing.md new file mode 100644 index 000000000000..415277e00e5e --- /dev/null +++ b/docs/source/de/model_sharing.md @@ -0,0 +1,232 @@ + + +# Ein Modell teilen + +Die letzten beiden Tutorials haben gezeigt, wie man ein Modell mit PyTorch, Keras und 🤗 Accelerate für verteilte Setups feinabstimmen kann. Der nächste Schritt besteht darin, Ihr Modell mit der Community zu teilen! Bei Hugging Face glauben wir an den offenen Austausch von Wissen und Ressourcen, um künstliche Intelligenz für alle zu demokratisieren. Wir ermutigen Sie, Ihr Modell mit der Community zu teilen, um anderen zu helfen, Zeit und Ressourcen zu sparen. + +In diesem Tutorial lernen Sie zwei Methoden kennen, wie Sie ein trainiertes oder verfeinertes Modell auf dem [Model Hub](https://huggingface.co/models) teilen können: + +- Programmgesteuertes Übertragen Ihrer Dateien auf den Hub. +- Ziehen Sie Ihre Dateien per Drag-and-Drop über die Weboberfläche in den Hub. + + + + + +Um ein Modell mit der Öffentlichkeit zu teilen, benötigen Sie ein Konto auf [huggingface.co](https://huggingface.co/join). Sie können auch einer bestehenden Organisation beitreten oder eine neue Organisation gründen. 
+ + + +## Repository-Funktionen + +Jedes Repository im Model Hub verhält sich wie ein typisches GitHub-Repository. Unsere Repositorys bieten Versionierung, Commit-Historie und die Möglichkeit, Unterschiede zu visualisieren. + +Die integrierte Versionierung des Model Hub basiert auf Git und [git-lfs](https://git-lfs.github.com/). Mit anderen Worten: Sie können ein Modell als ein Repository behandeln, was eine bessere Zugriffskontrolle und Skalierbarkeit ermöglicht. Die Versionskontrolle ermöglicht *Revisionen*, eine Methode zum Anheften einer bestimmten Version eines Modells mit einem Commit-Hash, Tag oder Branch. + +Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" laden: + +```py +>>> model = AutoModel.from_pretrained( +... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... ) +``` + +Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können die Commit-Historie sowie die Unterschiede einsehen: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Einrichtung + +Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert: + +```bash +huggingface-cli login +``` + +Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub. + +```bash +pip install huggingface_hub +``` + +Verwenden Sie dann `notebook_login`, um sich beim Hub anzumelden, und folgen Sie dem Link [hier](https://huggingface.co/settings/token), um ein Token für die Anmeldung zu generieren: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Ein Modell für alle Frameworks konvertieren + +Um sicherzustellen, dass Ihr Modell von jemandem verwendet werden kann, der mit einem anderen Framework arbeitet, empfehlen wir Ihnen, Ihr Modell sowohl mit PyTorch- als auch mit TensorFlow-Checkpoints zu konvertieren und hochzuladen. Während Benutzer immer noch in der Lage sind, Ihr Modell von einem anderen Framework zu laden, wenn Sie diesen Schritt überspringen, wird es langsamer sein, weil 🤗 Transformers den Checkpoint on-the-fly konvertieren müssen. + +Die Konvertierung eines Checkpoints für ein anderes Framework ist einfach. Stellen Sie sicher, dass Sie PyTorch und TensorFlow installiert haben (siehe [hier](installation) für Installationsanweisungen), und finden Sie dann das spezifische Modell für Ihre Aufgabe in dem anderen Framework. 
+ + + +Geben Sie `from_tf=True` an, um einen Prüfpunkt von TensorFlow nach PyTorch zu konvertieren: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +>>> pt_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +Geben Sie `from_pt=True` an, um einen Prüfpunkt von PyTorch nach TensorFlow zu konvertieren: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +``` + +Dann können Sie Ihr neues TensorFlow-Modell mit seinem neuen Checkpoint speichern: + +```py +>>> tf_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +Wenn ein Modell in Flax verfügbar ist, können Sie auch einen Kontrollpunkt von PyTorch nach Flax konvertieren: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/to/awesome-name-you-picked", from_pt=True +... ) +``` + + + +## Ein Modell während des Trainings hochladen + + + + + +Die Weitergabe eines Modells an den Hub ist so einfach wie das Hinzufügen eines zusätzlichen Parameters oder Rückrufs. Erinnern Sie sich an das [Feinabstimmungs-Tutorial](training), in der Klasse [`TrainingArguments`] geben Sie Hyperparameter und zusätzliche Trainingsoptionen an. Eine dieser Trainingsoptionen beinhaltet die Möglichkeit, ein Modell direkt an den Hub zu pushen. Setzen Sie `push_to_hub=True` in Ihrer [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +``` + +Übergeben Sie Ihre Trainingsargumente wie gewohnt an [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Nach der Feinabstimmung Ihres Modells rufen Sie [`~transformers.Trainer.push_to_hub`] auf [`Trainer`] auf, um das trainierte Modell an den Hub zu übertragen. Transformers fügt sogar automatisch Trainings-Hyperparameter, Trainingsergebnisse und Framework-Versionen zu Ihrer Modellkarte hinzu! + +```py +>>> trainer.push_to_hub() +``` + + +Geben Sie ein Modell mit [`PushToHubCallback`] an den Hub weiter. In der [`PushToHubCallback`] Funktion, fügen Sie hinzu: + +- Ein Ausgabeverzeichnis für Ihr Modell. +- Einen Tokenizer. +- Die `hub_model_id`, die Ihr Hub-Benutzername und Modellname ist. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +... ) +``` + +Fügen Sie den Callback zu [`fit`](https://keras.io/api/models/model_training_apis/) hinzu, und 🤗 Transformers wird das trainierte Modell an den Hub weiterleiten: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + + + +## Verwenden Sie die Funktion `push_to_hub`. + +Sie können `push_to_hub` auch direkt für Ihr Modell aufrufen, um es in den Hub hochzuladen. + +Geben Sie den Namen Ihres Modells in "push_to_hub" an: + +```py +>>> pt_model.push_to_hub("my-awesome-model") +``` + +Dadurch wird ein Repository unter Ihrem Benutzernamen mit dem Modellnamen `my-awesome-model` erstellt. 
Benutzer können nun Ihr Modell mit der Funktion `from_pretrained` laden: + +```py +>>> from transformers import AutoModel + +>>> model = AutoModel.from_pretrained("your_username/my-awesome-model") +``` + +Wenn Sie zu einer Organisation gehören und Ihr Modell stattdessen unter dem Namen der Organisation pushen wollen, fügen Sie diesen einfach zur `repo_id` hinzu: + +```py +>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") +``` + +Die Funktion "push_to_hub" kann auch verwendet werden, um andere Dateien zu einem Modell-Repository hinzuzufügen. Zum Beispiel kann man einen Tokenizer zu einem Modell-Repository hinzufügen: + +```py +>>> tokenizer.push_to_hub("my-awesome-model") +``` + +Oder vielleicht möchten Sie die TensorFlow-Version Ihres fein abgestimmten PyTorch-Modells hinzufügen: + +```py +>>> tf_model.push_to_hub("my-awesome-model") +``` + +Wenn Sie nun zu Ihrem Hugging Face-Profil navigieren, sollten Sie Ihr neu erstelltes Modell-Repository sehen. Wenn Sie auf die Registerkarte **Dateien** klicken, werden alle Dateien angezeigt, die Sie in das Repository hochgeladen haben. + +Weitere Einzelheiten zum Erstellen und Hochladen von Dateien in ein Repository finden Sie in der Hub-Dokumentation [hier](https://huggingface.co/docs/hub/how-to-upstream). + +## Hochladen mit der Weboberfläche + +Benutzer, die einen no-code Ansatz bevorzugen, können ein Modell über das Webinterface des Hubs hochladen. Besuchen Sie [huggingface.co/new](https://huggingface.co/new) um ein neues Repository zu erstellen: + +![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) + +Fügen Sie von hier aus einige Informationen über Ihr Modell hinzu: + +- Wählen Sie den **Besitzer** des Repositorys. Dies können Sie selbst oder eine der Organisationen sein, denen Sie angehören. +- Wählen Sie einen Namen für Ihr Modell, der auch der Name des Repositorys sein wird. +- Wählen Sie, ob Ihr Modell öffentlich oder privat ist. +- Geben Sie die Lizenzverwendung für Ihr Modell an. + +Klicken Sie nun auf die Registerkarte **Dateien** und klicken Sie auf die Schaltfläche **Datei hinzufügen**, um eine neue Datei in Ihr Repository hochzuladen. Ziehen Sie dann eine Datei per Drag-and-Drop hoch und fügen Sie eine Übergabemeldung hinzu. + +![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) + +## Hinzufügen einer Modellkarte + +Um sicherzustellen, dass die Benutzer die Fähigkeiten, Grenzen, möglichen Verzerrungen und ethischen Aspekte Ihres Modells verstehen, fügen Sie bitte eine Modellkarte zu Ihrem Repository hinzu. Die Modellkarte wird in der Datei `README.md` definiert. Sie können eine Modellkarte hinzufügen, indem Sie: + +* Manuelles Erstellen und Hochladen einer "README.md"-Datei. +* Klicken Sie auf die Schaltfläche **Modellkarte bearbeiten** in Ihrem Modell-Repository. + +Werfen Sie einen Blick auf die DistilBert [model card](https://huggingface.co/distilbert-base-uncased) als gutes Beispiel für die Art von Informationen, die eine Modellkarte enthalten sollte. Weitere Details über andere Optionen, die Sie in der Datei "README.md" einstellen können, wie z.B. den Kohlenstoff-Fußabdruck eines Modells oder Beispiele für Widgets, finden Sie in der Dokumentation [hier](https://huggingface.co/docs/hub/models-cards). 
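Wer die Modellkarte lieber programmatisch anlegen möchte, kann dafür zum Beispiel die Klasse `ModelCard` aus der `huggingface_hub`-Bibliothek verwenden. Eine minimale Skizze, bei der Repository-Name und Inhalt frei gewählte Annahmen sind:

```py
>>> from huggingface_hub import ModelCard

>>> content = """---
... license: apache-2.0
... ---
... # my-awesome-model
...
... Kurzbeschreibung, Trainingsdaten, Einschränkungen und mögliche Verzerrungen des Modells.
... """
>>> card = ModelCard(content)
>>> card.push_to_hub("your-username/my-awesome-model")  # legt die README.md im Repository an bzw. aktualisiert sie
```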
\ No newline at end of file diff --git a/docs/source/de/model_sharing.mdx b/docs/source/de/model_sharing.mdx deleted file mode 100644 index 50318595ffc2..000000000000 --- a/docs/source/de/model_sharing.mdx +++ /dev/null @@ -1,228 +0,0 @@ - - -# Ein Modell teilen - -Die letzten beiden Tutorials haben gezeigt, wie man ein Modell mit PyTorch, Keras und 🤗 Accelerate für verteilte Setups feinabstimmen kann. Der nächste Schritt besteht darin, Ihr Modell mit der Community zu teilen! Bei Hugging Face glauben wir an den offenen Austausch von Wissen und Ressourcen, um künstliche Intelligenz für alle zu demokratisieren. Wir ermutigen Sie, Ihr Modell mit der Community zu teilen, um anderen zu helfen, Zeit und Ressourcen zu sparen. - -In diesem Tutorial lernen Sie zwei Methoden kennen, wie Sie ein trainiertes oder verfeinertes Modell auf dem [Model Hub](https://huggingface.co/models) teilen können: - -- Programmgesteuertes Übertragen Ihrer Dateien auf den Hub. -- Ziehen Sie Ihre Dateien per Drag-and-Drop über die Weboberfläche in den Hub. - - - - - -Um ein Modell mit der Öffentlichkeit zu teilen, benötigen Sie ein Konto auf [huggingface.co](https://huggingface.co/join). Sie können auch einer bestehenden Organisation beitreten oder eine neue Organisation gründen. - - - -## Repository-Funktionen - -Jedes Repository im Model Hub verhält sich wie ein typisches GitHub-Repository. Unsere Repositorys bieten Versionierung, Commit-Historie und die Möglichkeit, Unterschiede zu visualisieren. - -Die integrierte Versionierung des Model Hub basiert auf Git und [git-lfs](https://git-lfs.github.com/). Mit anderen Worten: Sie können ein Modell als ein Repository behandeln, was eine bessere Zugriffskontrolle und Skalierbarkeit ermöglicht. Die Versionskontrolle ermöglicht *Revisionen*, eine Methode zum Anheften einer bestimmten Version eines Modells mit einem Commit-Hash, Tag oder Branch. - -Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" laden: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash -... ) -``` - -Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können die Commit-Historie sowie die Unterschiede einsehen: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Einrichtung - -Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert: - -```bash -huggingface-cli login -``` - -Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub. 
- -```bash -pip install huggingface_hub -``` - -Verwenden Sie dann `notebook_login`, um sich beim Hub anzumelden, und folgen Sie dem Link [hier](https://huggingface.co/settings/token), um ein Token für die Anmeldung zu generieren: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Ein Modell für alle Frameworks konvertieren - -Um sicherzustellen, dass Ihr Modell von jemandem verwendet werden kann, der mit einem anderen Framework arbeitet, empfehlen wir Ihnen, Ihr Modell sowohl mit PyTorch- als auch mit TensorFlow-Checkpoints zu konvertieren und hochzuladen. Während Benutzer immer noch in der Lage sind, Ihr Modell von einem anderen Framework zu laden, wenn Sie diesen Schritt überspringen, wird es langsamer sein, weil 🤗 Transformers den Checkpoint on-the-fly konvertieren müssen. - -Die Konvertierung eines Checkpoints für ein anderes Framework ist einfach. Stellen Sie sicher, dass Sie PyTorch und TensorFlow installiert haben (siehe [hier](installation) für Installationsanweisungen), und finden Sie dann das spezifische Modell für Ihre Aufgabe in dem anderen Framework. - - - -Geben Sie `from_tf=True` an, um einen Prüfpunkt von TensorFlow nach PyTorch zu konvertieren: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Geben Sie `from_pt=True` an, um einen Prüfpunkt von PyTorch nach TensorFlow zu konvertieren: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) -``` - -Dann können Sie Ihr neues TensorFlow-Modell mit seinem neuen Checkpoint speichern: - -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Wenn ein Modell in Flax verfügbar ist, können Sie auch einen Kontrollpunkt von PyTorch nach Flax konvertieren: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - - - -## Ein Modell während des Trainings hochladen - - - - - -Die Weitergabe eines Modells an den Hub ist so einfach wie das Hinzufügen eines zusätzlichen Parameters oder Rückrufs. Erinnern Sie sich an das [Feinabstimmungs-Tutorial](training), in der Klasse [`TrainingArguments`] geben Sie Hyperparameter und zusätzliche Trainingsoptionen an. Eine dieser Trainingsoptionen beinhaltet die Möglichkeit, ein Modell direkt an den Hub zu pushen. Setzen Sie `push_to_hub=True` in Ihrer [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) -``` - -Übergeben Sie Ihre Trainingsargumente wie gewohnt an [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Nach der Feinabstimmung Ihres Modells rufen Sie [`~transformers.Trainer.push_to_hub`] auf [`Trainer`] auf, um das trainierte Modell an den Hub zu übertragen. Transformers fügt sogar automatisch Trainings-Hyperparameter, Trainingsergebnisse und Framework-Versionen zu Ihrer Modellkarte hinzu! - -```py ->>> trainer.push_to_hub() -``` - - -Geben Sie ein Modell mit [`PushToHubCallback`] an den Hub weiter. In der [`PushToHubCallback`] Funktion, fügen Sie hinzu: - -- Ein Ausgabeverzeichnis für Ihr Modell. -- Einen Tokenizer. -- Die `hub_model_id`, die Ihr Hub-Benutzername und Modellname ist. 
- -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... ) -``` - -Fügen Sie den Callback zu [`fit`](https://keras.io/api/models/model_training_apis/) hinzu, und 🤗 Transformers wird das trainierte Modell an den Hub weiterleiten: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - - -## Verwenden Sie die Funktion `push_to_hub`. - -Sie können `push_to_hub` auch direkt für Ihr Modell aufrufen, um es in den Hub hochzuladen. - -Geben Sie den Namen Ihres Modells in "push_to_hub" an: - -```py ->>> pt_model.push_to_hub("my-awesome-model") -``` - -Dadurch wird ein Repository unter Ihrem Benutzernamen mit dem Modellnamen `my-awesome-model` erstellt. Benutzer können nun Ihr Modell mit der Funktion `from_pretrained` laden: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") -``` - -Wenn Sie zu einer Organisation gehören und Ihr Modell stattdessen unter dem Namen der Organisation pushen wollen, fügen Sie diesen einfach zur `repo_id` hinzu: - -```py ->>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") -``` - -Die Funktion "push_to_hub" kann auch verwendet werden, um andere Dateien zu einem Modell-Repository hinzuzufügen. Zum Beispiel kann man einen Tokenizer zu einem Modell-Repository hinzufügen: - -```py ->>> tokenizer.push_to_hub("my-awesome-model") -``` - -Oder vielleicht möchten Sie die TensorFlow-Version Ihres fein abgestimmten PyTorch-Modells hinzufügen: - -```py ->>> tf_model.push_to_hub("my-awesome-model") -``` - -Wenn Sie nun zu Ihrem Hugging Face-Profil navigieren, sollten Sie Ihr neu erstelltes Modell-Repository sehen. Wenn Sie auf die Registerkarte **Dateien** klicken, werden alle Dateien angezeigt, die Sie in das Repository hochgeladen haben. - -Weitere Einzelheiten zum Erstellen und Hochladen von Dateien in ein Repository finden Sie in der Hub-Dokumentation [hier](https://huggingface.co/docs/hub/how-to-upstream). - -## Hochladen mit der Weboberfläche - -Benutzer, die einen no-code Ansatz bevorzugen, können ein Modell über das Webinterface des Hubs hochladen. Besuchen Sie [huggingface.co/new](https://huggingface.co/new) um ein neues Repository zu erstellen: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -Fügen Sie von hier aus einige Informationen über Ihr Modell hinzu: - -- Wählen Sie den **Besitzer** des Repositorys. Dies können Sie selbst oder eine der Organisationen sein, denen Sie angehören. -- Wählen Sie einen Namen für Ihr Modell, der auch der Name des Repositorys sein wird. -- Wählen Sie, ob Ihr Modell öffentlich oder privat ist. -- Geben Sie die Lizenzverwendung für Ihr Modell an. - -Klicken Sie nun auf die Registerkarte **Dateien** und klicken Sie auf die Schaltfläche **Datei hinzufügen**, um eine neue Datei in Ihr Repository hochzuladen. Ziehen Sie dann eine Datei per Drag-and-Drop hoch und fügen Sie eine Übergabemeldung hinzu. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Hinzufügen einer Modellkarte - -Um sicherzustellen, dass die Benutzer die Fähigkeiten, Grenzen, möglichen Verzerrungen und ethischen Aspekte Ihres Modells verstehen, fügen Sie bitte eine Modellkarte zu Ihrem Repository hinzu. 
Die Modellkarte wird in der Datei `README.md` definiert. Sie können eine Modellkarte hinzufügen, indem Sie: - -* Manuelles Erstellen und Hochladen einer "README.md"-Datei. -* Klicken Sie auf die Schaltfläche **Modellkarte bearbeiten** in Ihrem Modell-Repository. - -Werfen Sie einen Blick auf die DistilBert [model card](https://huggingface.co/distilbert-base-uncased) als gutes Beispiel für die Art von Informationen, die eine Modellkarte enthalten sollte. Weitere Details über andere Optionen, die Sie in der Datei "README.md" einstellen können, wie z.B. den Kohlenstoff-Fußabdruck eines Modells oder Beispiele für Widgets, finden Sie in der Dokumentation [hier](https://huggingface.co/docs/hub/models-cards). \ No newline at end of file diff --git a/docs/source/de/peft.md b/docs/source/de/peft.md new file mode 100644 index 000000000000..bdc0684d798d --- /dev/null +++ b/docs/source/de/peft.md @@ -0,0 +1,216 @@ + + +# Adapter mit 🤗 PEFT laden + +[[open-in-colab]] + +Die [Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) Methoden frieren die vorab trainierten Modellparameter während der Feinabstimmung ein und fügen eine kleine Anzahl trainierbarer Parameter (die Adapter) hinzu. Die Adapter werden trainiert, um aufgabenspezifische Informationen zu lernen. Es hat sich gezeigt, dass dieser Ansatz sehr speichereffizient ist und weniger Rechenleistung beansprucht, während die Ergebnisse mit denen eines vollständig feinabgestimmten Modells vergleichbar sind. + +Adapter, die mit PEFT trainiert wurden, sind in der Regel um eine Größenordnung kleiner als das vollständige Modell, so dass sie bequem gemeinsam genutzt, gespeichert und geladen werden können. + +
+ +
Die Adaptergewichte für ein OPTForCausalLM-Modell, die auf dem Hub gespeichert sind, sind nur ~6MB groß, verglichen mit der vollen Größe der Modellgewichte, die ~700MB betragen können.
+
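+Zur Veranschaulichung eine kleine Skizze (keine Vorgabe dieses Guides, sondern nur ein Beispiel mit angenommenen Werten wie dem Basismodell `facebook/opt-350m` und `r=8`), die mit den 🤗 PEFT-Funktionen `get_peft_model` und `print_trainable_parameters` zeigt, wie klein der trainierbare Anteil eines LoRA-Adapters im Vergleich zum Basismodell ist:
+
+```py
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+# Basismodell laden und eine kleine, beispielhafte LoRA-Konfiguration definieren
+base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
+lora_config = LoraConfig(task_type="CAUSAL_LM", r=8)
+
+# Adapter anhängen und den Anteil der trainierbaren Parameter ausgeben
+peft_model = get_peft_model(base_model, lora_config)
+peft_model.print_trainable_parameters()
+# gibt etwas aus wie: "trainable params: ... || all params: ... || trainable%: ..." (deutlich unter 1 %)
+```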
+ +Wenn Sie mehr über die 🤗 PEFT-Bibliothek erfahren möchten, sehen Sie sich die [Dokumentation](https://huggingface.co/docs/peft/index) an. + +## Setup + +Starten Sie mit der Installation von 🤗 PEFT: + +```bash +pip install peft +``` + +Wenn Sie die brandneuen Funktionen ausprobieren möchten, sollten Sie die Bibliothek aus dem Quellcode installieren: + +```bash +pip install git+https://github.com/huggingface/peft.git +``` + +## Unterstützte PEFT-Modelle + +Transformers unterstützt nativ einige PEFT-Methoden, d.h. Sie können lokal oder auf dem Hub gespeicherte Adaptergewichte laden und sie mit wenigen Zeilen Code einfach ausführen oder trainieren. Die folgenden Methoden werden unterstützt: + +- [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora) +- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3) +- [AdaLoRA](https://arxiv.org/abs/2303.10512) + +Wenn Sie andere PEFT-Methoden, wie z.B. Prompt Learning oder Prompt Tuning, verwenden möchten, oder über die 🤗 PEFT-Bibliothek im Allgemeinen, lesen Sie bitte die [Dokumentation](https://huggingface.co/docs/peft/index). + + +## Laden Sie einen PEFT-Adapter + +Um ein PEFT-Adaptermodell von 🤗 Transformers zu laden und zu verwenden, stellen Sie sicher, dass das Hub-Repository oder das lokale Verzeichnis eine `adapter_config.json`-Datei und die Adaptergewichte enthält, wie im obigen Beispielbild gezeigt. Dann können Sie das PEFT-Adaptermodell mit der Klasse `AutoModelFor` laden. Um zum Beispiel ein PEFT-Adaptermodell für die kausale Sprachmodellierung zu laden: + +1. Geben Sie die PEFT-Modell-ID an. +2. übergeben Sie es an die Klasse [`AutoModelForCausalLM`]. + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id) +``` + + + +Sie können einen PEFT-Adapter entweder mit einer `AutoModelFor`-Klasse oder der Basismodellklasse wie `OPTForCausalLM` oder `LlamaForCausalLM` laden. + + + +Sie können einen PEFT-Adapter auch laden, indem Sie die Methode `load_adapter` aufrufen: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "facebook/opt-350m" +peft_model_id = "ybelkada/opt-350m-lora" + +model = AutoModelForCausalLM.from_pretrained(model_id) +model.load_adapter(peft_model_id) +``` + +## Laden in 8bit oder 4bit + +Die `bitsandbytes`-Integration unterstützt Datentypen mit 8bit und 4bit Genauigkeit, was für das Laden großer Modelle nützlich ist, weil es Speicher spart (lesen Sie den `bitsandbytes`-Integrations [guide](./quantization#bitsandbytes-integration), um mehr zu erfahren). Fügen Sie die Parameter `load_in_8bit` oder `load_in_4bit` zu [`~PreTrainedModel.from_pretrained`] hinzu und setzen Sie `device_map="auto"`, um das Modell effektiv auf Ihre Hardware zu verteilen: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True) +``` + +## Einen neuen Adapter hinzufügen + +Sie können [`~peft.PeftModel.add_adapter`] verwenden, um einen neuen Adapter zu einem Modell mit einem bestehenden Adapter hinzuzufügen, solange der neue Adapter vom gleichen Typ ist wie der aktuelle Adapter. 
Wenn Sie zum Beispiel einen bestehenden LoRA-Adapter an ein Modell angehängt haben:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import LoraConfig
+
+model_id = "facebook/opt-350m"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+lora_config = LoraConfig(
+    target_modules=["q_proj", "k_proj"],
+    init_lora_weights=False
+)
+
+model.add_adapter(lora_config, adapter_name="adapter_1")
+```
+
+Um einen neuen Adapter hinzuzufügen:
+
+```py
+# attach new adapter with same config
+model.add_adapter(lora_config, adapter_name="adapter_2")
+```
+
+Jetzt können Sie mit [`~peft.PeftModel.set_adapter`] festlegen, welcher Adapter verwendet werden soll:
+
+```py
+# use adapter_1
+model.set_adapter("adapter_1")
+output = model.generate(**inputs)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+# use adapter_2
+model.set_adapter("adapter_2")
+output_enabled = model.generate(**inputs)
+print(tokenizer.decode(output_enabled[0], skip_special_tokens=True))
+```
+
+## Aktivieren und Deaktivieren von Adaptern
+
+Sobald Sie einen Adapter zu einem Modell hinzugefügt haben, können Sie das Adaptermodul aktivieren oder deaktivieren. So aktivieren Sie das Adaptermodul:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import PeftConfig
+
+model_id = "facebook/opt-350m"
+adapter_model_id = "ybelkada/opt-350m-lora"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+text = "Hello"
+inputs = tokenizer(text, return_tensors="pt")
+
+model = AutoModelForCausalLM.from_pretrained(model_id)
+peft_config = PeftConfig.from_pretrained(adapter_model_id)
+
+# to initiate with random weights
+peft_config.init_lora_weights = False
+
+model.add_adapter(peft_config)
+model.enable_adapters()
+output = model.generate(**inputs)
+```
+
+So deaktivieren Sie das Adaptermodul:
+
+```py
+model.disable_adapters()
+output = model.generate(**inputs)
+```
+
+## PEFT-Adapter trainieren
+
+PEFT-Adapter werden von der Klasse [`Trainer`] unterstützt, so dass Sie einen Adapter für Ihren speziellen Anwendungsfall trainieren können. Dazu müssen Sie nur ein paar weitere Codezeilen hinzufügen. Zum Beispiel, um einen LoRA-Adapter zu trainieren:
+
+<Tip>
+
+Wenn Sie mit der Feinabstimmung eines Modells mit [`Trainer`] noch nicht vertraut sind, werfen Sie einen Blick auf das Tutorial [Feinabstimmung eines vortrainierten Modells](training).
+
+</Tip>
+
+1. Definieren Sie Ihre Adapterkonfiguration mit dem Aufgabentyp und den Hyperparametern (siehe [`~peft.LoraConfig`] für weitere Details darüber, was die Hyperparameter tun).
+
+```py
+from peft import LoraConfig
+
+peft_config = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+```
+
+2. Fügen Sie dem Modell einen Adapter hinzu.
+
+```py
+model.add_adapter(peft_config)
+```
+
+3. Jetzt können Sie das Modell an [`Trainer`] übergeben!
+
+```py
+trainer = Trainer(model=model, ...)
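+# `...` steht hier als Platzhalter für die übrigen Trainer-Argumente (z. B. args und train_dataset)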
+trainer.train() +``` + +So speichern Sie Ihren trainierten Adapter und laden ihn wieder: + +```py +model.save_pretrained(save_dir) +model = AutoModelForCausalLM.from_pretrained(save_dir) +``` + + diff --git a/docs/source/de/pipeline_tutorial.md b/docs/source/de/pipeline_tutorial.md new file mode 100644 index 000000000000..06ab440d73a6 --- /dev/null +++ b/docs/source/de/pipeline_tutorial.md @@ -0,0 +1,175 @@ + + +# Pipelines für Inferenzen + +Die [`pipeline`] macht es einfach, jedes beliebige Modell aus dem [Hub](https://huggingface.co/models) für die Inferenz auf jede Sprache, Computer Vision, Sprache und multimodale Aufgaben zu verwenden. Selbst wenn Sie keine Erfahrung mit einer bestimmten Modalität haben oder nicht mit dem zugrundeliegenden Code hinter den Modellen vertraut sind, können Sie sie mit der [`pipeline`] für Inferenzen verwenden! In diesem Beispiel lernen Sie, wie: + +* Eine [`pipeline`] für Inferenz zu verwenden. +* Einen bestimmten Tokenizer oder ein bestimmtes Modell zu verwenden. +* Eine [`pipeline`] für Audio-, Vision- und multimodale Aufgaben zu verwenden. + + + +Eine vollständige Liste der unterstützten Aufgaben und verfügbaren Parameter finden Sie in der [`pipeline`]-Dokumentation. + + + +## Verwendung von Pipelines + +Obwohl jede Aufgabe eine zugehörige [`pipeline`] hat, ist es einfacher, die allgemeine [`pipeline`]-Abstraktion zu verwenden, die alle aufgabenspezifischen Pipelines enthält. Die [`pipeline`] lädt automatisch ein Standardmodell und eine Vorverarbeitungsklasse, die für Ihre Aufgabe inferenzfähig ist. + +1. Beginnen Sie mit der Erstellung einer [`pipeline`] und geben Sie eine Inferenzaufgabe an: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation") +``` + +2. Übergeben Sie Ihren Eingabetext an die [`pipeline`]: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] +``` + +Wenn Sie mehr als eine Eingabe haben, übergeben Sie die Eingabe als Liste: + +```py +>>> generator( +... [ +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", +... ] +... ) # doctest: +SKIP +``` + +Alle zusätzlichen Parameter für Ihre Aufgabe können auch in die [`pipeline`] aufgenommen werden. Die Aufgabe `Text-Generierung` hat eine [`~generation.GenerationMixin.generate`]-Methode mit mehreren Parametern zur Steuerung der Ausgabe. Wenn Sie zum Beispiel mehr als eine Ausgabe erzeugen wollen, setzen Sie den Parameter `num_return_sequences`: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... num_return_sequences=2, +... ) # doctest: +SKIP +``` + +### Wählen Sie ein Modell und einen Tokenizer + +Die [`pipeline`] akzeptiert jedes Modell aus dem [Hub] (https://huggingface.co/models). Auf dem Hub gibt es Tags, mit denen Sie nach einem Modell filtern können, das Sie für Ihre Aufgabe verwenden möchten. Sobald Sie ein passendes Modell ausgewählt haben, laden Sie es mit der entsprechenden `AutoModelFor` und [`AutoTokenizer`] Klasse. 
Laden Sie zum Beispiel die Klasse [`AutoModelForCausalLM`] für eine kausale Sprachmodellierungsaufgabe: + +```py +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Erstellen Sie eine [`pipeline`] für Ihre Aufgabe, und geben Sie das Modell und den Tokenizer an, die Sie geladen haben: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) +``` + +Übergeben Sie Ihren Eingabetext an die [`pipeline`] , um einen Text zu erzeugen: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] +``` + +## Audio-Pipeline + +Die [`pipeline`] unterstützt auch Audioaufgaben wie Audioklassifizierung und automatische Spracherkennung. + +Lassen Sie uns zum Beispiel die Emotion in diesem Audioclip klassifizieren: + +```py +>>> from datasets import load_dataset +>>> import torch + +>>> torch.manual_seed(42) # doctest: +IGNORE_RESULT +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +>>> audio_file = ds[0]["audio"]["path"] +``` + +Finden Sie ein [Audioklassifikation](https://huggingface.co/models?pipeline_tag=audio-classification) Modell auf dem Model Hub für Emotionserkennung und laden Sie es in die [`pipeline`]: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +Übergeben Sie die Audiodatei an die [`pipeline`]: + +```py +>>> preds = audio_classifier(audio_file) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] +``` + +## Bildverarbeitungs-Pipeline + +Die Verwendung einer [`pipeline`] für Bildverarbeitungsaufgaben ist praktisch identisch. + +Geben Sie Ihre Aufgabe an und übergeben Sie Ihr Bild an den Klassifikator. Das Bild kann ein Link oder ein lokaler Pfad zu dem Bild sein. Zum Beispiel: Welche Katzenart ist unten abgebildet? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> preds = vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +``` + +## Multimodale Pipeline + +Die [`pipeline`] unterstützt mehr als eine Modalität. 
Eine Aufgabe zur Beantwortung visueller Fragen (VQA) kombiniert zum Beispiel Text und Bild. Verwenden Sie einen beliebigen Bildlink und eine Frage, die Sie zu dem Bild stellen möchten. Das Bild kann eine URL oder ein lokaler Pfad zu dem Bild sein. + +Wenn Sie zum Beispiel das gleiche Bild wie in der obigen Vision-Pipeline verwenden: + +```py +>>> image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +>>> question = "Where is the cat?" +``` + +Erstellen Sie eine Pipeline für "vqa" und übergeben Sie ihr das Bild und die Frage: + +```py +>>> from transformers import pipeline + +>>> vqa = pipeline(task="vqa") +>>> preds = vqa(image=image, question=question) +>>> preds = [{"score": round(pred["score"], 4), "answer": pred["answer"]} for pred in preds] +>>> preds +[{'score': 0.9112, 'answer': 'snow'}, {'score': 0.8796, 'answer': 'in snow'}, {'score': 0.6717, 'answer': 'outside'}, {'score': 0.0291, 'answer': 'on ground'}, {'score': 0.027, 'answer': 'ground'}] +``` diff --git a/docs/source/de/pipeline_tutorial.mdx b/docs/source/de/pipeline_tutorial.mdx deleted file mode 100644 index 19c37c35dea1..000000000000 --- a/docs/source/de/pipeline_tutorial.mdx +++ /dev/null @@ -1,171 +0,0 @@ - - -# Pipelines für Inferenzen - -Die [`pipeline`] macht es einfach, jedes beliebige Modell aus dem [Hub](https://huggingface.co/models) für die Inferenz auf jede Sprache, Computer Vision, Sprache und multimodale Aufgaben zu verwenden. Selbst wenn Sie keine Erfahrung mit einer bestimmten Modalität haben oder nicht mit dem zugrundeliegenden Code hinter den Modellen vertraut sind, können Sie sie mit der [`pipeline`] für Inferenzen verwenden! In diesem Beispiel lernen Sie, wie: - -* Eine [`pipeline`] für Inferenz zu verwenden. -* Einen bestimmten Tokenizer oder ein bestimmtes Modell zu verwenden. -* Eine [`pipeline`] für Audio-, Vision- und multimodale Aufgaben zu verwenden. - - - -Eine vollständige Liste der unterstützten Aufgaben und verfügbaren Parameter finden Sie in der [`pipeline`]-Dokumentation. - - - -## Verwendung von Pipelines - -Obwohl jede Aufgabe eine zugehörige [`pipeline`] hat, ist es einfacher, die allgemeine [`pipeline`]-Abstraktion zu verwenden, die alle aufgabenspezifischen Pipelines enthält. Die [`pipeline`] lädt automatisch ein Standardmodell und eine Vorverarbeitungsklasse, die für Ihre Aufgabe inferenzfähig ist. - -1. Beginnen Sie mit der Erstellung einer [`pipeline`] und geben Sie eine Inferenzaufgabe an: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation") -``` - -2. Übergeben Sie Ihren Eingabetext an die [`pipeline`]: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] -``` - -Wenn Sie mehr als eine Eingabe haben, übergeben Sie die Eingabe als Liste: - -```py ->>> generator( -... [ -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", -... ] -... ) # doctest: +SKIP -``` - -Alle zusätzlichen Parameter für Ihre Aufgabe können auch in die [`pipeline`] aufgenommen werden. 
Die Aufgabe `Text-Generierung` hat eine [`~generation.GenerationMixin.generate`]-Methode mit mehreren Parametern zur Steuerung der Ausgabe. Wenn Sie zum Beispiel mehr als eine Ausgabe erzeugen wollen, setzen Sie den Parameter `num_return_sequences`: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... num_return_sequences=2, -... ) # doctest: +SKIP -``` - -### Wählen Sie ein Modell und einen Tokenizer - -Die [`pipeline`] akzeptiert jedes Modell aus dem [Hub] (https://huggingface.co/models). Auf dem Hub gibt es Tags, mit denen Sie nach einem Modell filtern können, das Sie für Ihre Aufgabe verwenden möchten. Sobald Sie ein passendes Modell ausgewählt haben, laden Sie es mit der entsprechenden `AutoModelFor` und [`AutoTokenizer`] Klasse. Laden Sie zum Beispiel die Klasse [`AutoModelForCausalLM`] für eine kausale Sprachmodellierungsaufgabe: - -```py ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Erstellen Sie eine [`pipeline`] für Ihre Aufgabe, und geben Sie das Modell und den Tokenizer an, die Sie geladen haben: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) -``` - -Übergeben Sie Ihren Eingabetext an die [`pipeline`] , um einen Text zu erzeugen: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] -``` - -## Audio-Pipeline - -Die [`pipeline`] unterstützt auch Audioaufgaben wie Audioklassifizierung und automatische Spracherkennung. - -Lassen Sie uns zum Beispiel die Emotion in diesem Audioclip klassifizieren: - -```py ->>> from datasets import load_dataset ->>> import torch - ->>> torch.manual_seed(42) # doctest: +IGNORE_RESULT ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> audio_file = ds[0]["audio"]["path"] -``` - -Finden Sie ein [Audioklassifikation](https://huggingface.co/models?pipeline_tag=audio-classification) Modell auf dem Model Hub für Emotionserkennung und laden Sie es in die [`pipeline`]: - -```py ->>> from transformers import pipeline - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -Übergeben Sie die Audiodatei an die [`pipeline`]: - -```py ->>> preds = audio_classifier(audio_file) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] -``` - -## Bildverarbeitungs-Pipeline - -Die Verwendung einer [`pipeline`] für Bildverarbeitungsaufgaben ist praktisch identisch. - -Geben Sie Ihre Aufgabe an und übergeben Sie Ihr Bild an den Klassifikator. Das Bild kann ein Link oder ein lokaler Pfad zu dem Bild sein. Zum Beispiel: Welche Katzenart ist unten abgebildet? 
- -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] -``` - -## Multimodale Pipeline - -Die [`pipeline`] unterstützt mehr als eine Modalität. Eine Aufgabe zur Beantwortung visueller Fragen (VQA) kombiniert zum Beispiel Text und Bild. Verwenden Sie einen beliebigen Bildlink und eine Frage, die Sie zu dem Bild stellen möchten. Das Bild kann eine URL oder ein lokaler Pfad zu dem Bild sein. - -Wenn Sie zum Beispiel das gleiche Bild wie in der obigen Vision-Pipeline verwenden: - -```py ->>> image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" ->>> question = "Where is the cat?" -``` - -Erstellen Sie eine Pipeline für "vqa" und übergeben Sie ihr das Bild und die Frage: - -```py ->>> from transformers import pipeline - ->>> vqa = pipeline(task="vqa") ->>> preds = vqa(image=image, question=question) ->>> preds = [{"score": round(pred["score"], 4), "answer": pred["answer"]} for pred in preds] ->>> preds -[{'score': 0.9112, 'answer': 'snow'}, {'score': 0.8796, 'answer': 'in snow'}, {'score': 0.6717, 'answer': 'outside'}, {'score': 0.0291, 'answer': 'on ground'}, {'score': 0.027, 'answer': 'ground'}] -``` diff --git a/docs/source/de/pr_checks.md b/docs/source/de/pr_checks.md new file mode 100644 index 000000000000..ee2bbf489b8e --- /dev/null +++ b/docs/source/de/pr_checks.md @@ -0,0 +1,199 @@ + + +# Überprüfungen bei einer Pull-Anfrage + +Wenn Sie eine Pull-Anfrage für 🤗 Transformers öffnen, wird eine ganze Reihe von Prüfungen durchgeführt, um sicherzustellen, dass der Patch, den Sie hinzufügen, nichts Bestehendes zerstört. Es gibt vier Arten von Prüfungen: +- reguläre Tests +- Erstellung der Dokumentation +- Stil von Code und Dokumentation +- allgemeine Konsistenz des Repository + +In diesem Dokument werden wir versuchen zu erklären, worum es sich bei diesen verschiedenen Prüfungen handelt und wie Sie sie lokal debuggen können, wenn eine der Prüfungen in Ihrer PR fehlschlägt. + +Beachten Sie, dass Sie im Idealfall eine Dev-Installation benötigen: + +```bash +pip install transformers[dev] +``` + +oder für eine bearbeitbare Installation: + +```bash +pip install -e .[dev] +``` + +innerhalb des Transformers Repo. Da die Anzahl der optionalen Abhängigkeiten von Transformers stark zugenommen hat, ist es möglich, dass Sie nicht alle davon bekommen können. Wenn die Dev-Installation fehlschlägt, stellen Sie sicher, dass Sie das Deep Learning-Framework, mit dem Sie arbeiten, installieren (PyTorch, TensorFlow und/oder Flax). + +```bash +pip install transformers[quality] +``` + +oder für eine bearbeitbare Installation: + +```bash +pip install -e .[quality] +``` + + +## Tests + +Alle Jobs, die mit `ci/circleci: run_tests_` beginnen, führen Teile der Transformers-Testsuite aus. 
Jeder dieser Jobs konzentriert sich auf einen Teil der Bibliothek in einer bestimmten Umgebung: `ci/circleci: run_tests_pipelines_tf` zum Beispiel führt den Pipelines-Test in einer Umgebung aus, in der nur TensorFlow installiert ist. + +Beachten Sie, dass nur ein Teil der Testsuite jedes Mal ausgeführt wird, um zu vermeiden, dass Tests ausgeführt werden, wenn es keine wirkliche Änderung in den Modulen gibt, die sie testen: ein Dienstprogramm wird ausgeführt, um die Unterschiede in der Bibliothek zwischen vor und nach dem PR zu ermitteln (was GitHub Ihnen auf der Registerkarte "Files changes" anzeigt) und die Tests auszuwählen, die von diesem Unterschied betroffen sind. Dieses Dienstprogramm kann lokal mit ausgeführt werden: + +```bash +python utils/tests_fetcher.py +``` + +aus dem Stammverzeichnis des Transformers-Repositoriums. Es wird: + +1. Überprüfen Sie für jede Datei im Diff, ob die Änderungen im Code oder nur in Kommentaren oder Docstrings enthalten sind. Nur die Dateien mit echten Codeänderungen werden beibehalten. +2. Erstellen Sie eine interne Map, die für jede Datei des Quellcodes der Bibliothek alle Dateien angibt, auf die sie rekursiv Einfluss nimmt. Von Modul A wird gesagt, dass es sich auf Modul B auswirkt, wenn Modul B Modul A importiert. Für die rekursive Auswirkung benötigen wir eine Kette von Modulen, die von Modul A zu Modul B führt und in der jedes Modul das vorherige importiert. +3. Wenden Sie diese Zuordnung auf die in Schritt 1 gesammelten Dateien an. So erhalten wir die Liste der Modelldateien, die von der PR betroffen sind. +4. Ordnen Sie jede dieser Dateien der/den entsprechenden Testdatei(en) zu und erhalten Sie die Liste der auszuführenden Tests. + +Wenn Sie das Skript lokal ausführen, sollten Sie die Ergebnisse von Schritt 1, 3 und 4 ausgegeben bekommen und somit wissen, welche Tests ausgeführt werden. Das Skript erstellt außerdem eine Datei namens `test_list.txt`, die die Liste der auszuführenden Tests enthält, die Sie mit dem folgenden Befehl lokal ausführen können: + +```bash +python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) +``` + +Für den Fall, dass Ihnen etwas entgangen ist, wird die komplette Testreihe ebenfalls täglich ausgeführt. + +## Dokumentation erstellen + +Der Job `build_pr_documentation` erstellt und generiert eine Vorschau der Dokumentation, um sicherzustellen, dass alles in Ordnung ist, wenn Ihr PR zusammengeführt wird. Ein Bot fügt einen Link zur Vorschau der Dokumentation zu Ihrem PR hinzu. Alle Änderungen, die Sie an dem PR vornehmen, werden automatisch in der Vorschau aktualisiert. Wenn die Dokumentation nicht erstellt werden kann, klicken Sie auf **Details** neben dem fehlgeschlagenen Auftrag, um zu sehen, wo der Fehler liegt. Oft ist der Fehler so einfach wie eine fehlende Datei im `toctree`. + +Wenn Sie daran interessiert sind, die Dokumentation lokal zu erstellen oder in der Vorschau anzusehen, werfen Sie einen Blick in die [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) im Ordner docs. + +## Code und Dokumentationsstil + +Die Formatierung des Codes erfolgt für alle Quelldateien, die Beispiele und die Tests mit `black` und `ruff`. Wir haben auch ein benutzerdefiniertes Tool, das sich um die Formatierung von docstrings und `rst`-Dateien kümmert (`utils/style_doc.py`), sowie um die Reihenfolge der Lazy-Importe, die in den Transformers `__init__.py`-Dateien durchgeführt werden (`utils/custom_init_isort.py`). 
All dies können Sie starten, indem Sie Folgendes ausführen + +```bash +make style +``` + +Das CI prüft, ob diese innerhalb der Prüfung `ci/circleci: check_code_quality` angewendet wurden. Es führt auch `ruff` aus, das einen grundlegenden Blick auf Ihren Code wirft und sich beschwert, wenn es eine undefinierte Variable findet oder eine, die nicht verwendet wird. Um diese Prüfung lokal auszuführen, verwenden Sie + +```bash +make quality +``` + +Dies kann sehr viel Zeit in Anspruch nehmen. Um dasselbe nur für die Dateien zu tun, die Sie im aktuellen Zweig geändert haben, führen Sie + +```bash +make fixup +``` + +Dieser letzte Befehl führt auch alle zusätzlichen Prüfungen für die Konsistenz des Repositorys durch. Schauen wir uns diese an. + +## Repository-Konsistenz + +Dies fasst alle Tests zusammen, die sicherstellen, dass Ihr PR das Repository in einem guten Zustand verlässt. Sie können diese Prüfung lokal durchführen, indem Sie Folgendes ausführen: + +```bash +make repo-consistency +``` + +Dies überprüft, ob: + +- Alle zum Init hinzugefügten Objekte sind dokumentiert (ausgeführt von `utils/check_repo.py`) +- Alle `__init__.py`-Dateien haben in ihren beiden Abschnitten den gleichen Inhalt (ausgeführt von `utils/check_inits.py`) +- Der gesamte Code, der als Kopie eines anderen Moduls identifiziert wurde, stimmt mit dem Original überein (ausgeführt von `utils/check_copies.py`) +- Alle Konfigurationsklassen haben mindestens einen gültigen Prüfpunkt, der in ihren Dokumentationen erwähnt wird (ausgeführt von `utils/check_config_docstrings.py`) +- Alle Konfigurationsklassen enthalten nur Attribute, die in den entsprechenden Modellierungsdateien verwendet werden (ausgeführt von `utils/check_config_attributes.py`) +- Die Übersetzungen der READMEs und der Index des Dokuments haben die gleiche Modellliste wie die Haupt-README (durchgeführt von `utils/check_copies.py`) +- Die automatisch generierten Tabellen in der Dokumentation sind auf dem neuesten Stand (ausgeführt von `utils/check_table.py`) +- Die Bibliothek verfügt über alle Objekte, auch wenn nicht alle optionalen Abhängigkeiten installiert sind (ausgeführt von `utils/check_dummies.py`) + +Sollte diese Prüfung fehlschlagen, müssen die ersten beiden Punkte manuell korrigiert werden, die letzten vier können automatisch für Sie korrigiert werden, indem Sie den Befehl + +```bash +make fix-copies +``` + +Zusätzliche Prüfungen betreffen PRs, die neue Modelle hinzufügen, vor allem, dass: + +- Alle hinzugefügten Modelle befinden sich in einer Auto-Zuordnung (durchgeführt von `utils/check_repo.py`) + +- Alle Modelle werden ordnungsgemäß getestet (ausgeführt von `utils/check_repo.py`) + + + +### Kopien prüfen + +Da die Transformers-Bibliothek in Bezug auf den Modellcode sehr eigenwillig ist und jedes Modell vollständig in einer einzigen Datei implementiert sein sollte, ohne sich auf andere Modelle zu stützen, haben wir einen Mechanismus hinzugefügt, der überprüft, ob eine Kopie des Codes einer Ebene eines bestimmten Modells mit dem Original übereinstimmt. Auf diese Weise können wir bei einer Fehlerbehebung alle anderen betroffenen Modelle sehen und entscheiden, ob wir die Änderung weitergeben oder die Kopie zerstören. + + + +Wenn eine Datei eine vollständige Kopie einer anderen Datei ist, sollten Sie sie in der Konstante `FULL_COPIES` von `utils/check_copies.py` registrieren. + + + +Dieser Mechanismus stützt sich auf Kommentare der Form `# Kopiert von xxx`. Das `xxx` sollte den gesamten Pfad zu der Klasse der Funktion enthalten, die darunter kopiert wird. 
Zum Beispiel ist `RobertaSelfOutput` eine direkte Kopie der Klasse `BertSelfOutput`. Sie können also [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L289) sehen, dass sie einen Kommentar hat: + +```py +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +``` + +Beachten Sie, dass Sie dies nicht auf eine ganze Klasse anwenden, sondern auf die entsprechenden Methoden, von denen kopiert wird. Zum Beispiel [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L598) können Sie sehen, wie `RobertaPreTrainedModel._init_weights` von der gleichen Methode in `BertPreTrainedModel` mit dem Kommentar kopiert wird: + +```py +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights +``` + +Manchmal ist die Kopie bis auf die Namen genau gleich: zum Beispiel verwenden wir in `RobertaAttention` `RobertaSelfAttention` anstelle von `BertSelfAttention`, aber ansonsten ist der Code genau derselbe. Aus diesem Grund unterstützt `#Copied from` einfache String-Ersetzungen mit der folgenden Syntax: `Kopiert von xxx mit foo->bar`. Das bedeutet, dass der Code kopiert wird, wobei alle Instanzen von "foo" durch "bar" ersetzt werden. Sie können sehen, wie es [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` mit dem Kommentar verwendet wird: + +```py +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +``` + +Beachten Sie, dass um den Pfeil herum keine Leerzeichen stehen sollten (es sei denn, das Leerzeichen ist Teil des zu ersetzenden Musters, natürlich). + +Sie können mehrere Muster durch ein Komma getrennt hinzufügen. Zum Beispiel ist hier `CamemberForMaskedLM` eine direkte Kopie von `RobertaForMaskedLM` mit zwei Ersetzungen: `Roberta` zu `Camembert` und `ROBERTA` zu `CAMEMBERT`. Sie können [hier](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/camembert/modeling_camembert.py#L929) sehen, wie dies mit dem Kommentar gemacht wird: + +```py +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +``` + +Wenn die Reihenfolge eine Rolle spielt (weil eine der Ersetzungen mit einer vorherigen in Konflikt geraten könnte), werden die Ersetzungen von links nach rechts ausgeführt. + + + +Wenn die Ersetzungen die Formatierung ändern (wenn Sie z.B. einen kurzen Namen durch einen sehr langen Namen ersetzen), wird die Kopie nach Anwendung des automatischen Formats überprüft. + + + +Eine andere Möglichkeit, wenn es sich bei den Mustern nur um verschiedene Umschreibungen derselben Ersetzung handelt (mit einer groß- und einer kleingeschriebenen Variante), besteht darin, die Option `all-casing` hinzuzufügen. 
[Hier](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/mobilebert/modeling_mobilebert.py#L1237) ist ein Beispiel in `MobileBertForSequenceClassification` mit dem Kommentar: + +```py +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing +``` + +In diesem Fall wird der Code von `BertForSequenceClassification` kopiert, indem er ersetzt wird: +- `Bert` durch `MobileBert` (zum Beispiel bei der Verwendung von `MobileBertModel` in der Init) +- `bert` durch `mobilebert` (zum Beispiel bei der Definition von `self.mobilebert`) +- `BERT` durch `MOBILEBERT` (in der Konstante `MOBILEBERT_INPUTS_DOCSTRING`) diff --git a/docs/source/de/preprocessing.md b/docs/source/de/preprocessing.md new file mode 100644 index 000000000000..1e8f6ff4062a --- /dev/null +++ b/docs/source/de/preprocessing.md @@ -0,0 +1,506 @@ + + +# Vorverarbeiten + +[[open-in-colab]] + +Bevor Sie Ihre Daten in einem Modell verwenden können, müssen die Daten in ein für das Modell akzeptables Format gebracht werden. Ein Modell versteht keine Rohtexte, Bilder oder Audiodaten. Diese Eingaben müssen in Zahlen umgewandelt und zu Tensoren zusammengesetzt werden. In dieser Anleitung werden Sie: + +* Textdaten mit einem Tokenizer vorverarbeiten. +* Bild- oder Audiodaten mit einem Feature Extractor vorverarbeiten. +* Daten für eine multimodale Aufgabe mit einem Prozessor vorverarbeiten. + +## NLP + + + +Das wichtigste Werkzeug zur Verarbeitung von Textdaten ist ein [Tokenizer](main_classes/tokenizer). Ein Tokenizer zerlegt Text zunächst nach einer Reihe von Regeln in *Token*. Die Token werden in Zahlen umgewandelt, die zum Aufbau von Tensoren als Eingabe für ein Modell verwendet werden. Alle zusätzlichen Eingaben, die ein Modell benötigt, werden ebenfalls vom Tokenizer hinzugefügt. + + + +Wenn Sie ein vortrainiertes Modell verwenden möchten, ist es wichtig, den zugehörigen vortrainierten Tokenizer zu verwenden. Dadurch wird sichergestellt, dass der Text auf die gleiche Weise aufgeteilt wird wie das Pretraining-Korpus und die gleichen entsprechenden Token-zu-Index (in der Regel als *vocab* bezeichnet) während des Pretrainings verwendet werden. + + + +Laden Sie einen vortrainierten Tokenizer mit der Klasse [AutoTokenizer], um schnell loszulegen. Damit wird das *vocab* heruntergeladen, das verwendet wird, wenn ein Modell vortrainiert wird. + +### Tokenize + +Laden Sie einen vortrainierten Tokenizer mit [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +Dann übergeben Sie Ihren Satz an den Tokenizer: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Der Tokenizer gibt ein Wörterbuch mit drei wichtigen Elementen zurück: + +* [input_ids](glossary#input-ids) sind die Indizes, die den einzelnen Token im Satz entsprechen. +* [attention_mask](glossary#attention-mask) gibt an, ob ein Token beachtet werden soll oder nicht. 
+* [token_type_ids](glossary#token-type-ids) gibt an, zu welcher Sequenz ein Token gehört, wenn es mehr als eine Sequenz gibt. + +Sie können die `input_ids` dekodieren, um die ursprüngliche Eingabe zurückzugeben: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +Wie Sie sehen können, hat der Tokenisierer zwei spezielle Token - `CLS` und `SEP` (Klassifikator und Separator) - zum Satz hinzugefügt. Nicht alle Modelle benötigen +spezielle Token, aber wenn dies der Fall ist, fügt der Tokenisierer sie automatisch für Sie hinzu. + +Wenn Sie mehrere Sätze verarbeiten wollen, übergeben Sie die Sätze als Liste an den Tokenizer: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### Pad + +Dies bringt uns zu einem wichtigen Thema. Wenn Sie einen Haufen von Sätzen verarbeiten, sind diese nicht immer gleich lang. Das ist ein Problem, weil Tensoren, die Eingabe für das Modell, eine einheitliche Form haben müssen. Padding ist eine Strategie, die sicherstellt, dass Tensoren rechteckig sind, indem ein spezielles *Padding-Token* zu Sätzen mit weniger Token hinzugefügt wird. + +Setzen Sie den Parameter "padding" auf "true", um die kürzeren Sequenzen im Stapel so aufzufüllen, dass sie der längsten Sequenz entsprechen: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +Beachten Sie, dass der Tokenizer den ersten und den dritten Satz mit einer "0" aufgefüllt hat, weil sie kürzer sind! + +### Kürzung + +Auf der anderen Seite des Spektrums kann es vorkommen, dass eine Sequenz zu lang für ein Modell ist. In diesem Fall müssen Sie die Sequenz auf eine kürzere Länge kürzen. + +Setzen Sie den Parameter "truncation" auf "true", um eine Sequenz auf die vom Modell akzeptierte Höchstlänge zu kürzen: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +### Tensoren erstellen + +Schließlich möchten Sie, dass der Tokenizer die tatsächlichen Tensoren zurückgibt, die dem Modell zugeführt werden. + +Setzen Sie den Parameter `return_tensors` entweder auf `pt` für PyTorch, oder `tf` für TensorFlow: + + + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} +``` + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': , + 'token_type_ids': , + 'attention_mask': } +``` + + + +## Audio + +Audioeingaben werden anders vorverarbeitet als Texteingaben, aber das Endziel bleibt dasselbe: numerische Sequenzen zu erstellen, die das Modell verstehen kann. Ein [feature extractor](main_classes/feature_extractor) dient dem ausdrücklichen Zweck, Merkmale aus Rohbild- oder Audiodaten zu extrahieren und in Tensoren zu konvertieren. Bevor Sie beginnen, installieren Sie 🤗 Datasets, um einen Audio-Datensatz zu laden, mit dem Sie experimentieren können: + +```bash +pip install datasets +``` + +Laden Sie den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz (weitere Informationen zum Laden eines Datensatzes finden Sie im 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html)): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Greifen Sie auf das erste Element der `audio`-Spalte zu, um einen Blick auf die Eingabe zu werfen. Durch den Aufruf der Spalte "audio" wird die Audiodatei automatisch geladen und neu gesampelt: + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. 
], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +Dies gibt drei Elemente zurück: + +* "array" ist das Sprachsignal, das als 1D-Array geladen - und möglicherweise neu gesampelt - wurde. +* Pfad" zeigt auf den Speicherort der Audiodatei. +* `sampling_rate` bezieht sich darauf, wie viele Datenpunkte im Sprachsignal pro Sekunde gemessen werden. + +### Resample + +Für dieses Tutorial werden Sie das Modell [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) verwenden. Wie Sie aus der Modellkarte ersehen können, ist das Wav2Vec2-Modell auf 16kHz abgetastetes Sprachaudio vortrainiert. Es ist wichtig, dass die Abtastrate Ihrer Audiodaten mit der Abtastrate des Datensatzes übereinstimmt, der für das Pre-Training des Modells verwendet wurde. Wenn die Abtastrate Ihrer Daten nicht dieselbe ist, müssen Sie Ihre Audiodaten neu abtasten. + +Der Datensatz [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) hat zum Beispiel eine Abtastrate von 8000 kHz. Um das Wav2Vec2-Modell mit diesem Datensatz verwenden zu können, müssen Sie die Abtastrate auf 16 kHz erhöhen: + +```py +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +1. Verwenden Sie die Methode [~datasets.Dataset.cast_column] von 🤗 Datasets, um die Abtastrate auf 16kHz zu erhöhen: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. Laden Sie die Audiodatei: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +Wie Sie sehen können, ist die Abtastrate jetzt 16kHz! + +### Merkmalsextraktor + +Der nächste Schritt ist das Laden eines Merkmalsextraktors, um die Eingabe zu normalisieren und aufzufüllen. Beim Auffüllen von Textdaten wird für kürzere Sequenzen ein `0` hinzugefügt. Die gleiche Idee gilt für Audiodaten, und der Audio-Feature-Extraktor fügt eine `0` - interpretiert als Stille - zu `array` hinzu. + +Laden Sie den Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Übergeben Sie das Audio-"Array" an den Feature-Extraktor. Wir empfehlen auch, das Argument `sampling_rate` im Feature Extractor hinzuzufügen, um eventuell auftretende stille Fehler besser zu beheben. + +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +### Auffüllen und Kürzen + +Genau wie beim Tokenizer können Sie variable Sequenzen in einem Stapel durch Auffüllen oder Abschneiden behandeln. 
Werfen Sie einen Blick auf die Sequenzlänge dieser beiden Audiobeispiele: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +Wie Sie sehen können, hat das erste Beispiel eine längere Sequenz als das zweite Beispiel. Lassen Sie uns eine Funktion erstellen, die den Datensatz vorverarbeitet. Geben Sie eine maximale Länge der Probe an, und der Feature-Extraktor wird die Sequenzen entweder auffüllen oder abschneiden, damit sie dieser Länge entsprechen: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +Wenden Sie die Funktion auf die ersten paar Beispiele im Datensatz an: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +Schauen Sie sich nun noch einmal die verarbeiteten Beispiel-Längen an: + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +Die Länge der ersten beiden Beispiele entspricht nun der von Ihnen angegebenen Maximallänge. + +## Bildverarbeitung + +Ein Merkmalsextraktor wird auch verwendet, um Bilder für Bildverarbeitungsaufgaben zu verarbeiten. Auch hier besteht das Ziel darin, das Rohbild in eine Reihe von Tensoren als Eingabe zu konvertieren. + +Laden wir den [food101](https://huggingface.co/datasets/food101) Datensatz für dieses Tutorial. Verwenden Sie den Parameter 🤗 Datasets `split`, um nur eine kleine Stichprobe aus dem Trainingssplit zu laden, da der Datensatz recht groß ist: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +Als Nächstes sehen Sie sich das Bild mit dem Merkmal 🤗 Datensätze [Bild] (https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) an: + +```py +>>> dataset[0]["image"] +``` + +![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) + +### Merkmalsextraktor + +Laden Sie den Merkmalsextraktor mit [`AutoImageProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + +### Datenerweiterung + +Bei Bildverarbeitungsaufgaben ist es üblich, den Bildern als Teil der Vorverarbeitung eine Art von Datenerweiterung hinzuzufügen. Sie können Erweiterungen mit jeder beliebigen Bibliothek hinzufügen, aber in diesem Tutorial werden Sie das Modul [`transforms`](https://pytorch.org/vision/stable/transforms.html) von torchvision verwenden. + +1. Normalisieren Sie das Bild und verwenden Sie [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html), um einige Transformationen - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) und [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - miteinander zu verknüpfen: + +```py +>>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor + +>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +>>> _transforms = Compose( +... 
[RandomResizedCrop(image_processor.size["height"]), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] +... ) +``` + +2. Das Modell akzeptiert [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) als Eingabe. Dieser Wert wird vom Merkmalsextraktor erzeugt. Erstellen Sie eine Funktion, die `pixel_values` aus den Transformationen erzeugt: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] +... return examples +``` + +3. Dann verwenden Sie 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform), um die Transformationen im laufenden Betrieb anzuwenden: + +```py +>>> dataset.set_transform(transforms) +``` + +4. Wenn Sie nun auf das Bild zugreifen, werden Sie feststellen, dass der Feature Extractor die Modelleingabe "pixel_values" hinzugefügt hat: + +```py +>>> dataset[0]["image"] +{'image': , + 'label': 6, + 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], + [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], + [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], + ..., + [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], + [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], + [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], + + [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], + [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], + [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], + ..., + [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], + [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], + [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], + + [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], + [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], + [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], + ..., + [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], + [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], + [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} +``` + +Hier sehen Sie, wie das Bild nach der Vorverarbeitung aussieht. Wie von den angewandten Transformationen zu erwarten, wurde das Bild willkürlich beschnitten und seine Farbeigenschaften sind anders. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) + +## Multimodal + +Für multimodale Aufgaben werden Sie eine Kombination aus allem, was Sie bisher gelernt haben, verwenden und Ihre Fähigkeiten auf eine Aufgabe der automatischen Spracherkennung (ASR) anwenden. Dies bedeutet, dass Sie einen: + +* Feature Extractor zur Vorverarbeitung der Audiodaten. +* Tokenizer, um den Text zu verarbeiten. 
+ +Kehren wir zum [LJ Speech](https://huggingface.co/datasets/lj_speech) Datensatz zurück: + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +Da Sie hauptsächlich an den Spalten "Audio" und "Text" interessiert sind, entfernen Sie die anderen Spalten: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Schauen Sie sich nun die Spalten "Audio" und "Text" an: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Erinnern Sie sich an den früheren Abschnitt über die Verarbeitung von Audiodaten: Sie sollten immer die Abtastrate Ihrer Audiodaten [resample](preprocessing#audio), damit sie mit der Abtastrate des Datensatzes übereinstimmt, der für das Vortraining eines Modells verwendet wird: + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +### Prozessor + +Ein Processor kombiniert einen Feature-Extraktor und einen Tokenizer. Laden Sie einen Processor mit [`AutoProcessor.from_pretrained]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Erstellen Sie eine Funktion, die die Audiodaten zu `input_values` verarbeitet und den Text zu `labels` tokenisiert. Dies sind Ihre Eingaben für das Modell: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Wenden Sie die Funktion "prepare_dataset" auf ein Beispiel an: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +Beachten Sie, dass der Processor `input_values` und `labels` hinzugefügt hat. Auch die Abtastrate wurde korrekt auf 16kHz heruntergerechnet. + +Toll, Sie sollten jetzt in der Lage sein, Daten für jede Modalität vorzuverarbeiten und sogar verschiedene Modalitäten zu kombinieren! Im nächsten Kurs lernen Sie, wie Sie ein Modell mit Ihren neu aufbereiteten Daten feinabstimmen können. diff --git a/docs/source/de/preprocessing.mdx b/docs/source/de/preprocessing.mdx deleted file mode 100644 index ea6c185cc101..000000000000 --- a/docs/source/de/preprocessing.mdx +++ /dev/null @@ -1,502 +0,0 @@ - - -# Vorverarbeiten - -[[open-in-colab]] - -Bevor Sie Ihre Daten in einem Modell verwenden können, müssen die Daten in ein für das Modell akzeptables Format gebracht werden. Ein Modell versteht keine Rohtexte, Bilder oder Audiodaten. Diese Eingaben müssen in Zahlen umgewandelt und zu Tensoren zusammengesetzt werden. In dieser Anleitung werden Sie: - -* Textdaten mit einem Tokenizer vorverarbeiten. -* Bild- oder Audiodaten mit einem Feature Extractor vorverarbeiten. -* Daten für eine multimodale Aufgabe mit einem Prozessor vorverarbeiten. - -## NLP - - - -Das wichtigste Werkzeug zur Verarbeitung von Textdaten ist ein [Tokenizer](main_classes/tokenizer). Ein Tokenizer zerlegt Text zunächst nach einer Reihe von Regeln in *Token*. 
Die Token werden in Zahlen umgewandelt, die zum Aufbau von Tensoren als Eingabe für ein Modell verwendet werden. Alle zusätzlichen Eingaben, die ein Modell benötigt, werden ebenfalls vom Tokenizer hinzugefügt. - - - -Wenn Sie ein vortrainiertes Modell verwenden möchten, ist es wichtig, den zugehörigen vortrainierten Tokenizer zu verwenden. Dadurch wird sichergestellt, dass der Text auf die gleiche Weise aufgeteilt wird wie das Pretraining-Korpus und die gleichen entsprechenden Token-zu-Index (in der Regel als *vocab* bezeichnet) während des Pretrainings verwendet werden. - - - -Laden Sie einen vortrainierten Tokenizer mit der Klasse [AutoTokenizer], um schnell loszulegen. Damit wird das *vocab* heruntergeladen, das verwendet wird, wenn ein Modell vortrainiert wird. - -### Tokenize - -Laden Sie einen vortrainierten Tokenizer mit [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -Dann übergeben Sie Ihren Satz an den Tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Der Tokenizer gibt ein Wörterbuch mit drei wichtigen Elementen zurück: - -* [input_ids](glossary#input-ids) sind die Indizes, die den einzelnen Token im Satz entsprechen. -* [attention_mask](glossary#attention-mask) gibt an, ob ein Token beachtet werden soll oder nicht. -* [token_type_ids](glossary#token-type-ids) gibt an, zu welcher Sequenz ein Token gehört, wenn es mehr als eine Sequenz gibt. - -Sie können die `input_ids` dekodieren, um die ursprüngliche Eingabe zurückzugeben: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' -``` - -Wie Sie sehen können, hat der Tokenisierer zwei spezielle Token - `CLS` und `SEP` (Klassifikator und Separator) - zum Satz hinzugefügt. Nicht alle Modelle benötigen -spezielle Token, aber wenn dies der Fall ist, fügt der Tokenisierer sie automatisch für Sie hinzu. - -Wenn Sie mehrere Sätze verarbeiten wollen, übergeben Sie die Sätze als Liste an den Tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Dies bringt uns zu einem wichtigen Thema. Wenn Sie einen Haufen von Sätzen verarbeiten, sind diese nicht immer gleich lang. Das ist ein Problem, weil Tensoren, die Eingabe für das Modell, eine einheitliche Form haben müssen. 
Padding ist eine Strategie, die sicherstellt, dass Tensoren rechteckig sind, indem ein spezielles *Padding-Token* zu Sätzen mit weniger Token hinzugefügt wird. - -Setzen Sie den Parameter "padding" auf "true", um die kürzeren Sequenzen im Stapel so aufzufüllen, dass sie der längsten Sequenz entsprechen: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -Beachten Sie, dass der Tokenizer den ersten und den dritten Satz mit einer "0" aufgefüllt hat, weil sie kürzer sind! - -### Kürzung - -Auf der anderen Seite des Spektrums kann es vorkommen, dass eine Sequenz zu lang für ein Modell ist. In diesem Fall müssen Sie die Sequenz auf eine kürzere Länge kürzen. - -Setzen Sie den Parameter "truncation" auf "true", um eine Sequenz auf die vom Modell akzeptierte Höchstlänge zu kürzen: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -### Tensoren erstellen - -Schließlich möchten Sie, dass der Tokenizer die tatsächlichen Tensoren zurückgibt, die dem Modell zugeführt werden. - -Setzen Sie den Parameter `return_tensors` entweder auf `pt` für PyTorch, oder `tf` für TensorFlow: - - - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} -``` - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - - - -## Audio - -Audioeingaben werden anders vorverarbeitet als Texteingaben, aber das Endziel bleibt dasselbe: numerische Sequenzen zu erstellen, die das Modell verstehen kann. Ein [feature extractor](main_classes/feature_extractor) dient dem ausdrücklichen Zweck, Merkmale aus Rohbild- oder Audiodaten zu extrahieren und in Tensoren zu konvertieren. Bevor Sie beginnen, installieren Sie 🤗 Datasets, um einen Audio-Datensatz zu laden, mit dem Sie experimentieren können: - -```bash -pip install datasets -``` - -Laden Sie den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz (weitere Informationen zum Laden eines Datensatzes finden Sie im 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html)): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Greifen Sie auf das erste Element der `audio`-Spalte zu, um einen Blick auf die Eingabe zu werfen. Durch den Aufruf der Spalte "audio" wird die Audiodatei automatisch geladen und neu gesampelt: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -Dies gibt drei Elemente zurück: - -* "array" ist das Sprachsignal, das als 1D-Array geladen - und möglicherweise neu gesampelt - wurde. -* Pfad" zeigt auf den Speicherort der Audiodatei. -* `sampling_rate` bezieht sich darauf, wie viele Datenpunkte im Sprachsignal pro Sekunde gemessen werden. - -### Resample - -Für dieses Tutorial werden Sie das Modell [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) verwenden. Wie Sie aus der Modellkarte ersehen können, ist das Wav2Vec2-Modell auf 16kHz abgetastetes Sprachaudio vortrainiert. Es ist wichtig, dass die Abtastrate Ihrer Audiodaten mit der Abtastrate des Datensatzes übereinstimmt, der für das Pre-Training des Modells verwendet wurde. Wenn die Abtastrate Ihrer Daten nicht dieselbe ist, müssen Sie Ihre Audiodaten neu abtasten. - -Der Datensatz [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) hat zum Beispiel eine Abtastrate von 8000 kHz. 
Um das Wav2Vec2-Modell mit diesem Datensatz verwenden zu können, müssen Sie die Abtastrate auf 16 kHz erhöhen: - -```py ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -1. Verwenden Sie die Methode [~datasets.Dataset.cast_column] von 🤗 Datasets, um die Abtastrate auf 16kHz zu erhöhen: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Laden Sie die Audiodatei: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Wie Sie sehen können, ist die Abtastrate jetzt 16kHz! - -### Merkmalsextraktor - -Der nächste Schritt ist das Laden eines Merkmalsextraktors, um die Eingabe zu normalisieren und aufzufüllen. Beim Auffüllen von Textdaten wird für kürzere Sequenzen ein `0` hinzugefügt. Die gleiche Idee gilt für Audiodaten, und der Audio-Feature-Extraktor fügt eine `0` - interpretiert als Stille - zu `array` hinzu. - -Laden Sie den Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Übergeben Sie das Audio-"Array" an den Feature-Extraktor. Wir empfehlen auch, das Argument `sampling_rate` im Feature Extractor hinzuzufügen, um eventuell auftretende stille Fehler besser zu beheben. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -### Auffüllen und Kürzen - -Genau wie beim Tokenizer können Sie variable Sequenzen in einem Stapel durch Auffüllen oder Abschneiden behandeln. Werfen Sie einen Blick auf die Sequenzlänge dieser beiden Audiobeispiele: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Wie Sie sehen können, hat das erste Beispiel eine längere Sequenz als das zweite Beispiel. Lassen Sie uns eine Funktion erstellen, die den Datensatz vorverarbeitet. Geben Sie eine maximale Länge der Probe an, und der Feature-Extraktor wird die Sequenzen entweder auffüllen oder abschneiden, damit sie dieser Länge entsprechen: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... 
return inputs -``` - -Wenden Sie die Funktion auf die ersten paar Beispiele im Datensatz an: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -Schauen Sie sich nun noch einmal die verarbeiteten Beispiel-Längen an: - -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -Die Länge der ersten beiden Beispiele entspricht nun der von Ihnen angegebenen Maximallänge. - -## Bildverarbeitung - -Ein Merkmalsextraktor wird auch verwendet, um Bilder für Bildverarbeitungsaufgaben zu verarbeiten. Auch hier besteht das Ziel darin, das Rohbild in eine Reihe von Tensoren als Eingabe zu konvertieren. - -Laden wir den [food101](https://huggingface.co/datasets/food101) Datensatz für dieses Tutorial. Verwenden Sie den Parameter 🤗 Datasets `split`, um nur eine kleine Stichprobe aus dem Trainingssplit zu laden, da der Datensatz recht groß ist: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Als Nächstes sehen Sie sich das Bild mit dem Merkmal 🤗 Datensätze [Bild] (https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) an: - -```py ->>> dataset[0]["image"] -``` - -![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) - -### Merkmalsextraktor - -Laden Sie den Merkmalsextraktor mit [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") -``` - -### Datenerweiterung - -Bei Bildverarbeitungsaufgaben ist es üblich, den Bildern als Teil der Vorverarbeitung eine Art von Datenerweiterung hinzuzufügen. Sie können Erweiterungen mit jeder beliebigen Bibliothek hinzufügen, aber in diesem Tutorial werden Sie das Modul [`transforms`](https://pytorch.org/vision/stable/transforms.html) von torchvision verwenden. - -1. Normalisieren Sie das Bild und verwenden Sie [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html), um einige Transformationen - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) und [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - miteinander zu verknüpfen: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose( -... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] -... ) -``` - -2. Das Modell akzeptiert [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) als Eingabe. Dieser Wert wird vom Merkmalsextraktor erzeugt. Erstellen Sie eine Funktion, die `pixel_values` aus den Transformationen erzeugt: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. Dann verwenden Sie 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform), um die Transformationen im laufenden Betrieb anzuwenden: - -```py ->>> dataset.set_transform(transforms) -``` - -4. 
Wenn Sie nun auf das Bild zugreifen, werden Sie feststellen, dass der Feature Extractor die Modelleingabe "pixel_values" hinzugefügt hat: - -```py ->>> dataset[0]["image"] -{'image': , - 'label': 6, - 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], - [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], - [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], - ..., - [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], - [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], - [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], - - [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], - [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], - [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], - ..., - [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], - [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], - [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], - - [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], - [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], - [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], - ..., - [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], - [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], - [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} -``` - -Hier sehen Sie, wie das Bild nach der Vorverarbeitung aussieht. Wie von den angewandten Transformationen zu erwarten, wurde das Bild willkürlich beschnitten und seine Farbeigenschaften sind anders. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) - -## Multimodal - -Für multimodale Aufgaben werden Sie eine Kombination aus allem, was Sie bisher gelernt haben, verwenden und Ihre Fähigkeiten auf eine Aufgabe der automatischen Spracherkennung (ASR) anwenden. Dies bedeutet, dass Sie einen: - -* Feature Extractor zur Vorverarbeitung der Audiodaten. -* Tokenizer, um den Text zu verarbeiten. 
- -Kehren wir zum [LJ Speech](https://huggingface.co/datasets/lj_speech) Datensatz zurück: - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -Da Sie hauptsächlich an den Spalten "Audio" und "Text" interessiert sind, entfernen Sie die anderen Spalten: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Schauen Sie sich nun die Spalten "Audio" und "Text" an: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Erinnern Sie sich an den früheren Abschnitt über die Verarbeitung von Audiodaten: Sie sollten immer die Abtastrate Ihrer Audiodaten [resample](preprocessing#audio), damit sie mit der Abtastrate des Datensatzes übereinstimmt, der für das Vortraining eines Modells verwendet wird: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -### Prozessor - -Ein Processor kombiniert einen Feature-Extraktor und einen Tokenizer. Laden Sie einen Processor mit [`AutoProcessor.from_pretrained]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Erstellen Sie eine Funktion, die die Audiodaten zu `input_values` verarbeitet und den Text zu `labels` tokenisiert. Dies sind Ihre Eingaben für das Modell: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Wenden Sie die Funktion "prepare_dataset" auf ein Beispiel an: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -Beachten Sie, dass der Processor `input_values` und `labels` hinzugefügt hat. Auch die Abtastrate wurde korrekt auf 16kHz heruntergerechnet. - -Toll, Sie sollten jetzt in der Lage sein, Daten für jede Modalität vorzuverarbeiten und sogar verschiedene Modalitäten zu kombinieren! Im nächsten Kurs lernen Sie, wie Sie ein Modell mit Ihren neu aufbereiteten Daten feinabstimmen können. diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md new file mode 100644 index 000000000000..139869e5d1ee --- /dev/null +++ b/docs/source/de/quicktour.md @@ -0,0 +1,438 @@ + + +# Schnellstart + +[[open-in-colab]] + +Mit 🤗 Transformers können Sie sofort loslegen! Verwenden Sie die [`pipeline`] für schnelle Inferenz und laden Sie schnell ein vortrainiertes Modell und einen Tokenizer mit einer [AutoClass](./model_doc/auto), um Ihre Text-, Bild- oder Audioaufgabe zu lösen. + + + +Alle in der Dokumentation vorgestellten Codebeispiele haben oben links einen Umschalter für PyTorch und TensorFlow. Wenn +nicht, wird erwartet, dass der Code für beide Backends ohne Änderungen funktioniert. + + + +## Pipeline + +[`pipeline`] ist der einfachste Weg, ein vortrainiertes Modell für eine bestimmte Aufgabe zu verwenden. + + + +Die [`pipeline`] unterstützt viele gängige Aufgaben: + +**Text**: +* Stimmungsanalyse: Klassifizierung der Polarität eines gegebenen Textes. 
+* Textgenerierung (auf Englisch): Generierung von Text aus einer gegebenen Eingabe. +* Name-Entity-Recognition (NER): Kennzeichnung jedes Worts mit der Entität, die es repräsentiert (Person, Datum, Ort usw.). +* Beantwortung von Fragen: Extrahieren der Antwort aus dem Kontext, wenn ein gewisser Kontext und eine Frage gegeben sind. +* Fill-mask: Ausfüllen von Lücken in einem Text mit maskierten Wörtern. +* Zusammenfassung: Erstellung einer Zusammenfassung einer langen Text- oder Dokumentensequenz. +* Übersetzung: Übersetzen eines Textes in eine andere Sprache. +* Merkmalsextraktion: Erstellen einer Tensordarstellung des Textes. + +**Bild**: +* Bildklassifizierung: Klassifizierung eines Bildes. +* Bildsegmentierung: Klassifizierung jedes Pixels in einem Bild. +* Objekterkennung: Erkennen von Objekten innerhalb eines Bildes. + +**Audio**: +* Audioklassifizierung: Zuweisung eines Labels zu einem bestimmten Audiosegment. +* Automatische Spracherkennung (ASR): Transkription von Audiodaten in Text. + + + +Für mehr Details über die [`pipeline`] und assoziierte Aufgaben, schauen Sie in die Dokumentation [hier](./main_classes/pipelines). + + + +### Verwendung der Pipeline + +Im folgenden Beispiel werden Sie die [`pipeline`] für die Stimmungsanalyse verwenden. + +Installieren Sie die folgenden Abhängigkeiten, falls Sie dies nicht bereits getan haben: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +Importieren sie die [`pipeline`] und spezifizieren sie die Aufgabe, welche sie lösen möchten: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis") +``` + +Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell] (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungs-Analyse herunter und speichert sie. Jetzt können Sie den "Klassifikator" auf Ihren Zieltext anwenden: + +```py +>>> classifier("We are very happy to show you the 🤗 Transformers library.") +[{'label': 'POSITIVE', 'score': 0.9998}] +``` + +For more than one sentence, pass a list of sentences to the [`pipeline`] which returns a list of dictionaries: + +```py +>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) +>>> for result in results: +... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") +label: POSITIVE, with score: 0.9998 +label: NEGATIVE, with score: 0.5309 +``` + +Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek: + +```bash +pip install datasets +``` + +Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten. + +```py +>>> import torch +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +``` + +Als nächstes laden wir den Datensatz (siehe 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) für mehr Details) welches wir nutzen möchten. 
Zum Beispiel laden wir den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +``` + +Wir müssen sicherstellen, dass die Abtastrate des Datensatzes der Abtastrate entspricht, mit der `facebook/wav2vec2-base-960h` trainiert wurde. + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) +``` + +Audiodateien werden automatisch geladen und neu abgetastet, wenn die Spalte "audio" aufgerufen wird. +Extrahieren wir die rohen Wellenform-Arrays der ersten 4 Beispiele und übergeben wir sie als Liste an die Pipeline: + +```py +>>> result = speech_recognizer(dataset[:4]["audio"]) +>>> print([d["text"] for d in result]) +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] +``` + +Bei einem größeren Datensatz mit vielen Eingaben (wie bei Sprache oder Bildverarbeitung) sollten Sie einen Generator anstelle einer Liste übergeben, damit nicht alle Eingaben auf einmal in den Speicher geladen werden. Weitere Informationen finden Sie in der [Pipeline-Dokumentation](./main_classes/pipelines). + +### Ein anderes Modell und einen anderen Tokenizer in der Pipeline verwenden + +Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/models) verwenden, wodurch es einfach ist, die [`pipeline`] für andere Anwendungsfälle anzupassen. Wenn Sie beispielsweise ein Modell wünschen, das französischen Text verarbeiten kann, verwenden Sie die Tags im Model Hub, um nach einem geeigneten Modell zu filtern. Das oberste gefilterte Ergebnis liefert ein mehrsprachiges [BERT-Modell](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment), das auf die Stimmungsanalyse abgestimmt ist. Großartig, verwenden wir dieses Modell!
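Die Suche nach einem passenden Modell lässt sich auch programmatisch erledigen, zum Beispiel mit der Bibliothek `huggingface_hub`. Das Folgende ist nur eine Skizze; die genauen Parameternamen von `list_models` können je nach Version der Bibliothek abweichen:

```py
>>> # Skizze: Modelle auf dem Hub nach Aufgabe filtern und die ersten Treffer ausgeben
>>> # (Annahme: huggingface_hub ist installiert; Parameternamen können je nach Version variieren)
>>> from huggingface_hub import list_models

>>> for model in list_models(filter="text-classification", search="sentiment", limit=5):
...     print(model.modelId)
```

Bleiben wir hier aber bei dem oben gefundenen Checkpoint: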
+ +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below): + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + +Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below): + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + +Dann können Sie das Modell und den Tokenizer in der [`pipeline`] angeben und den `Klassifikator` auf Ihren Zieltext anwenden: + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein vortrainiertes Modell auf Ihren Daten feinabstimmen. Schauen Sie sich unser [Feinabstimmungs-Tutorial](./training) an, um zu erfahren, wie das geht. Und schließlich, nachdem Sie Ihr trainiertes Modell verfeinert haben, sollten Sie es mit der Community im Model Hub teilen (siehe Tutorial [hier](./model_sharing)), um NLP für alle zu demokratisieren! 🤗 + +## AutoClass + + + +Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. + +Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren. + +### AutoTokenizer + +Ein Tokenizer ist für die Vorverarbeitung von Text in ein für das Modell verständliches Format zuständig. Zunächst zerlegt der Tokenisierer den Text in Wörter, die *Token* genannt werden. Es gibt mehrere Regeln für den Tokenisierungsprozess, z. B. wie und auf welcher Ebene ein Wort aufgespalten wird (weitere Informationen über Tokenisierung [hier](./tokenizer_summary)). Das Wichtigste ist jedoch, dass Sie den Tokenizer mit demselben Modellnamen instanziieren müssen, um sicherzustellen, dass Sie dieselben Tokenisierungsregeln verwenden, mit denen ein Modell zuvor trainiert wurde. +Laden sie einen Tokenizer mit [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + +Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als Eingabe für das Modell zu konstruieren. Dieser wird als *Vokabular* des Modells bezeichnet. 
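Diese beiden Schritte lassen sich auch einzeln nachvollziehen: `tokenize` zerlegt den Text in Token, und `convert_tokens_to_ids` schlägt jedes Token im Vokabular nach. Eine kleine Skizze (welche Token und IDs dabei herauskommen, hängt vom Vokabular des jeweiligen Modells ab):

```py
>>> # Zuerst in Token zerlegen, dann jedes Token über das Vokabular auf seine ID abbilden
>>> tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library.")
>>> input_ids = tokenizer.convert_tokens_to_ids(tokens)
>>> len(tokens) == len(input_ids)
True
```

Der direkte Aufruf des Tokenizers erledigt beides in einem Schritt und ergänzt zusätzlich die Sondertoken des Modells.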
+ +Übergeben Sie Ihren Text an den Tokenizer: + +```py +>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") +>>> print(encoding) +{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält: + +* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token. +* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. + +Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben: + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + +Lesen Sie das Tutorial [preprocessing](./preprocessing) für weitere Details zur Tokenisierung. + +### AutoModel + + + +🤗 Transformers bietet eine einfache und einheitliche Möglichkeit, vortrainierte Instanzen zu laden. Das bedeutet, dass Sie ein [`AutoModel`] laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`AutoModel`] für die Aufgabe. Da Sie eine Text- oder Sequenzklassifizierung vornehmen, laden Sie [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. + + + +Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben. Sie müssen nur das Wörterbuch entpacken, indem Sie `**` hinzufügen: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers bietet eine einfache und einheitliche Methode zum Laden von vortrainierten Instanzen. Das bedeutet, dass Sie ein [`TFAutoModel`] genauso laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`TFAutoModel`] für die Aufgabe. 
Da Sie Text - oder Sequenz - Klassifizierung machen, laden Sie [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. + + + +Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +Alle 🤗 Transformers-Modelle (PyTorch oder TensorFlow) geben die Tensoren *vor* der endgültigen Aktivierungsfunktion +Funktion (wie Softmax) aus, da die endgültige Aktivierungsfunktion oft mit dem Verlusten verschmolzen ist. + + + +Modelle sind ein standardmäßiges [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) oder ein [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model), sodass Sie sie in Ihrer üblichen Trainingsschleife verwenden können. Um jedoch die Dinge einfacher zu machen, bietet 🤗 Transformers eine [`Trainer`]-Klasse für PyTorch, die Funktionalität für verteiltes Training, gemischte Präzision und mehr bietet. Für TensorFlow können Sie die Methode `fit` aus [Keras](https://keras.io/) verwenden. Siehe das [training tutorial](./training) für weitere Details. + + + +Transformers-Modellausgaben sind spezielle Datenklassen, so dass ihre Attribute in einer IDE automatisch vervollständigt werden. +Die Modellausgänge verhalten sich auch wie ein Tupel oder ein Wörterbuch (z.B. können Sie mit einem Integer, einem Slice oder einem String indexieren), wobei die Attribute, die "None" sind, ignoriert werden. + + + +### Modell speichern + + + +Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer speichern, indem Sie [`PreTrainedModel.save_pretrained`] verwenden: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Wenn Sie bereit sind, das Modell erneut zu verwenden, laden Sie es mit [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer unter Verwendung von [`TFPreTrainedModel.save_pretrained`] speichern: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Wenn Sie bereit sind, das Modell wieder zu verwenden, laden Sie es mit [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell zu speichern und es entweder als PyTorch- oder TensorFlow-Modell wieder zu laden. 
Der Parameter "from_pt" oder "from_tf" kann das Modell von einem Framework in das andere konvertieren: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + + +## Custom model builds + +Sie können die Konfigurationsklasse des Modells ändern, um zu bestimmen, wie ein Modell aufgebaut ist. Die Konfiguration legt die Attribute eines Modells fest, z. B. die Anzahl der verborgenen Schichten oder der Aufmerksamkeitsköpfe. Wenn Sie ein Modell aus einer benutzerdefinierten Konfigurationsklasse initialisieren, beginnen Sie bei Null. Die Modellattribute werden zufällig initialisiert, und Sie müssen das Modell trainieren, bevor Sie es verwenden können, um aussagekräftige Ergebnisse zu erhalten. + +Beginnen Sie mit dem Import von [`AutoConfig`] und laden Sie dann das trainierte Modell, das Sie ändern möchten. Innerhalb von [`AutoConfig.from_pretrained`] können Sie das Attribut angeben, das Sie ändern möchten, z. B. die Anzahl der Aufmerksamkeitsköpfe: + +```py +>>> from transformers import AutoConfig + +>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +``` + + + +Create a model from your custom configuration with [`AutoModel.from_config`]: + +```py +>>> from transformers import AutoModel + +>>> my_model = AutoModel.from_config(my_config) +``` + + +Create a model from your custom configuration with [`TFAutoModel.from_config`]: + +```py +>>> from transformers import TFAutoModel + +>>> my_model = TFAutoModel.from_config(my_config) +``` + + + +Weitere Informationen zur Erstellung von benutzerdefinierten Konfigurationen finden Sie in der Anleitung [Erstellen einer benutzerdefinierten Architektur](./create_a_model). + +## Wie geht es weiter? + +Nachdem Sie nun die 🤗 Transformers-Kurztour abgeschlossen haben, schauen Sie sich unsere Anleitungen an und erfahren Sie, wie Sie spezifischere Dinge tun können, wie das Schreiben eines benutzerdefinierten Modells, die Feinabstimmung eines Modells für eine Aufgabe und wie man ein Modell mit einem Skript trainiert. Wenn Sie mehr über die Kernkonzepte von 🤗 Transformers erfahren möchten, nehmen Sie sich eine Tasse Kaffee und werfen Sie einen Blick auf unsere konzeptionellen Leitfäden! diff --git a/docs/source/de/quicktour.mdx b/docs/source/de/quicktour.mdx deleted file mode 100644 index 4c668bf419b1..000000000000 --- a/docs/source/de/quicktour.mdx +++ /dev/null @@ -1,428 +0,0 @@ - - -# Schnellstart - -[[open-in-colab]] - -Mit 🤗 Transformers können Sie sofort loslegen! Verwenden Sie die [`pipeline`] für schnelle Inferenz und laden Sie schnell ein vortrainiertes Modell und einen Tokenizer mit einer [AutoClass](./model_doc/auto), um Ihre Text-, Bild- oder Audioaufgabe zu lösen. - - - -Alle in der Dokumentation vorgestellten Codebeispiele haben oben links einen Umschalter für PyTorch und TensorFlow. Wenn -nicht, wird erwartet, dass der Code für beide Backends ohne Änderungen funktioniert. - - - -## Pipeline - -[`pipeline`] ist der einfachste Weg, ein vortrainiertes Modell für eine bestimmte Aufgabe zu verwenden. 
- - - -Die [`pipeline`] unterstützt viele gängige Aufgaben: - -**Text**: -* Stimmungsanalyse: Klassifizierung der Polarität eines gegebenen Textes. -* Textgenerierung (auf Englisch): Generierung von Text aus einer gegebenen Eingabe. -* Name-Entity-Recognition (NER): Kennzeichnung jedes Worts mit der Entität, die es repräsentiert (Person, Datum, Ort usw.). -* Beantwortung von Fragen: Extrahieren der Antwort aus dem Kontext, wenn ein gewisser Kontext und eine Frage gegeben sind. -* Fill-mask: Ausfüllen von Lücken in einem Text mit maskierten Wörtern. -* Zusammenfassung: Erstellung einer Zusammenfassung einer langen Text- oder Dokumentensequenz. -* Übersetzung: Übersetzen eines Textes in eine andere Sprache. -* Merkmalsextraktion: Erstellen einer Tensordarstellung des Textes. - -**Bild**: -* Bildklassifizierung: Klassifizierung eines Bildes. -* Bildsegmentierung: Klassifizierung jedes Pixels in einem Bild. -* Objekterkennung: Erkennen von Objekten innerhalb eines Bildes. - -**Audio**: -* Audioklassifizierung: Zuweisung eines Labels zu einem bestimmten Audiosegment. -* Automatische Spracherkennung (ASR): Transkription von Audiodaten in Text. - - - -Für mehr Details über die [`pipeline`] und assoziierte Aufgaben, schauen Sie in die Dokumentation [hier](./main_classes/pipelines). - - - -### Verwendung der Pipeline - -Im folgenden Beispiel werden Sie die [`pipeline`] für die Stimmungsanalyse verwenden. - -Installieren Sie die folgenden Abhängigkeiten, falls Sie dies nicht bereits getan haben: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -Importieren sie die [`pipeline`] und spezifizieren sie die Aufgabe, welche sie lösen möchten: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis") -``` - -Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell] (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungs-Analyse herunter und speichert sie. Jetzt können Sie den "Klassifikator" auf Ihren Zieltext anwenden: - -```py ->>> classifier("We are very happy to show you the 🤗 Transformers library.") -[{'label': 'POSITIVE', 'score': 0.9998}] -``` - -For more than one sentence, pass a list of sentences to the [`pipeline`] which returns a list of dictionaries: - -```py ->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) ->>> for result in results: -... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9998 -label: NEGATIVE, with score: 0.5309 -``` - -Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek: - -```bash -pip install datasets -``` - -Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten. - -```py ->>> import torch ->>> from transformers import pipeline - ->>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") -``` - -Als nächstes laden wir den Datensatz (siehe 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) für mehr Details) welches wir nutzen möchten. 
Zum Beispiel laden wir den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT -``` - -Wir müssen sicherstellen, dass die Abtastrate des Datensatzes der Abtastrate entspricht, mit der `facebook/wav2vec2-base-960h` trainiert wurde. - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) -``` - -Audiodateien werden automatisch geladen und neu abgetastet, wenn die Spalte "audio" aufgerufen wird. -Extrahieren wir die rohen Wellenform-Arrays der ersten 4 Beispiele und übergeben wir sie als Liste an die Pipeline: - -```py ->>> result = speech_recognizer(dataset[:4]["audio"]) ->>> print([d["text"] for d in result]) -['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] -``` - -Bei einem größeren Datensatz mit vielen Eingaben (wie bei Sprache oder Bildverarbeitung) sollten Sie einen Generator anstelle einer Liste übergeben, der alle Eingaben in den Speicher lädt. Weitere Informationen finden Sie in der [Pipeline-Dokumentation](./main_classes/pipelines). - -### Ein anderes Modell und einen anderen Tokenizer in der Pipeline verwenden - -Die [`pipeline`] kann jedes Modell aus dem [Model Hub] (https://huggingface.co/models) verwenden, wodurch es einfach ist, die [`pipeline`] für andere Anwendungsfälle anzupassen. Wenn Sie beispielsweise ein Modell wünschen, das französischen Text verarbeiten kann, verwenden Sie die Tags im Model Hub, um nach einem geeigneten Modell zu filtern. Das oberste gefilterte Ergebnis liefert ein mehrsprachiges [BERT-Modell](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment), das auf die Stimmungsanalyse abgestimmt ist. Großartig, verwenden wir dieses Modell! 
- -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - -Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - -Dann können Sie das Modell und den Tokenizer in der [`pipeline`] angeben und den `Klassifikator` auf Ihren Zieltext anwenden: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein vortrainiertes Modell auf Ihren Daten feinabstimmen. Schauen Sie sich unser [Feinabstimmungs-Tutorial](./training) an, um zu erfahren, wie das geht. Und schließlich, nachdem Sie Ihr trainiertes Modell verfeinert haben, sollten Sie es mit der Community im Model Hub teilen (siehe Tutorial [hier](./model_sharing)), um NLP für alle zu demokratisieren! 🤗 - -## AutoClass - - - -Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. - -Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren. - -### AutoTokenizer - -Ein Tokenizer ist für die Vorverarbeitung von Text in ein für das Modell verständliches Format zuständig. Zunächst zerlegt der Tokenisierer den Text in Wörter, die *Token* genannt werden. Es gibt mehrere Regeln für den Tokenisierungsprozess, z. B. wie und auf welcher Ebene ein Wort aufgespalten wird (weitere Informationen über Tokenisierung [hier](./tokenizer_summary)). Das Wichtigste ist jedoch, dass Sie den Tokenizer mit demselben Modellnamen instanziieren müssen, um sicherzustellen, dass Sie dieselben Tokenisierungsregeln verwenden, mit denen ein Modell zuvor trainiert wurde. -Laden sie einen Tokenizer mit [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - -Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als Eingabe für das Modell zu konstruieren. Dieser wird als *Vokabular* des Modells bezeichnet. 
- -Übergeben Sie Ihren Text an den Tokenizer: - -```py ->>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") ->>> print(encoding) -{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält: - -* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token. -* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. - -Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben: - - - -```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - -Lesen Sie das Tutorial [preprocessing](./preprocessing) für weitere Details zur Tokenisierung. - -### AutoModel - - - -🤗 Transformers bietet eine einfache und einheitliche Möglichkeit, vortrainierte Instanzen zu laden. Das bedeutet, dass Sie ein [`AutoModel`] laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`AutoModel`] für die Aufgabe. Da Sie eine Text- oder Sequenzklassifizierung vornehmen, laden Sie [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. - - - -Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben. Sie müssen nur das Wörterbuch entpacken, indem Sie `**` hinzufügen: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers bietet eine einfache und einheitliche Methode zum Laden von vortrainierten Instanzen. Das bedeutet, dass Sie ein [`TFAutoModel`] genauso laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`TFAutoModel`] für die Aufgabe. 
Da Sie Text - oder Sequenz - Klassifizierung machen, laden Sie [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klasse für welche Aufgabe zu verwenden ist. - - - -Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: - -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT -``` - - - - - -Alle 🤗 Transformers-Modelle (PyTorch oder TensorFlow) geben die Tensoren *vor* der endgültigen Aktivierungsfunktion -Funktion (wie Softmax) aus, da die endgültige Aktivierungsfunktion oft mit dem Verlusten verschmolzen ist. - - - -Modelle sind ein standardmäßiges [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) oder ein [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model), sodass Sie sie in Ihrer üblichen Trainingsschleife verwenden können. Um jedoch die Dinge einfacher zu machen, bietet 🤗 Transformers eine [`Trainer`]-Klasse für PyTorch, die Funktionalität für verteiltes Training, gemischte Präzision und mehr bietet. Für TensorFlow können Sie die Methode `fit` aus [Keras](https://keras.io/) verwenden. Siehe das [training tutorial](./training) für weitere Details. - - - -Transformers-Modellausgaben sind spezielle Datenklassen, so dass ihre Attribute in einer IDE automatisch vervollständigt werden. -Die Modellausgänge verhalten sich auch wie ein Tupel oder ein Wörterbuch (z.B. können Sie mit einem Integer, einem Slice oder einem String indexieren), wobei die Attribute, die "None" sind, ignoriert werden. - - - -### Modell speichern - - - -Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer speichern, indem Sie [`PreTrainedModel.save_pretrained`] verwenden: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -Wenn Sie bereit sind, das Modell erneut zu verwenden, laden Sie es mit [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer unter Verwendung von [`TFPreTrainedModel.save_pretrained`] speichern: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -Wenn Sie bereit sind, das Modell wieder zu verwenden, laden Sie es mit [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell zu speichern und es entweder als PyTorch- oder TensorFlow-Modell wieder zu laden. 
Der Parameter "from_pt" oder "from_tf" kann das Modell von einem Framework in das andere konvertieren: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - - -## Custom model builds - -Sie können die Konfigurationsklasse des Modells ändern, um zu bestimmen, wie ein Modell aufgebaut ist. Die Konfiguration legt die Attribute eines Modells fest, z. B. die Anzahl der verborgenen Schichten oder der Aufmerksamkeitsköpfe. Wenn Sie ein Modell aus einer benutzerdefinierten Konfigurationsklasse initialisieren, beginnen Sie bei Null. Die Modellattribute werden zufällig initialisiert, und Sie müssen das Modell trainieren, bevor Sie es verwenden können, um aussagekräftige Ergebnisse zu erhalten. - -Beginnen Sie mit dem Import von [`AutoConfig`] und laden Sie dann das trainierte Modell, das Sie ändern möchten. Innerhalb von [`AutoConfig.from_pretrained`] können Sie das Attribut angeben, das Sie ändern möchten, z. B. die Anzahl der Aufmerksamkeitsköpfe: - -```py ->>> from transformers import AutoConfig - ->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) -``` - - - -Create a model from your custom configuration with [`AutoModel.from_config`]: - -```py ->>> from transformers import AutoModel - ->>> my_model = AutoModel.from_config(my_config) -``` - - -Create a model from your custom configuration with [`TFAutoModel.from_config`]: - -```py ->>> from transformers import TFAutoModel - ->>> my_model = TFAutoModel.from_config(my_config) -``` - - - -Weitere Informationen zur Erstellung von benutzerdefinierten Konfigurationen finden Sie in der Anleitung [Erstellen einer benutzerdefinierten Architektur](./create_a_model). - -## Wie geht es weiter? - -Nachdem Sie nun die 🤗 Transformers-Kurztour abgeschlossen haben, schauen Sie sich unsere Anleitungen an und erfahren Sie, wie Sie spezifischere Dinge tun können, wie das Schreiben eines benutzerdefinierten Modells, die Feinabstimmung eines Modells für eine Aufgabe und wie man ein Modell mit einem Skript trainiert. Wenn Sie mehr über die Kernkonzepte von 🤗 Transformers erfahren möchten, nehmen Sie sich eine Tasse Kaffee und werfen Sie einen Blick auf unsere konzeptionellen Leitfäden! diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md new file mode 100644 index 000000000000..2902d4c08414 --- /dev/null +++ b/docs/source/de/run_scripts.md @@ -0,0 +1,351 @@ + + +# Trainieren mit einem Skript + +Neben den 🤗 Transformers [notebooks](./noteboks/README) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. + +Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. 
Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. + +Es wird nicht erwartet, dass die Beispielskripte bei jedem Problem sofort funktionieren. Möglicherweise müssen Sie das Skript an das Problem anpassen, das Sie zu lösen versuchen. Um Ihnen dabei zu helfen, legen die meisten Skripte vollständig offen, wie die Daten vorverarbeitet werden, so dass Sie sie nach Bedarf für Ihren Anwendungsfall bearbeiten können. + +Für jede Funktion, die Sie in einem Beispielskript implementieren möchten, diskutieren Sie bitte im [Forum] (https://discuss.huggingface.co/) oder in einem [issue] (https://github.com/huggingface/transformers/issues), bevor Sie einen Pull Request einreichen. Wir freuen uns zwar über Fehlerkorrekturen, aber es ist unwahrscheinlich, dass wir einen Pull Request zusammenführen, der mehr Funktionalität auf Kosten der Lesbarkeit hinzufügt. + +Diese Anleitung zeigt Ihnen, wie Sie ein Beispiel für ein Trainingsskript zur Zusammenfassung in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) und [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) ausführen können. Sofern nicht anders angegeben, sollten alle Beispiele mit beiden Frameworks funktionieren. + +## Einrichtung + +Um die neueste Version der Beispielskripte erfolgreich auszuführen, **müssen Sie 🤗 Transformers aus dem Quellcode** in einer neuen virtuellen Umgebung installieren: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` + +Für ältere Versionen der Beispielskripte klicken Sie auf die Umschalttaste unten: + +
+	Beispiele für ältere Versionen von 🤗 Transformers
+
+
+ +Dann stellen Sie Ihren aktuellen Klon von 🤗 Transformers auf eine bestimmte Version um, z.B. v3.5.1: + +```bash +git checkout tags/v3.5.1 +``` + +Nachdem Sie die richtige Bibliotheksversion eingerichtet haben, navigieren Sie zu dem Beispielordner Ihrer Wahl und installieren die beispielspezifischen Anforderungen: + +```bash +pip install -r requirements.txt +``` + +## Ein Skript ausführen + + + +Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Dann nimmt das Skript eine Feinabstimmung eines Datensatzes mit dem [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) auf einer Architektur vor, die eine Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem Datensatz [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Anschließend nimmt das Skript die Feinabstimmung eines Datensatzes mit Keras auf einer Architektur vor, die die Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) Datensatz durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Verteiltes Training und gemischte Präzision + +Der [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) unterstützt verteiltes Training und gemischte Präzision, d.h. Sie können ihn auch in einem Skript verwenden. So aktivieren Sie diese beiden Funktionen: + +- Fügen Sie das Argument `fp16` hinzu, um gemischte Genauigkeit zu aktivieren. +- Legen Sie die Anzahl der zu verwendenden GPUs mit dem Argument `nproc_per_node` fest. 
+ +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +TensorFlow-Skripte verwenden eine [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) für verteiltes Training, und Sie müssen dem Trainingsskript keine zusätzlichen Argumente hinzufügen. Das TensorFlow-Skript verwendet standardmäßig mehrere GPUs, wenn diese verfügbar sind. + +## Ein Skript auf einer TPU ausführen + + + +Tensor Processing Units (TPUs) sind speziell für die Beschleunigung der Leistung konzipiert. PyTorch unterstützt TPUs mit dem [XLA](https://www.tensorflow.org/xla) Deep Learning Compiler (siehe [hier](https://github.com/pytorch/xla/blob/master/README.md) für weitere Details). Um eine TPU zu verwenden, starten Sie das Skript `xla_spawn.py` und verwenden das Argument `num_cores`, um die Anzahl der TPU-Kerne festzulegen, die Sie verwenden möchten. + +```bash +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Tensor Processing Units (TPUs) sind speziell für die Beschleunigung der Leistung konzipiert. TensorFlow Skripte verwenden eine [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) für das Training auf TPUs. Um eine TPU zu verwenden, übergeben Sie den Namen der TPU-Ressource an das Argument `tpu`. + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Führen Sie ein Skript mit 🤗 Accelerate aus. + +🤗 [Accelerate](https://huggingface.co/docs/accelerate) ist eine reine PyTorch-Bibliothek, die eine einheitliche Methode für das Training eines Modells auf verschiedenen Arten von Setups (nur CPU, mehrere GPUs, TPUs) bietet und dabei die vollständige Transparenz der PyTorch-Trainingsschleife beibehält. Stellen Sie sicher, dass Sie 🤗 Accelerate installiert haben, wenn Sie es nicht bereits haben: + +> Hinweis: Da Accelerate schnell weiterentwickelt wird, muss die Git-Version von Accelerate installiert sein, um die Skripte auszuführen. +```bash +pip install git+https://github.com/huggingface/accelerate +``` + +Anstelle des Skripts `run_summarization.py` müssen Sie das Skript `run_summarization_no_trainer.py` verwenden. Die von Accelerate unterstützten Skripte haben eine Datei `task_no_trainer.py` im Ordner. 
Beginnen Sie mit dem folgenden Befehl, um eine Konfigurationsdatei zu erstellen und zu speichern: + +```bash +accelerate config +``` + +Testen Sie Ihre Einrichtung, um sicherzustellen, dass sie korrekt konfiguriert ist: + +```bash +accelerate test +``` + +Jetzt sind Sie bereit, das Training zu starten: + +```bash +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +``` + +## Verwenden Sie einen benutzerdefinierten Datensatz + +Das Verdichtungsskript unterstützt benutzerdefinierte Datensätze, solange es sich um eine CSV- oder JSON-Line-Datei handelt. Wenn Sie Ihren eigenen Datensatz verwenden, müssen Sie mehrere zusätzliche Argumente angeben: + +- `train_file` und `validation_file` geben den Pfad zu Ihren Trainings- und Validierungsdateien an. +- text_column` ist der Eingabetext, der zusammengefasst werden soll. +- Summary_column" ist der auszugebende Zieltext. + +Ein Zusammenfassungsskript, das einen benutzerdefinierten Datensatz verwendet, würde wie folgt aussehen: + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +``` + +## Testen Sie ein Skript + +Es ist oft eine gute Idee, Ihr Skript an einer kleineren Anzahl von Beispielen für Datensätze auszuführen, um sicherzustellen, dass alles wie erwartet funktioniert, bevor Sie sich auf einen ganzen Datensatz festlegen, dessen Fertigstellung Stunden dauern kann. Verwenden Sie die folgenden Argumente, um den Datensatz auf eine maximale Anzahl von Stichproben zu beschränken: + +- `max_train_samples` +- `max_eval_samples` +- `max_predict_samples` + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Nicht alle Beispielskripte unterstützen das Argument `max_predict_samples`. Wenn Sie sich nicht sicher sind, ob Ihr Skript dieses Argument unterstützt, fügen Sie das Argument `-h` hinzu, um dies zu überprüfen: + +```bash +examples/pytorch/summarization/run_summarization.py -h +``` + +## Training vom Kontrollpunkt fortsetzen + +Eine weitere hilfreiche Option, die Sie aktivieren können, ist die Wiederaufnahme des Trainings von einem früheren Kontrollpunkt aus. Auf diese Weise können Sie im Falle einer Unterbrechung Ihres Trainings dort weitermachen, wo Sie aufgehört haben, ohne von vorne beginnen zu müssen. Es gibt zwei Methoden, um das Training von einem Kontrollpunkt aus wieder aufzunehmen. + +Die erste Methode verwendet das Argument `output_dir previous_output_dir`, um das Training ab dem letzten in `output_dir` gespeicherten Kontrollpunkt wieder aufzunehmen. 
In diesem Fall sollten Sie `overwrite_output_dir` entfernen: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +``` + +Die zweite Methode verwendet das Argument `Resume_from_checkpoint path_to_specific_checkpoint`, um das Training ab einem bestimmten Checkpoint-Ordner wieder aufzunehmen. + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +``` + +## Teilen Sie Ihr Modell + +Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen: + +```bash +huggingface-cli login +``` + +Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt. + +Wenn Sie Ihrem Repository einen bestimmten Namen geben möchten, fügen Sie ihn mit dem Argument `push_to_hub_model_id` hinzu. Das Repository wird automatisch unter Ihrem Namensraum aufgeführt. + +Das folgende Beispiel zeigt, wie Sie ein Modell mit einem bestimmten Repository-Namen hochladen können: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` \ No newline at end of file diff --git a/docs/source/de/testing.md b/docs/source/de/testing.md new file mode 100644 index 000000000000..e921484fa2f6 --- /dev/null +++ b/docs/source/de/testing.md @@ -0,0 +1,1293 @@ + + +# Testen + + +Werfen wir einen Blick darauf, wie 🤗 Transformers-Modelle getestet werden und wie Sie neue Tests schreiben und die vorhandenen verbessern können. + +Es gibt 2 Testsuiten im Repository: + +1. `tests` -- Tests für die allgemeine API +2. `examples` -- Tests hauptsächlich für verschiedene Anwendungen, die nicht Teil der API sind + +## Wie Transformatoren getestet werden + +1. Sobald ein PR eingereicht wurde, wird er mit 9 CircleCi Jobs getestet. Jeder neue Commit zu diesem PR wird erneut getestet. Diese Aufträge + sind in dieser [Konfigurationsdatei](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml) definiert, so dass Sie bei Bedarf die gleiche Umgebung auf Ihrem Rechner reproduzieren können. + Umgebung auf Ihrem Rechner reproduzieren können. + + Diese CI-Jobs führen keine `@slow`-Tests durch. + +2. 
Es gibt 3 Jobs, die von [github actions](https://github.com/huggingface/transformers/actions) ausgeführt werden: + + - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): prüft, ob die torch hub + Integration funktioniert. + + - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): führt schnelle Tests auf der GPU nur bei Commits auf + `main`. Es wird nur ausgeführt, wenn ein Commit auf `main` den Code in einem der folgenden Ordner aktualisiert hat: `src`, + `tests`, `.github` (um zu verhindern, dass er auf hinzugefügten Modellkarten, Notebooks usw. läuft) + + - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): führt normale und langsame Tests auf GPU in + `tests` und `examples`: + +```bash +RUN_SLOW=1 pytest tests/ +RUN_SLOW=1 pytest examples/ +``` + + Die Ergebnisse können Sie [hier](https://github.com/huggingface/transformers/actions) sehen. + + + +## Tests ausführen + + + + + +### Auswahl der auszuführenden Tests + +In diesem Dokument wird ausführlich erläutert, wie Tests ausgeführt werden können. Wenn Sie nach der Lektüre noch mehr Details benötigen +finden Sie diese [hier](https://docs.pytest.org/en/latest/usage.html). + +Hier sind einige der nützlichsten Möglichkeiten, Tests auszuführen. + +Alle ausführen: + +```console +pytest +``` + +oder: + +```bash +make test +``` + +Beachten Sie, dass Letzteres wie folgt definiert ist: + +```bash +python -m pytest -n auto --dist=loadfile -s -v ./tests/ +``` + +was pytest anweist: + +- so viele Testprozesse laufen zu lassen, wie es CPU-Kerne gibt (was zu viele sein könnten, wenn Sie nicht über eine Menge RAM verfügen!) +- sicherzustellen, dass alle Tests aus derselben Datei von demselben Testprozess ausgeführt werden +- Erfassen Sie keine Ausgaben +- im ausführlichen Modus laufen lassen + + + +### Abrufen der Liste aller Tests + +Alle Tests der Testsuite: + +```bash +pytest --collect-only -q +``` + +Alle Tests einer bestimmten Testdatei: + +```bash +pytest tests/test_optimization.py --collect-only -q +``` + +### Führen Sie ein bestimmtes Testmodul aus + +Um ein einzelnes Testmodul auszuführen: + +```bash +pytest tests/utils/test_logging.py +``` + +### Spezifische Tests ausführen + +Da unittest in den meisten Tests verwendet wird, müssen Sie, um bestimmte Untertests auszuführen, den Namen der unittest +Klasse, die diese Tests enthält. Er könnte zum Beispiel lauten: + +```bash +pytest tests/test_optimization.py::OptimizationTest::test_adam_w +``` + +Hier: + +- `tests/test_optimization.py` - die Datei mit den Tests +- `OptimizationTest` - der Name der Klasse +- `test_adam_w` - der Name der spezifischen Testfunktion + +Wenn die Datei mehrere Klassen enthält, können Sie auswählen, dass nur die Tests einer bestimmten Klasse ausgeführt werden sollen. Zum Beispiel: + +```bash +pytest tests/test_optimization.py::OptimizationTest +``` + +führt alle Tests innerhalb dieser Klasse aus. + +Wie bereits erwähnt, können Sie sehen, welche Tests in der Klasse `OptimizationTest` enthalten sind, indem Sie sie ausführen: + +```bash +pytest tests/test_optimization.py::OptimizationTest --collect-only -q +``` + +Sie können Tests mit Hilfe von Schlüsselwortausdrücken ausführen. 
+ +Um nur Tests auszuführen, deren Name `adam` enthält: + +```bash +pytest -k adam tests/test_optimization.py +``` + +Die logischen `und` und `oder` können verwendet werden, um anzugeben, ob alle Schlüsselwörter übereinstimmen sollen oder nur eines. `nicht` kann verwendet werden, um +negieren. + +Um alle Tests auszuführen, außer denen, deren Name `adam` enthält: + +```bash +pytest -k "not adam" tests/test_optimization.py +``` + +Und Sie können die beiden Muster in einem kombinieren: + +```bash +pytest -k "ada and not adam" tests/test_optimization.py +``` + +Um zum Beispiel sowohl `test_adafactor` als auch `test_adam_w` auszuführen, können Sie verwenden: + +```bash +pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py +``` + +Beachten Sie, dass wir hier `oder` verwenden, da wir wollen, dass eines der Schlüsselwörter übereinstimmt, um beide einzuschließen. + +Wenn Sie nur Tests einschließen möchten, die beide Muster enthalten, müssen Sie `und` verwenden: + +```bash +pytest -k "test and ada" tests/test_optimization.py +``` + +### Führen Sie `accelerate` Tests durch + +Manchmal müssen Sie `accelerate` Tests für Ihre Modelle ausführen. Dazu fügen Sie einfach `-m accelerate_tests` zu Ihrem Befehl hinzu, wenn Sie diese Tests bei einem `OPT`-Lauf ausführen möchten: +```bash +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +``` + + +### Dokumentationstests ausführen + +Um zu testen, ob die Dokumentationsbeispiele korrekt sind, sollten Sie überprüfen, ob die `doctests` erfolgreich sind. +Lassen Sie uns als Beispiel den docstring von [WhisperModel.forward](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035) verwenden: + +```python +r""" +Returns: + +Example: + ```python + >>> import torch + >>> from transformers import WhisperModel, WhisperFeatureExtractor + >>> from datasets import load_dataset + + >>> model = WhisperModel.from_pretrained("openai/whisper-base") + >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") + >>> input_features = inputs.input_features + >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id + >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state + >>> list(last_hidden_state.shape) + [1, 2, 512] + ```""" + +``` + +Führen Sie einfach die folgende Zeile aus, um automatisch jedes docstring-Beispiel in der gewünschten Datei zu testen: +```bash +pytest --doctest-modules +``` +Wenn die Datei eine Markdown-Erweiterung hat, sollten Sie das Argument `--doctest-glob="*.md"` hinzufügen. + +### Nur geänderte Tests ausführen + +Mit [pytest-picked](https://github.com/anapaulagomes/pytest-picked) können Sie die Tests ausführen, die sich auf die unstaged Dateien oder den aktuellen Zweig (gemäß Git) beziehen. Auf diese Weise können Sie schnell testen, ob Ihre Änderungen nichts kaputt gemacht haben. +nichts kaputt gemacht haben, da die Tests für Dateien, die Sie nicht verändert haben, nicht ausgeführt werden. + +```bash +pip install pytest-picked +``` + +```bash +pytest --picked +``` + +Alle Tests werden von Dateien und Ordnern ausgeführt, die geändert, aber noch nicht übergeben wurden. 
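+
+Falls Ihre Änderungen bereits committet sind und in einem eigenen Zweig liegen, können Sie stattdessen alle Tests ausführen, die sich auf die gegenüber dem Hauptzweig geänderten Dateien beziehen. Eine minimale Skizze, unter der Annahme, dass Ihre installierte Version von `pytest-picked` die Option `--mode` unterstützt:
+
+```bash
+pytest --picked --mode=branch
+```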
+ +### Fehlgeschlagene Tests bei Änderung der Quelle automatisch wiederholen + +[pytest-xdist](https://github.com/pytest-dev/pytest-xdist) bietet eine sehr nützliche Funktion zur Erkennung aller fehlgeschlagenen +Tests zu erkennen und dann darauf zu warten, dass Sie Dateien ändern, um die fehlgeschlagenen Tests so lange zu wiederholen, bis sie erfolgreich sind, während Sie die +sie reparieren. So müssen Sie pytest nicht erneut starten, nachdem Sie die Korrektur vorgenommen haben. Dies wird so lange wiederholt, bis alle Tests bestanden sind. +Danach wird erneut ein vollständiger Durchlauf durchgeführt. + +```bash +pip install pytest-xdist +``` + +So rufen Sie den Modus auf: `pytest -f` oder `pytest --looponfail` + +Datei-Änderungen werden erkannt, indem die Wurzelverzeichnisse von `looponfailroots` und alle ihre Inhalte (rekursiv) untersucht werden. +Wenn die Vorgabe für diesen Wert für Sie nicht funktioniert, können Sie ihn in Ihrem Projekt ändern, indem Sie eine Konfigurations +Option in der Datei `setup.cfg` ändern: + +```ini +[tool:pytest] +looponfailroots = transformers tests +``` + +oder die Dateien `pytest.ini`/`tox.ini``: + +```ini +[pytest] +looponfailroots = transformers tests +``` + +Dies würde dazu führen, dass nur nach Dateiänderungen in den jeweiligen Verzeichnissen gesucht wird, die relativ zum Verzeichnis der ini-Datei angegeben sind. +Verzeichnis. + +[pytest-watch](https://github.com/joeyespo/pytest-watch) ist eine alternative Implementierung dieser Funktionalität. + + +### Überspringen eines Testmoduls + +Wenn Sie alle Testmodule ausführen möchten, mit Ausnahme einiger weniger, können Sie diese ausschließen, indem Sie eine explizite Liste der auszuführenden Tests angeben. Für +Beispiel: Um alle Tests außer `test_modeling_*.py` auszuführen: + +```bash +pytest *ls -1 tests/*py | grep -v test_modeling* +``` + +### Status leeren + +CI-Builds und wenn Isolation wichtig ist (gegen Geschwindigkeit), sollte der Cache geleert werden: + +```bash +pytest --cache-clear tests +``` + +### Tests parallel ausführen + +Wie bereits erwähnt, führt `make test` über das Plugin `pytest-xdist` Tests parallel aus (Argument `-n X`, z.B. `-n 2` +um 2 Jobs parallel laufen zu lassen). + +Mit der Option `--dist=` von `pytest-xdist` können Sie steuern, wie die Tests gruppiert werden. Mit `--dist=loadfile` werden die +Tests, die sich in einer Datei befinden, in denselben Prozess. + +Da die Reihenfolge der ausgeführten Tests unterschiedlich und nicht vorhersehbar ist, kann die Ausführung der Testsuite mit `pytest-xdist` +zu Fehlern führt (was bedeutet, dass wir einige unentdeckte gekoppelte Tests haben), verwenden Sie [pytest-replay](https://github.com/ESSS/pytest-replay), um die Tests in der gleichen Reihenfolge abzuspielen, was dabei helfen sollte +diese fehlgeschlagene Sequenz auf ein Minimum zu reduzieren. + +### Testreihenfolge und Wiederholung + +Es ist gut, die Tests mehrmals zu wiederholen, nacheinander, zufällig oder in Gruppen, um mögliche +Abhängigkeiten und zustandsbezogene Fehler zu erkennen (Abriss). Und die einfache, mehrfache Wiederholung ist einfach gut, um +einige Probleme zu erkennen, die durch die Zufälligkeit von DL aufgedeckt werden. 
+ + +#### Wiederholungstests + +- [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): + +```bash +pip install pytest-flakefinder +``` + +Und führen Sie dann jeden Test mehrmals durch (standardmäßig 50): + +```bash +pytest --flake-finder --flake-runs=5 tests/test_failing_test.py +``` + + + +Dieses Plugin funktioniert nicht mit dem `-n` Flag von `pytest-xdist`. + + + + + +Es gibt noch ein anderes Plugin `pytest-repeat`, aber es funktioniert nicht mit `unittest`. + + + +#### Run tests in a random order + +```bash +pip install pytest-random-order +``` + +Wichtig: Das Vorhandensein von `pytest-random-order` sorgt für eine automatische Zufallsanordnung der Tests, es sind keine Konfigurationsänderungen oder +Befehlszeilenoptionen sind nicht erforderlich. + +Wie bereits erläutert, ermöglicht dies die Erkennung von gekoppelten Tests - bei denen der Zustand eines Tests den Zustand eines anderen beeinflusst. Wenn +`pytest-random-order` installiert ist, gibt es den Zufallswert aus, der für diese Sitzung verwendet wurde, z.B: + +```bash +pytest tests +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +Wenn eine bestimmte Sequenz fehlschlägt, können Sie sie reproduzieren, indem Sie genau diesen Seed hinzufügen, z.B: + +```bash +pytest --random-order-seed=573663 +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +Es wird nur dann die exakte Reihenfolge reproduzieren, wenn Sie genau dieselbe Liste von Tests (oder gar keine Liste) verwenden. Sobald Sie beginnen, die Liste +die Liste manuell einzugrenzen, können Sie sich nicht mehr auf den Seed verlassen, sondern müssen die Tests manuell in der genauen Reihenfolge auflisten +auflisten und pytest anweisen, sie nicht zu randomisieren, indem Sie `--random-order-bucket=none` verwenden, z.B.: + +```bash +pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py +``` + +So deaktivieren Sie das Shuffling für alle Tests: + +```bash +pytest --random-order-bucket=none +``` + +Standardmäßig ist `--random-order-bucket=module` impliziert, wodurch die Dateien auf den Modulebenen gemischt werden. Es kann auch +auf den Ebenen `class`, `package`, `global` und `none` mischen. Die vollständigen Details entnehmen Sie bitte der +[Dokumentation] (https://github.com/jbasko/pytest-random-order). + +Eine weitere Alternative zur Randomisierung ist: [`pytest-random`](https://github.com/pytest-dev/pytest-randomly). Dieses +Modul hat eine sehr ähnliche Funktionalität/Schnittstelle, aber es hat nicht die Eimermodi, die in +`pytest-random-order` zur Verfügung. Es hat das gleiche Problem, dass es sich nach der Installation aufdrängt. + +### Variationen von Aussehen und Bedienung + +#### pytest-zucker + +[pytest-sugar](https://github.com/Frozenball/pytest-sugar) ist ein Plugin, das das Erscheinungsbild verbessert, eine +Fortschrittsbalken hinzufügt und Tests, die fehlschlagen, sowie die Bestätigung sofort anzeigt. Es wird bei der Installation automatisch aktiviert. + +```bash +pip install pytest-sugar +``` + +Um Tests ohne sie durchzuführen, führen Sie aus: + +```bash +pytest -p no:sugar +``` + +oder deinstallieren Sie es. 
+ + + +#### Melden Sie den Namen jedes Subtests und seinen Fortschritt + +Für einen einzelnen oder eine Gruppe von Tests über `pytest` (nach `pip install pytest-pspec`): + +```bash +pytest --pspec tests/test_optimization.py +``` + +#### Zeigt fehlgeschlagene Tests sofort an + +[pytest-instafail](https://github.com/pytest-dev/pytest-instafail) zeigt Fehlschläge und Fehler sofort an, anstatt +bis zum Ende der Testsitzung zu warten. + +```bash +pip install pytest-instafail +``` + +```bash +pytest --instafail +``` + +### Zu GPU oder nicht zu GPU + +Bei einem GPU-aktivierten Setup fügen Sie zum Testen im reinen CPU-Modus `CUDA_VISIBLE_DEVICES=""` hinzu: + +```bash +CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py +``` + +oder wenn Sie mehrere Grafikprozessoren haben, können Sie angeben, welcher von `pytest` verwendet werden soll. Wenn Sie zum Beispiel nur den +zweiten Grafikkarte zu verwenden, wenn Sie die Grafikkarten `0` und `1` haben, können Sie folgendes ausführen: + +```bash +CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py +``` + +Dies ist praktisch, wenn Sie verschiedene Aufgaben auf verschiedenen GPUs ausführen möchten. + +Einige Tests müssen nur auf der CPU ausgeführt werden, andere entweder auf der CPU, der GPU oder der TPU und wieder andere auf mehreren GPUs. Die folgenden skip +Dekorateure werden verwendet, um die Anforderungen von Tests in Bezug auf CPU/GPU/TPU festzulegen: + +- `require_torch` - dieser Test wird nur unter Torch ausgeführt +- `require_torch_gpu` - wie `require_torch` plus erfordert mindestens 1 GPU +- `require_torch_multi_gpu` - wie `require_torch` und zusätzlich mindestens 2 GPUs erforderlich +- `require_torch_non_multi_gpu` - wie `require_torch` plus benötigt 0 oder 1 GPUs +- `require_torch_up_to_2_gpus` - wie `require_torch` plus erfordert 0 oder 1 oder 2 GPUs +- `require_torch_tpu` - wie `require_torch` plus erfordert mindestens 1 TPU + +Lassen Sie uns die GPU-Anforderungen in der folgenden Tabelle darstellen: + + +| n gpus | decorator | +|--------|--------------------------------| +| `>= 0` | `@require_torch` | +| `>= 1` | `@require_torch_gpu` | +| `>= 2` | `@require_torch_multi_gpu` | +| `< 2` | `@require_torch_non_multi_gpu` | +| `< 3` | `@require_torch_up_to_2_gpus` | + + +Hier ist zum Beispiel ein Test, der nur ausgeführt werden muss, wenn 2 oder mehr GPUs verfügbar sind und pytorch installiert ist: + +```python no-style +@require_torch_multi_gpu +def test_example_with_multi_gpu(): +``` + +Wenn ein Test `tensorflow` benötigt, verwenden Sie den Dekorator `require_tf`. Zum Beispiel: + +```python no-style +@require_tf +def test_tf_thing_with_tensorflow(): +``` + +Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie +wie Sie ihn einrichten können: + +```python no-style +@require_torch_gpu +@slow +def test_example_slow_on_gpu(): +``` + +Einige Dekoratoren wie `@parametrized` schreiben Testnamen um, daher müssen `@require_*`-Sprungdekoratoren als letztes aufgeführt werden. +zuletzt aufgeführt werden, damit sie korrekt funktionieren. Hier ist ein Beispiel für die korrekte Verwendung: + +```python no-style +@parameterized.expand(...) +@require_torch_multi_gpu +def test_integration_foo(): +``` + +Dieses Problem mit der Reihenfolge gibt es bei `@pytest.mark.parametrize` nicht, Sie können es an den Anfang oder an den Schluss setzen und es wird trotzdem funktionieren. +funktionieren. Aber es funktioniert nur bei Nicht-Unittests. 
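+
+Zur Veranschaulichung eine vollständige, aber frei erfundene Testfunktion, die einen der oben beschriebenen Skip-Dekoratoren verwendet; Name und Testkörper sind nur eine Skizze:
+
+```python
+import torch
+from transformers.testing_utils import require_torch_multi_gpu
+
+
+@require_torch_multi_gpu
+def test_visible_device_count():
+    # hypothetischer Testkörper, nur zur Illustration:
+    # läuft nur, wenn PyTorch installiert ist und mindestens 2 GPUs sichtbar sind
+    assert torch.cuda.device_count() >= 2
+```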
+ +Innerhalb von Tests: + +- Wie viele GPUs sind verfügbar: + +```python +from transformers.testing_utils import get_gpu_count + +n_gpu = get_gpu_count() # works with torch and tf +``` + +### Testen mit einem bestimmten PyTorch-Backend oder Gerät + +Um die Testsuite auf einem bestimmten Torch-Gerät auszuführen, fügen Sie `TRANSFORMERS_TEST_DEVICE="$Gerät"` hinzu, wobei `$Gerät` das Ziel-Backend ist. Zum Beispiel, um nur auf der CPU zu testen: +```bash +TRANSFORMERS_TEST_DEVICE="cpu" pytest tests/utils/test_logging.py +``` + +Diese Variable ist nützlich, um benutzerdefinierte oder weniger verbreitete PyTorch-Backends wie `mps` zu testen. Sie kann auch verwendet werden, um den gleichen Effekt wie `CUDA_VISIBLE_DEVICES` zu erzielen, indem Sie bestimmte GPUs anvisieren oder im reinen CPU-Modus testen. + +Bestimmte Geräte erfordern einen zusätzlichen Import, nachdem Sie `torch` zum ersten Mal importiert haben. Dies kann über die Umgebungsvariable `TRANSFORMERS_TEST_BACKEND` festgelegt werden: +```bash +TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py +``` + + +### Verteiltes Training + +`pytest` kann nicht direkt mit verteiltem Training umgehen. Wenn dies versucht wird, tun die Unterprozesse nicht das Richtige +und denken am Ende, sie seien `pytest` und beginnen, die Testsuite in Schleifen auszuführen. Es funktioniert jedoch, wenn man +einen normalen Prozess erzeugt, der dann mehrere Worker erzeugt und die IO-Pipes verwaltet. + +Hier sind einige Tests, die dies verwenden: + +- [test_trainer_distributed.py](https://github.com/huggingface/transformers/tree/main/tests/trainer/test_trainer_distributed.py) +- [test_deepspeed.py](https://github.com/huggingface/transformers/tree/main/tests/deepspeed/test_deepspeed.py) + +Um direkt mit der Ausführung zu beginnen, suchen Sie in diesen Tests nach dem Aufruf `execute_subprocess_async`. + +Sie benötigen mindestens 2 GPUs, um diese Tests in Aktion zu sehen: + +```bash +CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py +``` + +### Erfassung von Ausgaben + +Während der Testausführung werden alle Ausgaben, die an `stdout` und `stderr` gesendet werden, aufgezeichnet. Wenn ein Test oder eine Setup-Methode fehlschlägt, wird die +wird die entsprechende aufgezeichnete Ausgabe in der Regel zusammen mit dem Fehler-Traceback angezeigt. + +Um die Aufzeichnung von Ausgaben zu deaktivieren und `stdout` und `stderr` normal zu erhalten, verwenden Sie `-s` oder `--capture=no`: + +```bash +pytest -s tests/utils/test_logging.py +``` + +So senden Sie Testergebnisse an die JUnit-Formatausgabe: + +```bash +py.test tests --junitxml=result.xml +``` + +### Farbsteuerung + +Keine Farbe zu haben (z.B. gelb auf weißem Hintergrund ist nicht lesbar): + +```bash +pytest --color=no tests/utils/test_logging.py +``` + +### Testbericht an den Online-Dienst pastebin senden + +Erstellen Sie eine URL für jeden Testfehler: + +```bash +pytest --pastebin=failed tests/utils/test_logging.py +``` + +Dadurch werden Informationen über den Testlauf an einen entfernten Paste-Dienst übermittelt und eine URL für jeden Fehlschlag bereitgestellt. Sie können die +Tests wie gewohnt auswählen oder z.B. -x hinzufügen, wenn Sie nur einen bestimmten Fehler senden möchten. 
+ +Erstellen einer URL für ein ganzes Testsitzungsprotokoll: + +```bash +pytest --pastebin=all tests/utils/test_logging.py +``` + +## Tests schreiben + +🤗 Die Tests von Transformers basieren auf `unittest`, werden aber von `pytest` ausgeführt, so dass die meiste Zeit Funktionen aus beiden Systemen +verwendet werden können. + +Sie können [hier](https://docs.pytest.org/en/stable/unittest.html) nachlesen, welche Funktionen unterstützt werden, aber das Wichtigste ist +Wichtig ist, dass die meisten `pytest`-Fixtures nicht funktionieren. Auch die Parametrisierung nicht, aber wir verwenden das Modul +`parametrisiert`, das auf ähnliche Weise funktioniert. + + +### Parametrisierung + +Oft besteht die Notwendigkeit, denselben Test mehrmals auszuführen, aber mit unterschiedlichen Argumenten. Das könnte innerhalb des Tests geschehen +des Tests gemacht werden, aber dann gibt es keine Möglichkeit, den Test mit nur einem Satz von Argumenten auszuführen. + +```python +# test_this1.py +import unittest +from parameterized import parameterized + + +class TestMathUnitTest(unittest.TestCase): + @parameterized.expand( + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ] + ) + def test_floor(self, name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Nun wird dieser Test standardmäßig 3 Mal ausgeführt, wobei jedes Mal die letzten 3 Argumente von `test_floor` den entsprechenden Argumenten in der Parameterliste zugeordnet werden. +die entsprechenden Argumente in der Parameterliste. + +Sie können auch nur die Parameter `negativ` und `ganzzahlig` mit ausführen: + +```bash +pytest -k "negative and integer" tests/test_mytest.py +``` + +oder alle Untertests außer `negativ`, mit: + +```bash +pytest -k "not negative" tests/test_mytest.py +``` + +Neben der Verwendung des gerade erwähnten Filters `-k` können Sie auch den genauen Namen jedes Untertests herausfinden und jeden +oder alle unter Verwendung ihrer genauen Namen ausführen. + +```bash +pytest test_this1.py --collect-only -q +``` + +und es wird aufgelistet: + +```bash +test_this1.py::TestMathUnitTest::test_floor_0_negative +test_this1.py::TestMathUnitTest::test_floor_1_integer +test_this1.py::TestMathUnitTest::test_floor_2_large_fraction +``` + +Jetzt können Sie also nur 2 spezifische Untertests durchführen: + +```bash +pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer +``` + +Das Modul [parametrisiert](https://pypi.org/project/parameterized/), das sich bereits in den Entwickler-Abhängigkeiten befindet +von `transformers` befindet, funktioniert sowohl für `unittests` als auch für `pytest` Tests. + +Wenn es sich bei dem Test jedoch nicht um einen `Unittest` handelt, können Sie `pytest.mark.parametrize` verwenden (oder Sie können sehen, dass es in +einigen bestehenden Tests verwendet wird, meist unter `Beispiele`). + +Hier ist das gleiche Beispiel, diesmal unter Verwendung der `parametrize`-Markierung von `pytest`: + +```python +# test_this2.py +import pytest + + +@pytest.mark.parametrize( + "name, input, expected", + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ], +) +def test_floor(name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Genau wie bei `parametrisiert` können Sie mit `pytest.mark.parametrize` genau steuern, welche Subtests ausgeführt werden +ausgeführt werden, wenn der Filter `-k` nicht ausreicht. 
Allerdings erzeugt diese Parametrisierungsfunktion einen etwas anderen Satz von +Namen für die Untertests. Sie sehen folgendermaßen aus: + +```bash +pytest test_this2.py --collect-only -q +``` + +und es wird aufgelistet: + +```bash +test_this2.py::test_floor[integer-1-1.0] +test_this2.py::test_floor[negative--1.5--2.0] +test_this2.py::test_floor[large fraction-1.6-1] +``` + +Jetzt können Sie also nur den spezifischen Test durchführen: + +```bash +pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] +``` + +wie im vorherigen Beispiel. + + + +### Dateien und Verzeichnisse + +In Tests müssen wir oft wissen, wo sich Dinge relativ zur aktuellen Testdatei befinden, und das ist nicht trivial, da der Test +von mehreren Verzeichnissen aus aufgerufen werden kann oder sich in Unterverzeichnissen mit unterschiedlicher Tiefe befinden kann. Eine Hilfsklasse +`transformers.test_utils.TestCasePlus` löst dieses Problem, indem sie alle grundlegenden Pfade sortiert und einfache +Zugriffsmöglichkeiten auf sie bietet: + +- `pathlib`-Objekte (alle vollständig aufgelöst): + + - `test_file_path` - der aktuelle Testdateipfad, d.h. `__file__` + - `test_file_dir` - das Verzeichnis, das die aktuelle Testdatei enthält + - `tests_dir` - das Verzeichnis der `tests` Testreihe + - `examples_dir` - das Verzeichnis der `examples` Test-Suite + - repo_root_dir` - das Verzeichnis des Repositorys + - src_dir` - das Verzeichnis von `src` (d.h. wo sich das Unterverzeichnis `transformers` befindet) + +- stringifizierte Pfade - wie oben, aber diese geben Pfade als Strings zurück, anstatt als `pathlib`-Objekte: + + - `test_file_path_str` + - `test_file_dir_str` + - `tests_dir_str` + - `examples_dir_str` + - `repo_root_dir_str` + - `src_dir_str` + +Um diese zu verwenden, müssen Sie lediglich sicherstellen, dass der Test in einer Unterklasse von +`transformers.test_utils.TestCasePlus` befindet. Zum Beispiel: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_local_locations(self): + data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" +``` + +Wenn Sie Pfade nicht über `pathlib` manipulieren müssen oder nur einen Pfad als String benötigen, können Sie jederzeit +`str()` auf das `pathlib`-Objekt anwenden oder die Accessoren mit der Endung `_str` verwenden. Zum Beispiel: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_stringified_locations(self): + examples_dir = self.examples_dir_str +``` + +### Temporäre Dateien und Verzeichnisse + +Die Verwendung eindeutiger temporärer Dateien und Verzeichnisse ist für die parallele Durchführung von Tests unerlässlich, damit sich die Tests nicht gegenseitig überschreiben. +Daten gegenseitig überschreiben. Außerdem möchten wir, dass die temporären Dateien und Verzeichnisse am Ende jedes Tests, der sie erstellt hat, gelöscht werden. +erstellt hat. Daher ist die Verwendung von Paketen wie `tempfile`, die diese Anforderungen erfüllen, unerlässlich. + +Beim Debuggen von Tests müssen Sie jedoch sehen können, was in der temporären Datei oder dem temporären Verzeichnis gespeichert wird und Sie möchten +Sie müssen den genauen Pfad kennen und dürfen ihn nicht bei jedem neuen Testdurchlauf zufällig ändern. + +Für solche Zwecke ist die Hilfsklasse `transformers.test_utils.TestCasePlus` am besten geeignet. 
Sie ist eine Unterklasse von +Unittest.TestCase`, so dass wir in den Testmodulen einfach von ihr erben können. + +Hier ist ein Beispiel für die Verwendung dieser Klasse: + +```python +from transformers.testing_utils import TestCasePlus + + +class ExamplesTests(TestCasePlus): + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +Dieser Code erstellt ein eindeutiges temporäres Verzeichnis und setzt `tmp_dir` auf dessen Speicherort. + +- Erstellen Sie ein eindeutiges temporäres Verzeichnis: + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +tmp_dir" enthält den Pfad zu dem erstellten temporären Verzeichnis. Es wird am Ende des Tests automatisch entfernt. +Tests entfernt. + +- Erstellen Sie ein temporäres Verzeichnis meiner Wahl, stellen Sie sicher, dass es leer ist, bevor der Test beginnt, und leeren Sie es nach dem Test nicht. + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir("./xxx") +``` + +Dies ist nützlich für die Fehlersuche, wenn Sie ein bestimmtes Verzeichnis überwachen und sicherstellen möchten, dass die vorherigen Tests keine Daten darin hinterlassen haben. +keine Daten dort hinterlassen haben. + +- Sie können das Standardverhalten außer Kraft setzen, indem Sie die Argumente `before` und `after` direkt überschreiben, was zu einem der folgenden Verhaltensweisen führt + folgenden Verhaltensweisen: + + - `before=True`: das temporäre Verzeichnis wird immer zu Beginn des Tests gelöscht. + - `before=False`: wenn das temporäre Verzeichnis bereits existiert, bleiben alle vorhandenen Dateien dort erhalten. + - `after=True`: das temporäre Verzeichnis wird immer am Ende des Tests gelöscht. + - `after=False`: das temporäre Verzeichnis wird am Ende des Tests immer beibehalten. + + + +Um das Äquivalent von `rm -r` sicher ausführen zu können, sind nur Unterverzeichnisse des Projektarchivs checkout erlaubt, wenn +ein explizites `tmp_dir` verwendet wird, so dass nicht versehentlich ein `/tmp` oder ein ähnlich wichtiger Teil des Dateisystems vernichtet wird. +d.h. geben Sie bitte immer Pfade an, die mit `./` beginnen. + + + + + +Jeder Test kann mehrere temporäre Verzeichnisse registrieren, die alle automatisch entfernt werden, sofern nicht anders gewünscht. +anders. + + + +### Temporäre Überschreibung von sys.path + +Wenn Sie `sys.path` vorübergehend überschreiben müssen, um z.B. von einem anderen Test zu importieren, können Sie den +Kontextmanager `ExtendSysPath` verwenden. Beispiel: + + +```python +import os +from transformers.testing_utils import ExtendSysPath + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa +``` + +### Überspringen von Tests + +Dies ist nützlich, wenn ein Fehler gefunden und ein neuer Test geschrieben wird, der Fehler aber noch nicht behoben ist. Damit wir ihn +in das Haupt-Repository zu übertragen, müssen wir sicherstellen, dass er bei `make test` übersprungen wird. + +Methoden: + +- Ein **Skip** bedeutet, dass Sie erwarten, dass Ihr Test nur dann erfolgreich ist, wenn einige Bedingungen erfüllt sind, andernfalls sollte pytest den Test überspringen. + die Ausführung des Tests ganz überspringen. Übliche Beispiele sind das Überspringen von Tests, die nur unter Windows laufen, auf Nicht-Windows-Plattformen oder das Überspringen von + Tests, die von einer externen Ressource abhängen, die im Moment nicht verfügbar ist (z.B. eine Datenbank). 
+ +- Ein **xfail** bedeutet, dass Sie erwarten, dass ein Test aus irgendeinem Grund fehlschlägt. Ein gängiges Beispiel ist ein Test für eine Funktion, die noch nicht + noch nicht implementiert oder ein noch nicht behobener Fehler. Wenn ein Test trotz eines erwarteten Fehlschlags bestanden wird (markiert mit + pytest.mark.xfail), ist dies ein xpass und wird in der Testzusammenfassung gemeldet. + +Einer der wichtigsten Unterschiede zwischen den beiden ist, dass `skip` den Test nicht ausführt, während `xfail` dies tut. Wenn also der +Code, der fehlerhaft ist, einen schlechten Zustand verursacht, der sich auf andere Tests auswirkt, sollten Sie also nicht `xfail` verwenden. + +#### Implementierung + +- Hier sehen Sie, wie Sie einen ganzen Test bedingungslos überspringen können: + +```python no-style +@unittest.skip("this bug needs to be fixed") +def test_feature_x(): +``` + +oder mit pytest: + +```python no-style +@pytest.mark.skip(reason="this bug needs to be fixed") +``` + +oder mit dem `xfail` Weg: + +```python no-style +@pytest.mark.xfail +def test_feature_x(): +``` + +- Hier erfahren Sie, wie Sie einen Test aufgrund einer internen Prüfung innerhalb des Tests auslassen können: + +```python +def test_feature_x(): + if not has_something(): + pytest.skip("unsupported configuration") +``` + +oder das ganze Modul: + +```python +import pytest + +if not pytest.config.getoption("--custom-flag"): + pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) +``` + +oder mit dem `xfail` Weg: + +```python +def test_feature_x(): + pytest.xfail("expected to fail until bug XYZ is fixed") +``` + +- Hier erfahren Sie, wie Sie alle Tests in einem Modul überspringen können, wenn ein Import fehlt: + +```python +docutils = pytest.importorskip("docutils", minversion="0.3") +``` + +- Einen Test aufgrund einer Bedingung überspringen: + +```python no-style +@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") +def test_feature_x(): +``` + +oder: + +```python no-style +@unittest.skipIf(torch_device == "cpu", "Can't do half precision") +def test_feature_x(): +``` + +oder überspringen Sie das ganze Modul: + +```python no-style +@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") +class TestClass(): + def test_feature_x(self): +``` + +Weitere Details, Beispiele und Möglichkeiten finden Sie [hier](https://docs.pytest.org/en/latest/skipping.html). + +### Langsame Tests + +Die Bibliothek der Tests wächst ständig, und einige der Tests brauchen Minuten, um ausgeführt zu werden, daher können wir es uns nicht leisten, eine Stunde zu warten, bis die +eine Stunde auf die Fertigstellung der Testsuite auf CI zu warten. Daher sollten langsame Tests, mit einigen Ausnahmen für wichtige Tests, wie im folgenden Beispiel +wie im folgenden Beispiel markiert werden: + +```python no-style +from transformers.testing_utils import slow +@slow +def test_integration_foo(): +``` + +Sobald ein Test als `@langsam` markiert ist, setzen Sie die Umgebungsvariable `RUN_SLOW=1`, um solche Tests auszuführen, z.B: + +```bash +RUN_SLOW=1 pytest tests +``` + +Einige Dekoratoren wie `@parameterized` schreiben Testnamen um, daher müssen `@slow` und die übrigen Skip-Dekoratoren +`@require_*` müssen als letztes aufgeführt werden, damit sie korrekt funktionieren. Hier ist ein Beispiel für die korrekte Verwendung: + +```python no-style +@parameteriz ed.expand(...) 
+@slow +def test_integration_foo(): +``` + +Wie zu Beginn dieses Dokuments erläutert, werden langsame Tests nach einem Zeitplan ausgeführt und nicht in PRs CI +Prüfungen. Es ist also möglich, dass einige Probleme bei der Einreichung eines PRs übersehen werden und zusammengeführt werden. Solche Probleme werden +werden beim nächsten geplanten CI-Job abgefangen. Das bedeutet aber auch, dass es wichtig ist, die langsamen Tests auf Ihrem +Rechner auszuführen, bevor Sie den PR einreichen. + +Hier ist ein grober Entscheidungsmechanismus für die Auswahl der Tests, die als langsam markiert werden sollen: + +Wenn der Test auf eine der internen Komponenten der Bibliothek ausgerichtet ist (z.B. Modellierungsdateien, Tokenisierungsdateien, +Pipelines), dann sollten wir diesen Test in der nicht langsamen Testsuite ausführen. Wenn er sich auf einen anderen Aspekt der Bibliothek bezieht, +wie z.B. die Dokumentation oder die Beispiele, dann sollten wir diese Tests in der langsamen Testsuite durchführen. Und dann, zur Verfeinerung +Ansatz zu verfeinern, sollten wir Ausnahmen einführen: + +- Alle Tests, die einen umfangreichen Satz von Gewichten oder einen Datensatz mit einer Größe von mehr als ~50MB herunterladen müssen (z.B. Modell- oder + Tokenizer-Integrationstests, Pipeline-Integrationstests) sollten auf langsam gesetzt werden. Wenn Sie ein neues Modell hinzufügen, sollten Sie + sollten Sie eine kleine Version des Modells (mit zufälligen Gewichtungen) für Integrationstests erstellen und in den Hub hochladen. Dies wird + wird in den folgenden Abschnitten erläutert. +- Alle Tests, die ein Training durchführen müssen, das nicht speziell auf Schnelligkeit optimiert ist, sollten auf langsam gesetzt werden. +- Wir können Ausnahmen einführen, wenn einige dieser Tests, die nicht langsam sein sollten, unerträglich langsam sind, und sie auf + @langsam`. Auto-Modellierungstests, die große Dateien auf der Festplatte speichern und laden, sind ein gutes Beispiel für Tests, die als + als `@langsam` markiert sind. +- Wenn ein Test in weniger als 1 Sekunde auf CI abgeschlossen wird (einschließlich eventueller Downloads), sollte es sich trotzdem um einen normalen Test handeln. + +Insgesamt müssen alle nicht langsamen Tests die verschiedenen Interna abdecken und dabei schnell bleiben. Zum Beispiel, +kann eine signifikante Abdeckung erreicht werden, indem Sie mit speziell erstellten kleinen Modellen mit zufälligen Gewichten testen. Solche Modelle +haben eine sehr geringe Anzahl von Schichten (z.B. 2), Vokabeln (z.B. 1000), usw. Dann können die `@slow`-Tests große +langsame Modelle verwenden, um qualitative Tests durchzuführen. Um die Verwendung dieser Modelle zu sehen, suchen Sie einfach nach *winzigen* Modellen mit: + +```bash +grep tiny tests examples +``` + +Hier ist ein Beispiel für ein [Skript](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py), das das winzige Modell erstellt hat +[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). Sie können es ganz einfach an Ihre eigene +Architektur Ihres Modells anpassen. + +Es ist leicht, die Laufzeit falsch zu messen, wenn zum Beispiel ein großes Modell heruntergeladen wird, aber wenn +Sie es lokal testen, würden die heruntergeladenen Dateien zwischengespeichert und somit die Download-Zeit nicht gemessen werden. Prüfen Sie daher den +Ausführungsgeschwindigkeitsbericht in den CI-Protokollen (die Ausgabe von `pytest --durations=0 tests`). 
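+
+Denselben Bericht können Sie auch lokal für eine beliebige Teilmenge der Tests erzeugen; mit z.B. `--durations=10` werden nur die 10 langsamsten Tests aufgelistet (der Pfad dient hier nur als Beispiel):
+
+```bash
+pytest --durations=10 tests/utils/test_logging.py
+```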
+ +Dieser Bericht ist auch nützlich, um langsame Ausreißer zu finden, die nicht als solche gekennzeichnet sind oder die neu geschrieben werden müssen, um schnell zu sein. +Wenn Sie bemerken, dass die Testsuite beim CI langsam wird, zeigt die oberste Liste dieses Berichts die langsamsten +Tests. + + +### Testen der stdout/stderr-Ausgabe + +Um Funktionen zu testen, die in `stdout` und/oder `stderr` schreiben, kann der Test auf diese Ströme zugreifen, indem er die +[capsys system](https://docs.pytest.org/en/latest/capture.html) von `pytest` zugreifen. So wird dies bewerkstelligt: + +```python +import sys + + +def print_to_stdout(s): + print(s) + + +def print_to_stderr(s): + sys.stderr.write(s) + + +def test_result_and_stdout(capsys): + msg = "Hello" + print_to_stdout(msg) + print_to_stderr(msg) + out, err = capsys.readouterr() # consume the captured output streams + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + sys.stderr.write(err) + # test: + assert msg in out + assert msg in err +``` + +Und natürlich wird `stderr` in den meisten Fällen als Teil einer Ausnahme auftreten, so dass try/except in einem solchen Fall verwendet werden muss +Fall verwendet werden: + +```python +def raise_exception(msg): + raise ValueError(msg) + + +def test_something_exception(): + msg = "Not a good value" + error = "" + try: + raise_exception(msg) + except Exception as e: + error = str(e) + assert msg in error, f"{msg} is in the exception:\n{error}" +``` + +Ein anderer Ansatz zur Erfassung von stdout ist `contextlib.redirect_stdout`: + +```python +from io import StringIO +from contextlib import redirect_stdout + + +def print_to_stdout(s): + print(s) + + +def test_result_and_stdout(): + msg = "Hello" + buffer = StringIO() + with redirect_stdout(buffer): + print_to_stdout(msg) + out = buffer.getvalue() + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + # test: + assert msg in out +``` + +Ein wichtiges potenzielles Problem beim Erfassen von stdout ist, dass es `r` Zeichen enthalten kann, die bei normalem `print` +alles zurücksetzen, was bisher gedruckt wurde. Mit `pytest` gibt es kein Problem, aber mit `pytest -s` werden diese +werden diese Zeichen in den Puffer aufgenommen. Um den Test mit und ohne `-s` laufen zu lassen, müssen Sie also eine zusätzliche Bereinigung +zusätzliche Bereinigung der erfassten Ausgabe vornehmen, indem Sie `re.sub(r'~.*\r', '', buf, 0, re.M)` verwenden. 
+
+Aber dann haben wir einen Hilfskontextmanager-Wrapper, der sich automatisch um alles kümmert, unabhängig davon, ob die Ausgabe
+einige `\r`-Zeichen enthält oder nicht:
+
+```python
+from transformers.testing_utils import CaptureStdout
+
+with CaptureStdout() as cs:
+    function_that_writes_to_stdout()
+print(cs.out)
+```
+
+Hier ist ein vollständiges Testbeispiel:
+
+```python
+from transformers.testing_utils import CaptureStdout
+
+msg = "Secret message\r"
+final = "Hello World"
+with CaptureStdout() as cs:
+    print(msg + final)
+assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}"
+```
+
+Wenn Sie `stderr` aufzeichnen möchten, verwenden Sie stattdessen die Klasse `CaptureStderr`:
+
+```python
+from transformers.testing_utils import CaptureStderr
+
+with CaptureStderr() as cs:
+    function_that_writes_to_stderr()
+print(cs.err)
+```
+
+Wenn Sie beide Streams auf einmal erfassen müssen, verwenden Sie die übergeordnete Klasse `CaptureStd`:
+
+```python
+from transformers.testing_utils import CaptureStd
+
+with CaptureStd() as cs:
+    function_that_writes_to_stdout_and_stderr()
+print(cs.err, cs.out)
+```
+
+Um das Debuggen von Testproblemen zu erleichtern, geben diese Kontextmanager die aufgezeichneten Streams beim Verlassen
+des Kontexts standardmäßig wieder aus.
+
+
+### Erfassen von Logger-Streams
+
+Wenn Sie die Ausgabe eines Loggers validieren müssen, können Sie `CaptureLogger` verwenden:
+
+```python
+from transformers import logging
+from transformers.testing_utils import CaptureLogger
+
+msg = "Testing 1, 2, 3"
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.bart.tokenization_bart")
+with CaptureLogger(logger) as cl:
+    logger.info(msg)
+assert cl.out == msg + "\n"
+```
+
+### Testen mit Umgebungsvariablen
+
+Wenn Sie die Auswirkungen von Umgebungsvariablen für einen bestimmten Test prüfen möchten, können Sie den Hilfsdekorator
+`transformers.testing_utils.mockenv` verwenden:
+
+```python
+import os
+import unittest
+
+from transformers.testing_utils import mockenv
+
+
+class HfArgumentParserTest(unittest.TestCase):
+    @mockenv(TRANSFORMERS_VERBOSITY="error")
+    def test_env_override(self):
+        env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
+```
+
+Manchmal muss ein externes Programm aufgerufen werden, was die Einstellung von `PYTHONPATH` in `os.environ` erfordert, um
+mehrere lokale Pfade einzuschließen. Die Hilfsklasse `transformers.testing_utils.TestCasePlus` hilft Ihnen dabei:
+
+```python
+from transformers.testing_utils import TestCasePlus
+
+
+class EnvExampleTest(TestCasePlus):
+    def test_external_prog(self):
+        env = self.get_env()
+        # now call the external program, passing `env` to it
+```
+
+Je nachdem, ob sich die Testdatei in der Testsuite `tests` oder in `examples` befindet, richtet `get_env()` den Eintrag
+`env[PYTHONPATH]` korrekt so ein, dass er eines dieser beiden Verzeichnisse enthält sowie das Verzeichnis `src`,
+um sicherzustellen, dass der Test gegen das aktuelle Projektarchiv durchgeführt wird, und schließlich das, was in
+`env[PYTHONPATH]` bereits eingestellt war, bevor der Test aufgerufen wurde (sofern vorhanden).
+
+Diese Hilfsmethode erstellt eine Kopie des Objekts `os.environ`, so dass das Original intakt bleibt.
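+
+Als kleine, rein illustrative Skizze dazu, wie der Aufruf eines externen Programms mit dieser Umgebung aussehen könnte (Klassenname und Skriptpfad sind hier nur Platzhalter, kein Bestandteil der Bibliothek):
+
+```python
+import subprocess
+import sys
+
+from transformers.testing_utils import TestCasePlus
+
+
+class RunExternalProgTest(TestCasePlus):
+    def test_external_prog(self):
+        env = self.get_env()  # copy of os.environ with PYTHONPATH prepared for the repo
+        # hypothetical script path, only for illustration
+        cmd = [sys.executable, "examples/pytorch/text-classification/run_glue.py", "--help"]
+        result = subprocess.run(cmd, env=env, capture_output=True, text=True)
+        assert result.returncode == 0, result.stderr
+```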
+
+### Reproduzierbare Ergebnisse erhalten
+
+In manchen Situationen möchten Sie vielleicht die Zufälligkeit Ihrer Tests beseitigen. Um identische, reproduzierbare
+Ergebnisse zu erhalten, müssen Sie den Seed festlegen:
+
+```python
+seed = 42
+
+# python RNG
+import random
+
+random.seed(seed)
+
+# pytorch RNGs
+import torch
+
+torch.manual_seed(seed)
+torch.backends.cudnn.deterministic = True
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(seed)
+
+# numpy RNG
+import numpy as np
+
+np.random.seed(seed)
+
+# tf RNG
+import tensorflow as tf
+
+tf.random.set_seed(seed)
+```
+
+### Tests debuggen
+
+Um einen Debugger an der Stelle zu starten, an der die Warnung auftritt, gehen Sie wie folgt vor:
+
+```bash
+pytest tests/utils/test_logging.py -W error::UserWarning --pdb
+```
+
+## Arbeiten mit GitHub-Actions-Workflows
+
+Um einen CI-Job für einen Self-Push-Workflow auszulösen, müssen Sie:
+
+1. Erstellen Sie einen neuen Zweig im `transformers`-Ursprungsrepository (kein Fork!).
+2. Der Name des Zweigs muss entweder mit `ci_` oder `ci-` beginnen (`main` löst ihn ebenfalls aus, aber wir können keine PRs auf
+   `main` erstellen). Er wird außerdem nur für bestimmte Pfade ausgelöst - die aktuelle Definition finden Sie, falls sie
+   sich seit der Erstellung dieses Dokuments geändert hat, [hier](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml) unter *push:*
+3. Erstellen Sie einen PR von diesem Zweig.
+4. Dann können Sie [hier](https://github.com/huggingface/transformers/actions/workflows/self-push.yml) sehen, wie der Job erscheint. Er wird möglicherweise nicht sofort ausgeführt, wenn
+   ein Backlog vorhanden ist.
+
+
+
+
+## Testen experimenteller CI-Funktionen
+
+Das Testen von CI-Funktionen kann potenziell problematisch sein, da es den normalen CI-Betrieb beeinträchtigen kann. Wenn also eine
+neue CI-Funktion hinzugefügt werden soll, sollte dies wie folgt geschehen:
+
+1. Erstellen Sie einen neuen Job, der die zu testende Funktion testet.
+2. Der neue Job muss immer erfolgreich sein, so dass er uns ein grünes ✓ gibt (Details unten).
+3. Lassen Sie ihn einige Tage lang laufen, um zu sehen, dass eine Vielzahl verschiedener PR-Typen darauf läuft (Benutzer-Fork-Zweige,
+   nicht geforkte Zweige, Zweige aus direkten Dateibearbeitungen über die github.com-UI, verschiedene erzwungene Pushes usw. - es gibt
+   sehr viele), während Sie die Protokolle des experimentellen Jobs überwachen (nicht den Status des gesamten Jobs, da er absichtlich immer
+   grün ist).
+4. Wenn klar ist, dass alles in Ordnung ist, fügen Sie die neuen Änderungen in die bestehenden Jobs ein.
+
+Auf diese Weise wird der normale Arbeitsablauf nicht durch Experimente mit der CI-Funktionalität selbst beeinträchtigt.
+
+Wie können wir nun dafür sorgen, dass der Job immer erfolgreich ist, während die neue CI-Funktion entwickelt wird?
+
+Einige CIs, wie TravisCI, unterstützen ignore-step-failure und melden den gesamten Job als erfolgreich, aber CircleCI und
+GitHub Actions unterstützen dies zum jetzigen Zeitpunkt nicht.
+
+Sie können also die folgende Abhilfe verwenden:
+
+1. Setzen Sie `set +euo pipefail` an den Anfang des Ausführungsbefehls, um die meisten potenziellen Fehler im Bash-Skript zu unterdrücken.
+2. Der letzte Befehl muss ein Erfolg sein: `echo "done"` oder einfach `true` reicht aus.
+ +Hier ist ein Beispiel: + +```yaml +- run: + name: run CI experiment + command: | + set +euo pipefail + echo "setting run-all-despite-any-errors-mode" + this_command_will_fail + echo "but bash continues to run" + # emulate another failure + false + # but the last command must be a success + echo "during experiment do not remove: reporting success to CI, even if there were failures" +``` + +Für einfache Befehle können Sie auch Folgendes tun: + +```bash +cmd_that_may_fail || true +``` + +Wenn Sie mit den Ergebnissen zufrieden sind, integrieren Sie den experimentellen Schritt oder Job natürlich in den Rest der normalen Jobs, +Entfernen Sie dabei `set +euo pipefail` oder andere Dinge, die Sie eventuell hinzugefügt haben, um sicherzustellen, dass der experimentelle Auftrag nicht +den normalen CI-Betrieb nicht beeinträchtigt. + +Dieser ganze Prozess wäre viel einfacher gewesen, wenn wir nur etwas wie `allow-failure` für den +experimentellen Schritt festlegen könnten und ihn scheitern lassen würden, ohne den Gesamtstatus der PRs zu beeinträchtigen. Aber wie bereits erwähnt, haben CircleCI und +Github Actions dies im Moment nicht unterstützen. + +Sie können in diesen CI-spezifischen Threads für diese Funktion stimmen und sehen, wo sie steht: + +- [Github Actions:](https://github.com/actions/toolkit/issues/399) +- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) diff --git a/docs/source/de/training.md b/docs/source/de/training.md new file mode 100644 index 000000000000..493de3052bbf --- /dev/null +++ b/docs/source/de/training.md @@ -0,0 +1,433 @@ + + +# Optimierung eines vortrainierten Modells + +[[open-in-colab]] + +Die Verwendung eines vorab trainierten Modells hat erhebliche Vorteile. Es reduziert die Rechenkosten und den CO2-Fußabdruck und ermöglicht Ihnen die Verwendung von Modellen, die dem neuesten Stand der Technik entsprechen, ohne dass Sie ein Modell von Grund auf neu trainieren müssen. Transformers bietet Zugang zu Tausenden von vortrainierten Modellen für eine Vielzahl von Aufgaben. Wenn Sie ein vorab trainiertes Modell verwenden, trainieren Sie es auf einem für Ihre Aufgabe spezifischen Datensatz. Dies wird als Feinabstimmung bezeichnet und ist eine unglaublich leistungsfähige Trainingstechnik. In diesem Tutorial werden Sie ein vortrainiertes Modell mit einem Deep-Learning-Framework Ihrer Wahl feinabstimmen: + +* Feinabstimmung eines vorab trainierten Modells mit 🤗 Transformers [`Trainer`]. +* Feinabstimmung eines vorab trainierten Modells in TensorFlow mit Keras. +* Feinabstimmung eines vorab trainierten Modells in nativem PyTorch. + + + +## Vorbereitung eines Datensatzes + + + +Bevor Sie die Feinabstimmung eines vortrainierten Modells vornehmen können, müssen Sie einen Datensatz herunterladen und für das Training vorbereiten. Im vorangegangenen Leitfaden haben Sie gelernt, wie man Daten für das Training aufbereitet, und jetzt haben Sie die Gelegenheit, diese Fähigkeiten zu testen! + +Laden Sie zunächst den Datensatz [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. 
I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten und eine Auffüll- und Abschneidungsstrategie einzubauen, um mit variablen Sequenzlängen umzugehen. Um Ihren Datensatz in einem Schritt zu verarbeiten, verwenden Sie die 🤗 Methode Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map), um eine Vorverarbeitungsfunktion auf den gesamten Datensatz anzuwenden: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +Wenn Sie möchten, können Sie eine kleinere Teilmenge des gesamten Datensatzes für die Feinabstimmung erstellen, um den Zeitaufwand zu verringern: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Training + +An dieser Stelle sollten Sie dem Abschnitt folgen, der dem Rahmen entspricht, den Sie verwenden möchten. Sie können über die Links +in der rechten Seitenleiste können Sie zu dem gewünschten Abschnitt springen - und wenn Sie den gesamten Inhalt eines bestimmten Frameworks ausblenden möchten, +klicken Sie einfach auf die Schaltfläche oben rechts im Block des jeweiligen Frameworks! + + + + + +## Trainieren mit PyTorch Trainer + +🤗 Transformers bietet eine [`Trainer`]-Klasse, die für das Training von 🤗 Transformers-Modellen optimiert ist und es einfacher macht, mit dem Training zu beginnen, ohne manuell eine eigene Trainingsschleife zu schreiben. Die [`Trainer`]-API unterstützt eine breite Palette von Trainingsoptionen und Funktionen wie Logging, Gradientenakkumulation und gemischte Präzision. + +Beginnen Sie mit dem Laden Ihres Modells und geben Sie die Anzahl der erwarteten Labels an. Aus dem Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields) wissen Sie, dass es fünf Labels gibt: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +Es wird eine Warnung angezeigt, dass einige der trainierten Parameter nicht verwendet werden und einige Parameter zufällig +initialisiert werden. Machen Sie sich keine Sorgen, das ist völlig normal! 
Der vorher trainierte Kopf des BERT-Modells wird verworfen und durch einen zufällig initialisierten Klassifikationskopf ersetzt. Sie werden diesen neuen Modellkopf in Ihrer Sequenzklassifizierungsaufgabe feinabstimmen, indem Sie das Wissen des vortrainierten Modells auf ihn übertragen. + + + +### Hyperparameter für das Training + +Als Nächstes erstellen Sie eine Klasse [`TrainingArguments`], die alle Hyperparameter enthält, die Sie einstellen können, sowie Flags zur Aktivierung verschiedener Trainingsoptionen. Für dieses Lernprogramm können Sie mit den Standard- [Hyperparametern](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) beginnen, aber Sie können mit diesen experimentieren, um Ihre optimalen Einstellungen zu finden. + +Geben Sie an, wo die Kontrollpunkte Ihres Trainings gespeichert werden sollen: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Auswerten + +Der [`Trainer`] wertet die Leistung des Modells während des Trainings nicht automatisch aus. Sie müssen [`Trainer`] eine Funktion übergeben, um Metriken zu berechnen und zu berichten. Die [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) Bibliothek bietet eine einfache [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) Funktion, die Sie mit der [`evaluate.load`] Funktion laden können (siehe diese [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) für weitere Informationen): + +```py +>>> import numpy as np +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +``` + +Rufen Sie [`~evaluate.compute`] auf `metric` auf, um die Genauigkeit Ihrer Vorhersagen zu berechnen. Bevor Sie Ihre Vorhersagen an `compute` übergeben, müssen Sie die Vorhersagen in Logits umwandeln (denken Sie daran, dass alle 🤗 Transformers-Modelle Logits zurückgeben): + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +Wenn Sie Ihre Bewertungsmetriken während der Feinabstimmung überwachen möchten, geben Sie den Parameter `evaluation_strategy` in Ihren Trainingsargumenten an, um die Bewertungsmetrik am Ende jeder Epoche zu ermitteln: + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +Erstellen Sie ein [`Trainer`]-Objekt mit Ihrem Modell, Trainingsargumenten, Trainings- und Testdatensätzen und einer Evaluierungsfunktion: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Anschließend können Sie Ihr Modell durch den Aufruf von [`~transformers.Trainer.train`] optimieren: + +```py +>>> trainer.train() +``` + + + + + + +## Trainieren Sie ein TensorFlow-Modell mit Keras + +Sie können auch 🤗 Transformers Modelle in TensorFlow mit der Keras API trainieren! + +### Laden von Daten für Keras + +Wenn Sie ein 🤗 Transformers Modell mit der Keras API trainieren wollen, müssen Sie Ihren Datensatz in ein Format konvertieren, das +Keras versteht. Wenn Ihr Datensatz klein ist, können Sie das Ganze einfach in NumPy-Arrays konvertieren und an Keras übergeben. +Probieren wir das zuerst aus, bevor wir etwas Komplizierteres tun. + +Laden Sie zunächst ein Dataset. 
Wir werden den CoLA-Datensatz aus dem [GLUE-Benchmark](https://huggingface.co/datasets/glue) verwenden,
+da es sich um eine einfache Aufgabe zur binären Textklassifizierung handelt, und nehmen vorerst nur den Trainingssplit.
+
+```py
+from datasets import load_dataset
+
+dataset = load_dataset("glue", "cola")
+dataset = dataset["train"]  # Just take the training split for now
+```
+
+Als Nächstes laden Sie einen Tokenizer und tokenisieren die Daten als NumPy-Arrays. Beachten Sie, dass die Labels bereits eine Liste von 0en und 1en sind;
+wir können sie also ohne Tokenisierung direkt in ein NumPy-Array konvertieren!
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True)
+# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
+tokenized_data = dict(tokenized_data)
+
+labels = np.array(dataset["label"])  # Label is already an array of 0 and 1
+```
+
+Schließlich laden, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) und [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) Sie das Modell:
+
+```py
+from transformers import TFAutoModelForSequenceClassification
+from tensorflow.keras.optimizers import Adam
+
+# Load and compile our model
+model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+# Lower learning rates are often better for fine-tuning transformers
+model.compile(optimizer=Adam(3e-5))
+
+model.fit(tokenized_data, labels)
+```
+
+
+
+Sie müssen Ihren Modellen kein Verlustargument übergeben, wenn Sie sie mit `compile()` kompilieren! Hugging-Face-Modelle wählen automatisch
+einen Loss, der für ihre Aufgabe und Modellarchitektur geeignet ist, wenn dieses Argument leer gelassen wird. Sie können diese Wahl jederzeit außer Kraft setzen, indem Sie selbst einen Loss angeben, wenn Sie das möchten!
+
+
+
+Dieser Ansatz eignet sich hervorragend für kleinere Datensätze, aber bei größeren Datensätzen kann er zu einem Problem werden. Warum?
+Weil das tokenisierte Array und die Labels vollständig in den Speicher geladen werden müssten und weil NumPy nicht mit
+"gezackten" Arrays umgehen kann, so dass jedes tokenisierte Sample auf die Länge des längsten Samples im gesamten
+Datensatz aufgefüllt werden müsste. Dadurch wird das Array noch größer, und all die aufgefüllten Token verlangsamen auch das Training!
+
+### Laden von Daten als tf.data.Dataset
+
+Wenn Sie eine Verlangsamung des Trainings vermeiden wollen, können Sie Ihre Daten stattdessen als `tf.data.Dataset` laden. Sie können zwar Ihre eigene
+`tf.data`-Pipeline schreiben, wenn Sie wollen, wir haben aber zwei bequeme Methoden, um dies zu tun:
+
+- [`~TFPreTrainedModel.prepare_tf_dataset`]: Dies ist die Methode, die wir in den meisten Fällen empfehlen. Da es sich um eine Methode
+Ihres Modells handelt, kann sie das Modell inspizieren, um automatisch herauszufinden, welche Spalten als Modelleingaben verwendet werden können, und
+verwirft die anderen, um einen einfacheren, leistungsfähigeren Datensatz zu erstellen.
+- [`~datasets.Dataset.to_tf_dataset`]: Diese Methode ist eher auf niedriger Ebene angesiedelt und nützlich, wenn Sie genau kontrollieren wollen, wie
+das Dataset erstellt wird, indem Sie genau angeben, welche `columns` und `label_cols` einbezogen werden sollen (eine kleine Skizze dazu folgt direkt unter dieser Liste).
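+
+Als kleine, unverbindliche Skizze für den zweiten, direkteren Weg (die Spaltennamen sind Annahmen und setzen voraus, dass die Tokenizer-Ausgaben bereits als Spalten im Datensatz vorhanden sind, siehe den nächsten Schritt):
+
+```py
+from transformers import DataCollatorWithPadding
+
+# Pads each batch on the fly instead of padding the whole dataset up front
+collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
+
+tf_dataset = dataset.to_tf_dataset(
+    columns=["input_ids", "token_type_ids", "attention_mask"],  # assumed tokenizer output columns
+    label_cols=["label"],
+    batch_size=16,
+    shuffle=True,
+    collate_fn=collator,
+)
+```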
+ +Bevor Sie [~TFPreTrainedModel.prepare_tf_dataset`] verwenden können, müssen Sie die Tokenizer-Ausgaben als Spalten zu Ihrem Datensatz hinzufügen, wie in +dem folgenden Codebeispiel: + +```py +def tokenize_dataset(data): + # Keys of the returned dictionary will be added to the dataset as columns + return tokenizer(data["text"]) + + +dataset = dataset.map(tokenize_dataset) +``` + +Denken Sie daran, dass Hugging Face-Datensätze standardmäßig auf der Festplatte gespeichert werden, so dass dies nicht zu einem erhöhten Arbeitsspeicherbedarf führen wird! Sobald die +Spalten hinzugefügt wurden, können Sie Batches aus dem Datensatz streamen und zu jedem Batch Auffüllungen hinzufügen, was die Anzahl der Auffüllungs-Token im Vergleich zum Auffüllen des gesamten Datensatzes reduziert. + + +```py +>>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) +``` + +Beachten Sie, dass Sie im obigen Codebeispiel den Tokenizer an `prepare_tf_dataset` übergeben müssen, damit die Stapel beim Laden korrekt aufgefüllt werden können. +Wenn alle Stichproben in Ihrem Datensatz die gleiche Länge haben und kein Auffüllen erforderlich ist, können Sie dieses Argument weglassen. +Wenn Sie etwas Komplexeres als nur das Auffüllen von Stichproben benötigen (z. B. das Korrumpieren von Token für die maskierte Sprachmodellierung), können Sie das Argument +Modellierung), können Sie stattdessen das Argument `collate_fn` verwenden, um eine Funktion zu übergeben, die aufgerufen wird, um die +Liste von Stichproben in einen Stapel umwandelt und alle gewünschten Vorverarbeitungen vornimmt. Siehe unsere +[examples](https://github.com/huggingface/transformers/tree/main/examples) oder +[notebooks](https://huggingface.co/docs/transformers/notebooks), um diesen Ansatz in Aktion zu sehen. + +Sobald Sie einen `tf.data.Dataset` erstellt haben, können Sie das Modell wie zuvor kompilieren und anpassen: + +```py +model.compile(optimizer=Adam(3e-5)) + +model.fit(tf_dataset) +``` + + + + + + +## Trainieren in nativem PyTorch + + + + + +[`Trainer`] kümmert sich um die Trainingsschleife und ermöglicht die Feinabstimmung eines Modells in einer einzigen Codezeile. Für Benutzer, die es vorziehen, ihre eigene Trainingsschleife zu schreiben, können Sie auch eine Feinabstimmung eines 🤗 Transformers-Modells in nativem PyTorch vornehmen. + +An diesem Punkt müssen Sie möglicherweise Ihr Notebook neu starten oder den folgenden Code ausführen, um etwas Speicher freizugeben: + +```py +del model +del pytorch_model +del trainer +torch.cuda.empty_cache() +``` + +Als Nächstes müssen Sie den Datensatz `tokenized_dataset` manuell nachbearbeiten, um ihn für das Training vorzubereiten. + +1. Entfernen Sie die Spalte "Text", da das Modell keinen Rohtext als Eingabe akzeptiert: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Benennen Sie die Spalte "Label" in "Labels" um, da das Modell erwartet, dass das Argument "Labels" genannt wird: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. 
Stellen Sie das Format des Datensatzes so ein, dass PyTorch-Tensoren anstelle von Listen zurückgegeben werden: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +Erstellen Sie dann eine kleinere Teilmenge des Datensatzes, wie zuvor gezeigt, um die Feinabstimmung zu beschleunigen: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader + +Erstellen Sie einen `DataLoader` für Ihre Trainings- und Testdatensätze, damit Sie über die Datenstapel iterieren können: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +Laden Sie Ihr Modell mit der Anzahl der erwarteten Kennzeichnungen: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### Optimierer und Lernratensteuerung + +Erstellen Sie einen Optimierer und einen Scheduler für die Lernrate, um das Modell fein abzustimmen. Wir verwenden den Optimierer [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) aus PyTorch: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +Erstellen Sie den Standard-Lernratenplaner aus [`Trainer`]: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +... ) +``` + +Geben Sie schließlich `device` an, um einen Grafikprozessor zu verwenden, wenn Sie Zugang zu einem solchen haben. Andernfalls kann das Training auf einer CPU mehrere Stunden statt ein paar Minuten dauern. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +Holen Sie sich mit einem gehosteten Notebook wie [Colaboratory](https://colab.research.google.com/) oder [SageMaker StudioLab](https://studiolab.sagemaker.aws/) kostenlosen Zugang zu einem Cloud-GPU, wenn Sie noch keinen haben. + + + +Großartig, Sie sind bereit für das Training! 🥳 + +### Trainingsschleife + +Um Ihren Trainingsfortschritt zu verfolgen, verwenden Sie die [tqdm](https://tqdm.github.io/) Bibliothek, um einen Fortschrittsbalken über die Anzahl der Trainingsschritte hinzuzufügen: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### Auswertung + +Genauso wie Sie eine Bewertungsfunktion zu [`Trainer`] hinzugefügt haben, müssen Sie dasselbe tun, wenn Sie Ihre eigene Trainingsschleife schreiben. Aber anstatt die Metrik am Ende jeder Epoche zu berechnen und zu melden, werden Sie dieses Mal alle Stapel mit [`~evaluate.add_batch`] akkumulieren und die Metrik ganz am Ende berechnen. 
+ +```py +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## Zusätzliche Ressourcen + +Weitere Beispiele für die Feinabstimmung finden Sie unter: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) enthält Skripte + um gängige NLP-Aufgaben in PyTorch und TensorFlow zu trainieren. + +- [🤗 Transformers Notebooks](notebooks) enthält verschiedene Notebooks zur Feinabstimmung eines Modells für bestimmte Aufgaben in PyTorch und TensorFlow. \ No newline at end of file diff --git a/docs/source/de/training.mdx b/docs/source/de/training.mdx deleted file mode 100644 index e38779ba5571..000000000000 --- a/docs/source/de/training.mdx +++ /dev/null @@ -1,429 +0,0 @@ - - -# Optimierung eines vortrainierten Modells - -[[open-in-colab]] - -Die Verwendung eines vorab trainierten Modells hat erhebliche Vorteile. Es reduziert die Rechenkosten und den CO2-Fußabdruck und ermöglicht Ihnen die Verwendung von Modellen, die dem neuesten Stand der Technik entsprechen, ohne dass Sie ein Modell von Grund auf neu trainieren müssen. Transformers bietet Zugang zu Tausenden von vortrainierten Modellen für eine Vielzahl von Aufgaben. Wenn Sie ein vorab trainiertes Modell verwenden, trainieren Sie es auf einem für Ihre Aufgabe spezifischen Datensatz. Dies wird als Feinabstimmung bezeichnet und ist eine unglaublich leistungsfähige Trainingstechnik. In diesem Tutorial werden Sie ein vortrainiertes Modell mit einem Deep-Learning-Framework Ihrer Wahl feinabstimmen: - -* Feinabstimmung eines vorab trainierten Modells mit 🤗 Transformers [`Trainer`]. -* Feinabstimmung eines vorab trainierten Modells in TensorFlow mit Keras. -* Feinabstimmung eines vorab trainierten Modells in nativem PyTorch. - - - -## Vorbereitung eines Datensatzes - - - -Bevor Sie die Feinabstimmung eines vortrainierten Modells vornehmen können, müssen Sie einen Datensatz herunterladen und für das Training vorbereiten. Im vorangegangenen Leitfaden haben Sie gelernt, wie man Daten für das Training aufbereitet, und jetzt haben Sie die Gelegenheit, diese Fähigkeiten zu testen! - -Laden Sie zunächst den Datensatz [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. 
She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten und eine Auffüll- und Abschneidungsstrategie einzubauen, um mit variablen Sequenzlängen umzugehen. Um Ihren Datensatz in einem Schritt zu verarbeiten, verwenden Sie die 🤗 Methode Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map), um eine Vorverarbeitungsfunktion auf den gesamten Datensatz anzuwenden: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -Wenn Sie möchten, können Sie eine kleinere Teilmenge des gesamten Datensatzes für die Feinabstimmung erstellen, um den Zeitaufwand zu verringern: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Training - -An dieser Stelle sollten Sie dem Abschnitt folgen, der dem Rahmen entspricht, den Sie verwenden möchten. Sie können über die Links -in der rechten Seitenleiste können Sie zu dem gewünschten Abschnitt springen - und wenn Sie den gesamten Inhalt eines bestimmten Frameworks ausblenden möchten, -klicken Sie einfach auf die Schaltfläche oben rechts im Block des jeweiligen Frameworks! - - - - - -## Trainieren mit PyTorch Trainer - -🤗 Transformers bietet eine [`Trainer`]-Klasse, die für das Training von 🤗 Transformers-Modellen optimiert ist und es einfacher macht, mit dem Training zu beginnen, ohne manuell eine eigene Trainingsschleife zu schreiben. Die [`Trainer`]-API unterstützt eine breite Palette von Trainingsoptionen und Funktionen wie Logging, Gradientenakkumulation und gemischte Präzision. - -Beginnen Sie mit dem Laden Ihres Modells und geben Sie die Anzahl der erwarteten Labels an. Aus dem Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields) wissen Sie, dass es fünf Labels gibt: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -Es wird eine Warnung angezeigt, dass einige der trainierten Parameter nicht verwendet werden und einige Parameter zufällig -initialisiert werden. Machen Sie sich keine Sorgen, das ist völlig normal! Der vorher trainierte Kopf des BERT-Modells wird verworfen und durch einen zufällig initialisierten Klassifikationskopf ersetzt. Sie werden diesen neuen Modellkopf in Ihrer Sequenzklassifizierungsaufgabe feinabstimmen, indem Sie das Wissen des vortrainierten Modells auf ihn übertragen. 
- - - -### Hyperparameter für das Training - -Als Nächstes erstellen Sie eine Klasse [`TrainingArguments`], die alle Hyperparameter enthält, die Sie einstellen können, sowie Flags zur Aktivierung verschiedener Trainingsoptionen. Für dieses Lernprogramm können Sie mit den Standard- [Hyperparametern](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) beginnen, aber Sie können mit diesen experimentieren, um Ihre optimalen Einstellungen zu finden. - -Geben Sie an, wo die Kontrollpunkte Ihres Trainings gespeichert werden sollen: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Auswerten - -Der [`Trainer`] wertet die Leistung des Modells während des Trainings nicht automatisch aus. Sie müssen [`Trainer`] eine Funktion übergeben, um Metriken zu berechnen und zu berichten. Die [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) Bibliothek bietet eine einfache [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) Funktion, die Sie mit der [`evaluate.load`] Funktion laden können (siehe diese [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) für weitere Informationen): - -```py ->>> import numpy as np ->>> import evaluate - ->>> metric = evaluate.load("accuracy") -``` - -Rufen Sie [`~evaluate.compute`] auf `metric` auf, um die Genauigkeit Ihrer Vorhersagen zu berechnen. Bevor Sie Ihre Vorhersagen an `compute` übergeben, müssen Sie die Vorhersagen in Logits umwandeln (denken Sie daran, dass alle 🤗 Transformers-Modelle Logits zurückgeben): - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -Wenn Sie Ihre Bewertungsmetriken während der Feinabstimmung überwachen möchten, geben Sie den Parameter `evaluation_strategy` in Ihren Trainingsargumenten an, um die Bewertungsmetrik am Ende jeder Epoche zu ermitteln: - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Erstellen Sie ein [`Trainer`]-Objekt mit Ihrem Modell, Trainingsargumenten, Trainings- und Testdatensätzen und einer Evaluierungsfunktion: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Anschließend können Sie Ihr Modell durch den Aufruf von [`~transformers.Trainer.train`] optimieren: - -```py ->>> trainer.train() -``` - - - - - - -## Trainieren Sie ein TensorFlow-Modell mit Keras - -Sie können auch 🤗 Transformers Modelle in TensorFlow mit der Keras API trainieren! - -### Laden von Daten für Keras - -Wenn Sie ein 🤗 Transformers Modell mit der Keras API trainieren wollen, müssen Sie Ihren Datensatz in ein Format konvertieren, das -Keras versteht. Wenn Ihr Datensatz klein ist, können Sie das Ganze einfach in NumPy-Arrays konvertieren und an Keras übergeben. -Probieren wir das zuerst aus, bevor wir etwas Komplizierteres tun. - -Laden Sie zunächst ein Dataset. Wir werden den CoLA-Datensatz aus dem [GLUE-Benchmark](https://huggingface.co/datasets/glue) verwenden, -da es sich um eine einfache Aufgabe zur Klassifizierung von binärem Text handelt, und nehmen vorerst nur den Trainingssplit. 
- -```py -from datasets import load_dataset - -dataset = load_dataset("glue", "cola") -dataset = dataset["train"] # Just take the training split for now -``` - -Als nächstes laden Sie einen Tokenizer und tokenisieren die Daten als NumPy-Arrays. Beachten Sie, dass die Beschriftungen bereits eine Liste von 0 und 1en sind, -Wir können sie also ohne Tokenisierung direkt in ein NumPy-Array konvertieren! - -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True) -# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras -tokenized_data = dict(tokenized_data) - -labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 -``` - -Schließlich laden, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) und [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) Sie das Modell: - -```py -from transformers import TFAutoModelForSequenceClassification -from tensorflow.keras.optimizers import Adam - -# Load and compile our model -model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") -# Lower learning rates are often better for fine-tuning transformers -model.compile(optimizer=Adam(3e-5)) - -model.fit(tokenized_data, labels) -``` - - - -Sie müssen Ihren Modellen kein Verlustargument übergeben, wenn Sie sie `compile()`! Hugging-Face-Modelle wählen automatisch -einen Loss, der für ihre Aufgabe und Modellarchitektur geeignet ist, wenn dieses Argument leer gelassen wird. Sie können jederzeit außer Kraft setzen, indem Sie selbst einen Loss angeben, wenn Sie das möchten! - - - -Dieser Ansatz eignet sich hervorragend für kleinere Datensätze, aber bei größeren Datensätzen kann er zu einem Problem werden. Warum? -Weil das tokenisierte Array und die Beschriftungen vollständig in den Speicher geladen werden müssten, und weil NumPy nicht mit -"gezackte" Arrays nicht verarbeiten kann, so dass jedes tokenisierte Sample auf die Länge des längsten Samples im gesamten Datensatz aufgefüllt werden müsste. -Datensatzes aufgefüllt werden. Dadurch wird das Array noch größer, und all die aufgefüllten Token verlangsamen auch das Training! - -### Laden von Daten als tf.data.Dataset - -Wenn Sie eine Verlangsamung des Trainings vermeiden wollen, können Sie Ihre Daten stattdessen als `tf.data.Dataset` laden. Sie können zwar Ihre eigene -tf.data"-Pipeline schreiben können, wenn Sie wollen, haben wir zwei bequeme Methoden, um dies zu tun: - -- [`~TFPreTrainedModel.prepare_tf_dataset`]: Dies ist die Methode, die wir in den meisten Fällen empfehlen. Da es sich um eine Methode -Ihres Modells ist, kann sie das Modell inspizieren, um automatisch herauszufinden, welche Spalten als Modelleingaben verwendet werden können, und -verwirft die anderen, um einen einfacheren, leistungsfähigeren Datensatz zu erstellen. -- [~datasets.Dataset.to_tf_dataset`]: Diese Methode ist eher auf niedriger Ebene angesiedelt und ist nützlich, wenn Sie genau kontrollieren wollen, wie -Dataset erstellt wird, indem man genau angibt, welche `columns` und `label_cols` einbezogen werden sollen. 
- -Bevor Sie [~TFPreTrainedModel.prepare_tf_dataset`] verwenden können, müssen Sie die Tokenizer-Ausgaben als Spalten zu Ihrem Datensatz hinzufügen, wie in -dem folgenden Codebeispiel: - -```py -def tokenize_dataset(data): - # Keys of the returned dictionary will be added to the dataset as columns - return tokenizer(data["text"]) - - -dataset = dataset.map(tokenize_dataset) -``` - -Denken Sie daran, dass Hugging Face-Datensätze standardmäßig auf der Festplatte gespeichert werden, so dass dies nicht zu einem erhöhten Arbeitsspeicherbedarf führen wird! Sobald die -Spalten hinzugefügt wurden, können Sie Batches aus dem Datensatz streamen und zu jedem Batch Auffüllungen hinzufügen, was die Anzahl der Auffüllungs-Token im Vergleich zum Auffüllen des gesamten Datensatzes reduziert. - - -```py ->>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) -``` - -Beachten Sie, dass Sie im obigen Codebeispiel den Tokenizer an `prepare_tf_dataset` übergeben müssen, damit die Stapel beim Laden korrekt aufgefüllt werden können. -Wenn alle Stichproben in Ihrem Datensatz die gleiche Länge haben und kein Auffüllen erforderlich ist, können Sie dieses Argument weglassen. -Wenn Sie etwas Komplexeres als nur das Auffüllen von Stichproben benötigen (z. B. das Korrumpieren von Token für die maskierte Sprachmodellierung), können Sie das Argument -Modellierung), können Sie stattdessen das Argument `collate_fn` verwenden, um eine Funktion zu übergeben, die aufgerufen wird, um die -Liste von Stichproben in einen Stapel umwandelt und alle gewünschten Vorverarbeitungen vornimmt. Siehe unsere -[examples](https://github.com/huggingface/transformers/tree/main/examples) oder -[notebooks](https://huggingface.co/docs/transformers/notebooks), um diesen Ansatz in Aktion zu sehen. - -Sobald Sie einen `tf.data.Dataset` erstellt haben, können Sie das Modell wie zuvor kompilieren und anpassen: - -```py -model.compile(optimizer=Adam(3e-5)) - -model.fit(tf_dataset) -``` - - - - - - -## Trainieren in nativem PyTorch - - - - - -[`Trainer`] kümmert sich um die Trainingsschleife und ermöglicht die Feinabstimmung eines Modells in einer einzigen Codezeile. Für Benutzer, die es vorziehen, ihre eigene Trainingsschleife zu schreiben, können Sie auch eine Feinabstimmung eines 🤗 Transformers-Modells in nativem PyTorch vornehmen. - -An diesem Punkt müssen Sie möglicherweise Ihr Notebook neu starten oder den folgenden Code ausführen, um etwas Speicher freizugeben: - -```py -del model -del pytorch_model -del trainer -torch.cuda.empty_cache() -``` - -Als Nächstes müssen Sie den Datensatz `tokenized_dataset` manuell nachbearbeiten, um ihn für das Training vorzubereiten. - -1. Entfernen Sie die Spalte "Text", da das Modell keinen Rohtext als Eingabe akzeptiert: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Benennen Sie die Spalte "Label" in "Labels" um, da das Modell erwartet, dass das Argument "Labels" genannt wird: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. 
Stellen Sie das Format des Datensatzes so ein, dass PyTorch-Tensoren anstelle von Listen zurückgegeben werden: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Erstellen Sie dann eine kleinere Teilmenge des Datensatzes, wie zuvor gezeigt, um die Feinabstimmung zu beschleunigen: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Erstellen Sie einen `DataLoader` für Ihre Trainings- und Testdatensätze, damit Sie über die Datenstapel iterieren können: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Laden Sie Ihr Modell mit der Anzahl der erwarteten Kennzeichnungen: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Optimierer und Lernratensteuerung - -Erstellen Sie einen Optimierer und einen Scheduler für die Lernrate, um das Modell fein abzustimmen. Wir verwenden den Optimierer [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) aus PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Erstellen Sie den Standard-Lernratenplaner aus [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Geben Sie schließlich `device` an, um einen Grafikprozessor zu verwenden, wenn Sie Zugang zu einem solchen haben. Andernfalls kann das Training auf einer CPU mehrere Stunden statt ein paar Minuten dauern. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Holen Sie sich mit einem gehosteten Notebook wie [Colaboratory](https://colab.research.google.com/) oder [SageMaker StudioLab](https://studiolab.sagemaker.aws/) kostenlosen Zugang zu einem Cloud-GPU, wenn Sie noch keinen haben. - - - -Großartig, Sie sind bereit für das Training! 🥳 - -### Trainingsschleife - -Um Ihren Trainingsfortschritt zu verfolgen, verwenden Sie die [tqdm](https://tqdm.github.io/) Bibliothek, um einen Fortschrittsbalken über die Anzahl der Trainingsschritte hinzuzufügen: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Auswertung - -Genauso wie Sie eine Bewertungsfunktion zu [`Trainer`] hinzugefügt haben, müssen Sie dasselbe tun, wenn Sie Ihre eigene Trainingsschleife schreiben. Aber anstatt die Metrik am Ende jeder Epoche zu berechnen und zu melden, werden Sie dieses Mal alle Stapel mit [`~evaluate.add_batch`] akkumulieren und die Metrik ganz am Ende berechnen. 
- -```py ->>> import evaluate - ->>> metric = evaluate.load("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Zusätzliche Ressourcen - -Weitere Beispiele für die Feinabstimmung finden Sie unter: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) enthält Skripte - um gängige NLP-Aufgaben in PyTorch und TensorFlow zu trainieren. - -- [🤗 Transformers Notebooks](notebooks) enthält verschiedene Notebooks zur Feinabstimmung eines Modells für bestimmte Aufgaben in PyTorch und TensorFlow. \ No newline at end of file diff --git a/docs/source/de/transformers_agents.md b/docs/source/de/transformers_agents.md new file mode 100644 index 000000000000..1d676c395e17 --- /dev/null +++ b/docs/source/de/transformers_agents.md @@ -0,0 +1,323 @@ + + +# Transformers Agents + + + +Transformers Agents ist eine experimentelle API, die jederzeit geändert werden kann. Die von den Agenten zurückgegebenen Ergebnisse +zurückgegeben werden, können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können. + + + +Transformers Version v4.29.0, die auf dem Konzept von *Tools* und *Agenten* aufbaut. Sie können damit spielen in +[dieses Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj). + +Kurz gesagt, es bietet eine API für natürliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen +Agenten, um natürliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert, +aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden. + +Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann. Sie ist besonders leistungsfähig, wenn es um +Sie ist besonders leistungsstark, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen. + +```py +agent.run("Caption the following image", image=image) +``` + +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------| +| | A beaver is swimming in the water | + +--- + +```py +agent.run("Read the following text out loud", text=text) +``` +| **Input** | **Output** | +|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------| +| A beaver is swimming in the water | + +--- + +```py +agent.run( + "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?", + document=document, +) +``` +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|----------------| +| | ballroom foyer | + +## Schnellstart + +Bevor Sie `agent.run` verwenden können, müssen Sie einen Agenten instanziieren, der ein großes Sprachmodell (LLM) ist. 
+Wir bieten Unterstützung für openAI-Modelle sowie für OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI +Modelle sind leistungsfähiger (erfordern aber einen openAI-API-Schlüssel, können also nicht kostenlos verwendet werden); Hugging Face +bietet kostenlosen Zugang zu Endpunkten für BigCode- und OpenAssistant-Modelle. + +To start with, please install the `agents` extras in order to install all default dependencies. +```bash +pip install transformers[agents] +``` + +Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abhängigkeit installiert haben: + +```bash +pip install openai +``` + + +```py +from transformers import OpenAiAgent + +agent = OpenAiAgent(model="text-davinci-003", api_key="") +``` + +Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zunächst an, um Zugriff auf die Inference API zu erhalten: + +```py +from huggingface_hub import login + +login("") +``` + +Dann instanziieren Sie den Agenten + +```py +from transformers import HfAgent + +# Starcoder +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +# StarcoderBase +# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase") +# OpenAssistant +# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5") +``` + +Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verfügung stellt. Wenn Sie Ihren eigenen Inferenz +Endpunkt für dieses Modell (oder einen anderen) haben, können Sie die obige URL durch Ihren URL-Endpunkt ersetzen. + + + +StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Kontrollpunkte +nicht, wenn es um komplexere Aufforderungen geht. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI +Modell auszuprobieren, das zwar leider nicht quelloffen ist, aber zur Zeit eine bessere Leistung erbringt. + + + +Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verfügung stehen. + +### Einzelne Ausführung (run) + +Die Methode der einmaligen Ausführung ist die Verwendung der [`~Agent.run`] Methode des Agenten: + +```py +agent.run("Draw me a picture of rivers and lakes.") +``` + + + +Es wählt automatisch das (oder die) Werkzeug(e) aus, das (die) für die von Ihnen gewünschte Aufgabe geeignet ist (sind) und führt es (sie) entsprechend aus. Es +kann eine oder mehrere Aufgaben in der gleichen Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist ein +der Agent scheitern). + +```py +agent.run("Draw me a picture of the sea then transform the picture to add an island") +``` + + + +
+ + +Jede [`~Agent.run`] Operation ist unabhängig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausführen können. + +Beachten Sie, dass Ihr `Agent` nur ein großsprachiges Modell ist, so dass kleine Variationen in Ihrer Eingabeaufforderung völlig unterschiedliche Ergebnisse liefern können. +unterschiedliche Ergebnisse liefern. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wir gehen noch weiter ins Detail +wie man gute Prompts schreibt [hier](custom_tools#writing-good-user-inputs). + +Wenn Sie einen Status über Ausführungszeiten hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie +Variablen, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen, +und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun: + +```python +picture = agent.run("Generate a picture of rivers and lakes.") +updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture) +``` + + + +Dies kann hilfreich sein, wenn das Modell Ihre Anfrage nicht verstehen kann und die Werkzeuge verwechselt. Ein Beispiel wäre: + +```py +agent.run("Draw me the picture of a capybara swimming in the sea") +``` + +Hier könnte das Modell auf zwei Arten interpretieren: +- Die Funktion `Text-zu-Bild` erzeugt ein Wasserschwein, das im Meer schwimmt. +- Oder Sie lassen das `Text-zu-Bild` ein Wasserschwein erzeugen und verwenden dann das Werkzeug `Bildtransformation`, um es im Meer schwimmen zu lassen. + +Falls Sie das erste Szenario erzwingen möchten, können Sie dies tun, indem Sie die Eingabeaufforderung als Argument übergeben: + +```py +agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea") +``` + + + + +### Chat-basierte Ausführung (Chat) + +Der Agent verfügt auch über einen Chat-basierten Ansatz, der die Methode [`~Agent.chat`] verwendet: + +```py +agent.chat("Generate a picture of rivers and lakes") +``` + + + +```py +agent.chat("Transform the picture so that there is a rock in there") +``` + + + +
+
+
+Dies ist ein interessanter Ansatz, wenn Sie den Zustand über Anweisungen hinweg beibehalten möchten. Er eignet sich besser zum Experimentieren,
+kommt aber mit einzelnen Anweisungen deutlich besser zurecht als mit komplexen Anweisungen (die die [`~Agent.run`]-Methode
+besser verarbeiten kann).
+
+Diese Methode kann auch Argumente entgegennehmen, wenn Sie Nicht-Text-Typen oder bestimmte Aufforderungen übergeben möchten.
+
+### ⚠️ Fernausführung
+
+Zu Demonstrationszwecken und damit es mit allen Setups verwendet werden kann, haben wir Remote-Executors für mehrere
+der Standard-Tools erstellt, auf die der Agent in dieser Version Zugriff hat. Diese wurden mit
+[inference endpoints](https://huggingface.co/inference-endpoints) erstellt.
+
+Wir haben diese vorerst deaktiviert, aber um zu sehen, wie Sie Remote-Executor-Tools selbst einrichten können,
+empfehlen wir die Lektüre des [custom tool guide](./custom_tools).
+
+### Was passiert hier? Was sind Tools und was sind Agenten?
+
+
+
+#### Agenten
+
+Der "Agent" ist hier ein großes Sprachmodell, das wir so auffordern, dass es Zugang zu einem bestimmten Satz von Tools hat.
+
+LLMs sind ziemlich gut darin, kleine Codebeispiele zu erzeugen. Diese API macht sich das zunutze, indem sie das
+LLM auffordert, ein kleines Codebeispiel zu liefern, das eine Aufgabe mit einer Reihe von Werkzeugen ausführt. Diese Aufforderung wird dann um die
+Aufgabe, die Sie Ihrem Agenten geben, und um die Beschreibung der Werkzeuge, die Sie ihm geben, ergänzt. Auf diese Weise erhält das LLM Zugriff auf die Dokumentation der
+Tools, insbesondere auf die erwarteten Eingaben und Ausgaben, und kann den entsprechenden Code generieren.
+
+#### Tools
+
+Tools sind sehr einfach: Sie bestehen aus einer einzigen Funktion mit einem Namen und einer Beschreibung. Wir verwenden dann die Beschreibungen dieser Tools,
+um den Agenten aufzufordern. Anhand der Eingabeaufforderung zeigen wir dem Agenten, wie er die Tools nutzen kann, um das auszuführen, was
+in der Abfrage angefordert wurde.
+
+Dies geschieht mit brandneuen Tools und nicht mit Pipelines, denn der Agent schreibt besseren Code mit sehr atomaren Tools.
+Pipelines sind stärker refaktorisiert und fassen oft mehrere Aufgaben in einer einzigen zusammen. Tools sind dafür gedacht, sich auf
+eine einzige, sehr einfache Aufgabe zu konzentrieren.
+
+#### Code-Ausführung?!
+
+Dieser Code wird dann mit unserem kleinen Python-Interpreter auf den mit Ihren Tools übergebenen Eingaben ausgeführt.
+Wir hören Sie schon "Willkürliche Codeausführung!" rufen, aber lassen Sie uns erklären, warum das nicht der Fall ist.
+
+Die einzigen Funktionen, die aufgerufen werden können, sind die von Ihnen zur Verfügung gestellten Tools und die `print`-Funktion, so dass bereits eingeschränkt ist,
+was ausgeführt werden kann. Solange es auf die Hugging-Face-Tools beschränkt ist, sollten Sie auf der sicheren Seite sein.
+
+Außerdem lassen wir keine Attributsuche oder Importe zu (die ohnehin nicht benötigt werden, um die
+Eingaben/Ausgaben an eine kleine Gruppe von Funktionen zu übergeben), so dass alle offensichtlichen Angriffe (und Sie müssten das LLM
+schon gezielt dazu auffordern, solchen Code auszugeben) kein Problem darstellen sollten. Wenn Sie auf Nummer sicher gehen wollen, können Sie die
+run()-Methode mit dem zusätzlichen Argument return_code=True ausführen. In diesem Fall gibt der Agent nur den auszuführenden Code
+zurück, und Sie können selbst entscheiden, ob Sie ihn ausführen möchten oder nicht.
+ +Die Ausführung bricht bei jeder Zeile ab, in der versucht wird, eine illegale Operation auszuführen, oder wenn ein regulärer Python-Fehler +mit dem vom Agenten generierten Code. + +### Ein kuratierter Satz von Tools + +Wir haben eine Reihe von Tools identifiziert, die solche Agenten unterstützen können. Hier ist eine aktualisierte Liste der Tools, die wir integriert haben +in `transformers` integriert haben: + +- **Beantwortung von Fragen zu Dokumenten**: Beantworten Sie anhand eines Dokuments (z.B. PDF) im Bildformat eine Frage zu diesem Dokument ([Donut](./model_doc/donut)) +- Beantworten von Textfragen**: Geben Sie einen langen Text und eine Frage an, beantworten Sie die Frage im Text ([Flan-T5](./model_doc/flan-t5)) +- **Unbedingte Bildunterschriften**: Beschriften Sie das Bild! ([BLIP](./model_doc/blip)) +- **Bildfragebeantwortung**: Beantworten Sie bei einem Bild eine Frage zu diesem Bild ([VILT](./model_doc/vilt)) +- **Bildsegmentierung**: Geben Sie ein Bild und einen Prompt an und geben Sie die Segmentierungsmaske dieses Prompts aus ([CLIPSeg](./model_doc/clipseg)) +- **Sprache in Text**: Geben Sie eine Audioaufnahme einer sprechenden Person an und transkribieren Sie die Sprache in Text ([Whisper](./model_doc/whisper)) +- **Text in Sprache**: wandelt Text in Sprache um ([SpeechT5](./model_doc/speecht5)) +- **Zero-Shot-Textklassifizierung**: Ermitteln Sie anhand eines Textes und einer Liste von Bezeichnungen, welcher Bezeichnung der Text am ehesten entspricht ([BART](./model_doc/bart)) +- **Textzusammenfassung**: fassen Sie einen langen Text in einem oder wenigen Sätzen zusammen ([BART](./model_doc/bart)) +- **Übersetzung**: Übersetzen des Textes in eine bestimmte Sprache ([NLLB](./model_doc/nllb)) + +Diese Tools sind in Transformatoren integriert und können auch manuell verwendet werden, zum Beispiel: + +```py +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + +### Benutzerdefinierte Tools + +Wir haben zwar eine Reihe von Tools identifiziert, sind aber der festen Überzeugung, dass der Hauptwert dieser Implementierung darin besteht +die Möglichkeit, benutzerdefinierte Tools schnell zu erstellen und weiterzugeben. + +Indem Sie den Code eines Tools in einen Hugging Face Space oder ein Modell-Repository stellen, können Sie das Tool +direkt mit dem Agenten nutzen. Wir haben ein paar neue Funktionen hinzugefügt +**transformers-agnostic** Tools zur [`huggingface-tools` Organisation](https://huggingface.co/huggingface-tools) hinzugefügt: + +- **Text-Downloader**: zum Herunterladen eines Textes von einer Web-URL +- **Text zu Bild**: erzeugt ein Bild nach einer Eingabeaufforderung und nutzt dabei stabile Diffusion +- **Bildtransformation**: verändert ein Bild anhand eines Ausgangsbildes und einer Eingabeaufforderung, unter Ausnutzung der stabilen pix2pix-Diffusion +- **Text zu Video**: Erzeugen eines kleinen Videos nach einer Eingabeaufforderung, unter Verwendung von damo-vilab + +Das Text-zu-Bild-Tool, das wir von Anfang an verwendet haben, ist ein Remote-Tool, das sich in +[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! Wir werden +weiterhin solche Tools für diese und andere Organisationen veröffentlichen, um diese Implementierung weiter zu verbessern. + +Die Agenten haben standardmäßig Zugriff auf die Tools, die sich auf [*huggingface-tools*](https://huggingface.co/huggingface-tools) befinden. 
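+As a quick illustration, any of these Hub-hosted tools can also be loaded manually and handed to an agent when it is
+created. The snippet below is a sketch: the endpoint URL is only an example, and the `additional_tools` argument is
+assumed to be available as described in the custom tools guide rather than guaranteed here:
+
+```py
+from transformers import HfAgent, load_tool
+
+# Sketch: load one of the Hub-hosted tools manually...
+image_generator = load_tool("huggingface-tools/text-to-image")
+
+# ...and pass it to an agent at creation time (illustrative endpoint URL).
+agent = HfAgent(
+    "https://api-inference.huggingface.co/models/bigcode/starcoder",
+    additional_tools=[image_generator],
+)
+```
+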
+Wie Sie Ihre eigenen Tools schreiben und freigeben können und wie Sie jedes benutzerdefinierte Tool, das sich auf dem Hub befindet, nutzen können, erklären wir in [folgender Anleitung](custom_tools). + +### Code-Erzeugung + +Bisher haben wir gezeigt, wie Sie die Agenten nutzen können, um Aktionen für Sie durchzuführen. Der Agent generiert jedoch nur Code +den wir dann mit einem sehr eingeschränkten Python-Interpreter ausführen. Falls Sie den generierten Code in einer anderen Umgebung verwenden möchten +einer anderen Umgebung verwenden möchten, können Sie den Agenten auffordern, den Code zusammen mit einer Tooldefinition und genauen Importen zurückzugeben. + +Zum Beispiel die folgende Anweisung +```python +agent.run("Draw me a picture of rivers and lakes", return_code=True) +``` + +gibt den folgenden Code zurück + +```python +from transformers import load_tool + +image_generator = load_tool("huggingface-tools/text-to-image") + +image = image_generator(prompt="rivers and lakes") +``` + +die Sie dann selbst ändern und ausführen können. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml old mode 100755 new mode 100644 index 3573c6070cdc..d8e92f43b4f3 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -8,67 +8,55 @@ title: Get started - sections: - local: pipeline_tutorial - title: Pipelines for inference + title: Run inference with pipelines - local: autoclass_tutorial - title: Load pretrained instances with an AutoClass + title: Write portable code with AutoClass - local: preprocessing - title: Preprocess + title: Preprocess data - local: training title: Fine-tune a pretrained model + - local: run_scripts + title: Train with a script - local: accelerate - title: Distributed training with 🤗 Accelerate + title: Set up distributed training with 🤗 Accelerate + - local: peft + title: Load and train adapters with 🤗 PEFT - local: model_sharing - title: Share a model + title: Share your model + - local: transformers_agents + title: Agents + - local: llm_tutorial + title: Generation with LLMs title: Tutorials - sections: - - sections: - - local: create_a_model - title: Create a custom architecture - - local: custom_models - title: Sharing custom models - - local: run_scripts - title: Train with a script - - local: sagemaker - title: Run training on Amazon SageMaker - - local: converting_tensorflow_models - title: Converting from TensorFlow checkpoints - - local: serialization - title: Export to ONNX - - local: torchscript - title: Export to TorchScript - - local: troubleshooting - title: Troubleshoot - title: General usage - - sections: - - local: fast_tokenizers - title: Use tokenizers from 🤗 Tokenizers - - local: multilingual - title: Inference for multilingual models - - sections: - - local: tasks/sequence_classification - title: Text classification - - local: tasks/token_classification - title: Token classification - - local: tasks/question_answering - title: Question answering - - local: tasks/language_modeling - title: Language modeling - - local: tasks/translation - title: Translation - - local: tasks/summarization - title: Summarization - - local: tasks/multiple_choice - title: Multiple choice - title: Task guides - isExpanded: false + - isExpanded: false + sections: + - local: tasks/sequence_classification + title: Text classification + - local: tasks/token_classification + title: Token classification + - local: tasks/question_answering + title: Question answering + - local: tasks/language_modeling + title: Causal language modeling + - local: 
tasks/masked_language_modeling + title: Masked language modeling + - local: tasks/translation + title: Translation + - local: tasks/summarization + title: Summarization + - local: tasks/multiple_choice + title: Multiple choice title: Natural Language Processing - - sections: + - isExpanded: false + sections: - local: tasks/audio_classification title: Audio classification - local: tasks/asr title: Automatic speech recognition title: Audio - - sections: + - isExpanded: false + sections: - local: tasks/image_classification title: Image classification - local: tasks/semantic_segmentation @@ -77,22 +65,89 @@ title: Video classification - local: tasks/object_detection title: Object detection + - local: tasks/zero_shot_object_detection + title: Zero-shot object detection + - local: tasks/zero_shot_image_classification + title: Zero-shot image classification + - local: tasks/monocular_depth_estimation + title: Depth estimation title: Computer Vision + - isExpanded: false + sections: + - local: tasks/image_captioning + title: Image captioning + - local: tasks/document_question_answering + title: Document Question Answering + - local: tasks/visual_question_answering + title: Visual Question Answering + - local: tasks/text-to-speech + title: Text to speech + title: Multimodal + - isExpanded: false + sections: + - local: generation_strategies + title: Customize the generation strategy + title: Generation + - isExpanded: false + sections: + - local: tasks/idefics + title: Image tasks with IDEFICS + title: Prompting + title: Task Guides +- sections: + - local: fast_tokenizers + title: Use fast tokenizers from 🤗 Tokenizers + - local: multilingual + title: Run inference with multilingual models + - local: create_a_model + title: Use model-specific APIs + - local: custom_models + title: Share a custom model + - local: chat_templating + title: Templates for chat models + - local: sagemaker + title: Run training on Amazon SageMaker + - local: serialization + title: Export to ONNX + - local: tflite + title: Export to TFLite + - local: torchscript + title: Export to TorchScript + - local: benchmarks + title: Benchmarks + - local: notebooks + title: Notebooks with examples + - local: community + title: Community resources + - local: custom_tools + title: Custom Tools and Prompts + - local: troubleshooting + title: Troubleshoot + title: Developer guides +- sections: + - local: performance + title: Overview - sections: - - local: performance - title: Overview - local: perf_train_gpu_one - title: Training on one GPU + title: Methods and tools for efficient training on a single GPU - local: perf_train_gpu_many - title: Training on many GPUs + title: Multiple GPUs and parallelism - local: perf_train_cpu - title: Training on CPU + title: Efficient training on CPU - local: perf_train_cpu_many - title: Training on many CPUs + title: Distributed CPU training - local: perf_train_tpu title: Training on TPUs + - local: perf_train_tpu_tf + title: Training on TPU with TensorFlow - local: perf_train_special title: Training on Specialized Hardware + - local: perf_hardware + title: Custom hardware for training + - local: hpo_train + title: Hyperparameter Search using Trainer API + title: Efficient training techniques + - sections: - local: perf_infer_cpu title: Inference on CPU - local: perf_infer_gpu_one @@ -101,49 +156,45 @@ title: Inference on many GPUs - local: perf_infer_special title: Inference on Specialized Hardware - - local: perf_hardware - title: Custom hardware for training - - local: big_models - title: 
Instantiating a big model - - local: debugging - title: Debugging - - local: hpo_train - title: Hyperparameter Search using Trainer API - title: Performance and scalability - - sections: - - local: contributing - title: How to contribute to transformers? - - local: add_new_model - title: How to add a model to 🤗 Transformers? - - local: add_tensorflow_model - title: How to convert a 🤗 Transformers model to TensorFlow? - - local: add_new_pipeline - title: How to add a pipeline to 🤗 Transformers? - - local: testing - title: Testing - - local: pr_checks - title: Checks on a Pull Request - title: Contribute - - local: notebooks - title: 🤗 Transformers Notebooks - - local: community - title: Community resources - - local: benchmarks - title: Benchmarks - - local: migration - title: Migrating from previous packages - title: How-to guides + title: Optimizing inference + - local: big_models + title: Instantiating a big model + - local: debugging + title: Troubleshooting + - local: tf_xla + title: XLA Integration for TensorFlow Models + - local: perf_torch_compile + title: Optimize inference using `torch.compile()` + title: Performance and scalability +- sections: + - local: contributing + title: How to contribute to transformers? + - local: add_new_model + title: How to add a model to 🤗 Transformers? + - local: add_tensorflow_model + title: How to convert a 🤗 Transformers model to TensorFlow? + - local: add_new_pipeline + title: How to add a pipeline to 🤗 Transformers? + - local: testing + title: Testing + - local: pr_checks + title: Checks on a Pull Request + title: Contribute - sections: - local: philosophy title: Philosophy - local: glossary title: Glossary - local: task_summary - title: Summary of the tasks + title: What 🤗 Transformers can do + - local: tasks_explained + title: How 🤗 Transformers solve tasks - local: model_summary - title: Summary of the models + title: The Transformer model family - local: tokenizer_summary title: Summary of the tokenizers + - local: attention + title: Attention mechanisms - local: pad_truncation title: Padding and truncation - local: bertology @@ -152,9 +203,13 @@ title: Perplexity of fixed-length models - local: pipeline_webserver title: Pipelines for webserver inference + - local: model_memory_anatomy + title: Model training anatomy title: Conceptual guides - sections: - sections: + - local: main_classes/agent + title: Agents and Tools - local: model_doc/auto title: Auto Classes - local: main_classes/callback @@ -181,6 +236,8 @@ title: Pipelines - local: main_classes/processors title: Processors + - local: main_classes/quantization + title: Quantization - local: main_classes/tokenizer title: Tokenizer - local: main_classes/trainer @@ -233,10 +290,14 @@ title: CANINE - local: model_doc/codegen title: CodeGen + - local: model_doc/code_llama + title: CodeLlama - local: model_doc/convbert title: ConvBERT - local: model_doc/cpm title: CPM + - local: model_doc/cpmant + title: CPMANT - local: model_doc/ctrl title: CTRL - local: model_doc/deberta @@ -255,10 +316,16 @@ title: Encoder Decoder Models - local: model_doc/ernie title: ERNIE + - local: model_doc/ernie_m + title: ErnieM - local: model_doc/esm title: ESM + - local: model_doc/falcon + title: Falcon - local: model_doc/flan-t5 title: FLAN-T5 + - local: model_doc/flan-ul2 + title: FLAN-UL2 - local: model_doc/flaubert title: FlauBERT - local: model_doc/fnet @@ -279,6 +346,10 @@ title: GPT-J - local: model_doc/gpt2 title: GPT2 + - local: model_doc/gpt_bigcode + title: GPTBigCode + - local: 
model_doc/gptsan-japanese + title: GPTSAN Japanese - local: model_doc/gpt-sw3 title: GPTSw3 - local: model_doc/herbert @@ -287,12 +358,12 @@ title: I-BERT - local: model_doc/jukebox title: Jukebox - - local: model_doc/layoutlm - title: LayoutLM - local: model_doc/led title: LED - - local: model_doc/lilt - title: LiLT + - local: model_doc/llama + title: LLaMA + - local: model_doc/llama2 + title: Llama2 - local: model_doc/longformer title: Longformer - local: model_doc/longt5 @@ -307,16 +378,24 @@ title: MarkupLM - local: model_doc/mbart title: MBart and MBart-50 + - local: model_doc/mega + title: MEGA - local: model_doc/megatron-bert title: MegatronBERT - local: model_doc/megatron_gpt2 title: MegatronGPT2 + - local: model_doc/mistral + title: Mistral - local: model_doc/mluke title: mLUKE - local: model_doc/mobilebert title: MobileBERT - local: model_doc/mpnet title: MPNet + - local: model_doc/mpt + title: MPT + - local: model_doc/mra + title: MRA - local: model_doc/mt5 title: MT5 - local: model_doc/mvp @@ -325,14 +404,20 @@ title: NEZHA - local: model_doc/nllb title: NLLB + - local: model_doc/nllb-moe + title: NLLB-MoE - local: model_doc/nystromformer title: Nyströmformer + - local: model_doc/open-llama + title: Open-Llama - local: model_doc/opt title: OPT - local: model_doc/pegasus title: Pegasus - local: model_doc/pegasus_x title: PEGASUS-X + - local: model_doc/persimmon + title: Persimmon - local: model_doc/phobert title: PhoBERT - local: model_doc/plbart @@ -359,6 +444,8 @@ title: RoCBert - local: model_doc/roformer title: RoFormer + - local: model_doc/rwkv + title: RWKV - local: model_doc/splinter title: Splinter - local: model_doc/squeezebert @@ -369,14 +456,16 @@ title: T5 - local: model_doc/t5v1.1 title: T5v1.1 - - local: model_doc/tapas - title: TAPAS - local: model_doc/tapex title: TAPEX - local: model_doc/transfo-xl title: Transformer XL - local: model_doc/ul2 title: UL2 + - local: model_doc/umt5 + title: UMT5 + - local: model_doc/xmod + title: X-MOD - local: model_doc/xglm title: XGLM - local: model_doc/xlm @@ -387,6 +476,8 @@ title: XLM-RoBERTa - local: model_doc/xlm-roberta-xl title: XLM-RoBERTa-XL + - local: model_doc/xlm-v + title: XLM-V - local: model_doc/xlnet title: XLNet - local: model_doc/yoso @@ -402,26 +493,40 @@ title: Conditional DETR - local: model_doc/convnext title: ConvNeXT + - local: model_doc/convnextv2 + title: ConvNeXTV2 - local: model_doc/cvt title: CvT - local: model_doc/deformable_detr title: Deformable DETR - local: model_doc/deit title: DeiT + - local: model_doc/deta + title: DETA - local: model_doc/detr title: DETR - local: model_doc/dinat title: DiNAT + - local: model_doc/dinov2 + title: DINOV2 - local: model_doc/dit title: DiT - local: model_doc/dpt title: DPT + - local: model_doc/efficientformer + title: EfficientFormer + - local: model_doc/efficientnet + title: EfficientNet + - local: model_doc/focalnet + title: FocalNet - local: model_doc/glpn title: GLPN - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit title: LeViT + - local: model_doc/mask2former + title: Mask2Former - local: model_doc/maskformer title: MaskFormer - local: model_doc/mobilenet_v1 @@ -430,16 +535,22 @@ title: MobileNetV2 - local: model_doc/mobilevit title: MobileViT + - local: model_doc/mobilevitv2 + title: MobileViTV2 - local: model_doc/nat title: NAT - local: model_doc/poolformer title: PoolFormer + - local: model_doc/pvt + title: Pyramid Vision Transformer (PVT) - local: model_doc/regnet title: RegNet - local: model_doc/resnet title: ResNet - local: 
model_doc/segformer title: SegFormer + - local: model_doc/swiftformer + title: SwiftFormer - local: model_doc/swin title: Swin Transformer - local: model_doc/swinv2 @@ -450,6 +561,8 @@ title: Table Transformer - local: model_doc/timesformer title: TimeSformer + - local: model_doc/upernet + title: UperNet - local: model_doc/van title: VAN - local: model_doc/videomae @@ -458,10 +571,16 @@ title: Vision Transformer (ViT) - local: model_doc/vit_hybrid title: ViT Hybrid + - local: model_doc/vitdet + title: ViTDet - local: model_doc/vit_mae title: ViTMAE + - local: model_doc/vitmatte + title: ViTMatte - local: model_doc/vit_msn title: ViTMSN + - local: model_doc/vivit + title: ViViT - local: model_doc/yolos title: YOLOS title: Vision models @@ -469,10 +588,22 @@ sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer + - local: model_doc/bark + title: Bark + - local: model_doc/clap + title: CLAP + - local: model_doc/encodec + title: EnCodec - local: model_doc/hubert title: Hubert - local: model_doc/mctct title: MCTCT + - local: model_doc/mms + title: MMS + - local: model_doc/musicgen + title: MusicGen + - local: model_doc/pop2piano + title: Pop2Piano - local: model_doc/sew title: SEW - local: model_doc/sew-d @@ -481,10 +612,14 @@ title: Speech2Text - local: model_doc/speech_to_text_2 title: Speech2Text2 + - local: model_doc/speecht5 + title: SpeechT5 - local: model_doc/unispeech title: UniSpeech - local: model_doc/unispeech-sat title: UniSpeech-SAT + - local: model_doc/vits + title: VITS - local: model_doc/wav2vec2 title: Wav2Vec2 - local: model_doc/wav2vec2-conformer @@ -502,10 +637,18 @@ title: Audio models - isExpanded: false sections: + - local: model_doc/align + title: ALIGN - local: model_doc/altclip title: AltCLIP - local: model_doc/blip title: BLIP + - local: model_doc/blip-2 + title: BLIP-2 + - local: model_doc/bridgetower + title: BridgeTower + - local: model_doc/bros + title: BROS - local: model_doc/chinese_clip title: Chinese-CLIP - local: model_doc/clip @@ -514,6 +657,8 @@ title: CLIPSeg - local: model_doc/data2vec title: Data2Vec + - local: model_doc/deplot + title: DePlot - local: model_doc/donut title: Donut - local: model_doc/flava @@ -522,22 +667,46 @@ title: GIT - local: model_doc/groupvit title: GroupViT + - local: model_doc/idefics + title: IDEFICS + - local: model_doc/instructblip + title: InstructBLIP + - local: model_doc/layoutlm + title: LayoutLM - local: model_doc/layoutlmv2 title: LayoutLMV2 - local: model_doc/layoutlmv3 title: LayoutLMV3 - local: model_doc/layoutxlm title: LayoutXLM + - local: model_doc/lilt + title: LiLT - local: model_doc/lxmert title: LXMERT + - local: model_doc/matcha + title: MatCha + - local: model_doc/mgp-str + title: MGP-STR + - local: model_doc/nougat + title: Nougat + - local: model_doc/oneformer + title: OneFormer - local: model_doc/owlvit title: OWL-ViT - local: model_doc/perceiver title: Perceiver + - local: model_doc/pix2struct + title: Pix2Struct + - local: model_doc/sam + title: Segment Anything - local: model_doc/speech-encoder-decoder title: Speech Encoder Decoder Models + - local: model_doc/tapas + title: TAPAS - local: model_doc/trocr title: TrOCR + - local: model_doc/tvlt + title: TVLT - local: model_doc/vilt title: ViLT - local: model_doc/vision-encoder-decoder @@ -558,9 +727,18 @@ title: Reinforcement learning models - isExpanded: false sections: + - local: model_doc/autoformer + title: Autoformer + - local: model_doc/informer + title: Informer - local: model_doc/time_series_transformer 
title: Time Series Transformer title: Time series models + - isExpanded: false + sections: + - local: model_doc/graphormer + title: Graphormer + title: Graph models title: Models - sections: - local: internal/modeling_utils @@ -575,7 +753,11 @@ title: Utilities for Generation - local: internal/image_processing_utils title: Utilities for Image Processors + - local: internal/audio_utils + title: Utilities for Audio processing - local: internal/file_utils title: General Utilities + - local: internal/time_series_utils + title: Utilities for Time Series title: Internal Helpers - title: API \ No newline at end of file + title: API diff --git a/docs/source/en/accelerate.md b/docs/source/en/accelerate.md new file mode 100644 index 000000000000..b0f0e4efe647 --- /dev/null +++ b/docs/source/en/accelerate.md @@ -0,0 +1,136 @@ + + +# Distributed training with 🤗 Accelerate + +As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. + +## Setup + +Get started by installing 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Prepare to accelerate + +The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`]method: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Train + +Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. + +### Train with a script + +If you are running your training from a script, run the following command to create and save a configuration file: + +```bash +accelerate config +``` + +Then launch your training with: + +```bash +accelerate launch train.py +``` + +### Train with a notebook + +🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). diff --git a/docs/source/en/accelerate.mdx b/docs/source/en/accelerate.mdx deleted file mode 100644 index 02e05df39074..000000000000 --- a/docs/source/en/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Distributed training with 🤗 Accelerate - -As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. - -## Setup - -Get started by installing 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. - -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Prepare to accelerate - -The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. 
This includes your training and evaluation DataLoaders, a model and an optimizer: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Backward - -The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`]method: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! - -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Train - -Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. - -### Train with a script - -If you are running your training from a script, run the following command to create and save a configuration file: - -```bash -accelerate config -``` - -Then launch your training with: - -```bash -accelerate launch train.py -``` - -### Train with a notebook - -🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -For more information about 🤗 Accelerate and it's rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md new file mode 100644 index 000000000000..6766c8ecf048 --- /dev/null +++ b/docs/source/en/add_new_model.md @@ -0,0 +1,895 @@ + + +# How to add a model to 🤗 Transformers? + +The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. 
At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)). + + + +If you're interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide! + + + +Along the way, you'll: + +- get insights into open-source best practices +- understand the design principles behind one of the most popular deep learning libraries +- learn how to efficiently test large models +- learn how to integrate Python utilities like `black`, `ruff`, and `make fix-copies` to ensure clean and readable code + +A Hugging Face team member will be available to help you along the way so you'll never be alone. 🤗 ❤️ + +To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on it. + +Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already! + +## General overview of 🤗 Transformers + +First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a +chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we +found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 +Transformers while keeping maintenance costs at a reasonable level. + +A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). As a result of our way of working, there are some choices that we try to apply to all models: + +- Composition is generally favored over-abstraction +- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model +- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only + have to look into the respective `modeling_....py` file. + +In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for +inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the +person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code. + +With this in mind, let's go a bit deeper into the general library design. + +### Overview of models + +To successfully add a model, it is important to understand the interaction between your model and its config, +[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will +call the model to be added to 🤗 Transformers `BrandNewBert`. + +Let's take a look: + + + +As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute +minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` +inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and +that's it. 
As a general rule, we want to make sure that a new model only depends on +[`PreTrainedModel`]. The important functionalities that are automatically provided to every new +model are [`~PreTrainedModel.from_pretrained`] and +[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the +other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new +`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as +`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel` +as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a +configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in +[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes +inheriting from `BrandNewBertPreTrainedModel`: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +``` + +Similar to the model, the configuration inherits basic serialization and deserialization functionalities from +[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two +different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling +[`~PreTrainedModel.save_pretrained`] will automatically call +[`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved. + + +### Code style + +When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our +own regarding how code should be written :-) + +1. The forward pass of your model should be fully written in the modeling file while being fully independent of other + models in the library. If you want to reuse a block from another model, copy the code and paste it with a + `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) + for a good example and [there](pr_checks#check-copies) for more documentation on Copied from). +2. The code should be fully understandable, even by a non-native English speaker. This means you should pick + descriptive variable names and avoid abbreviations. As an example, `activation` is preferred to `act`. + One-letter variable names are strongly discouraged unless it's an index in a for loop. +3. More generally we prefer longer explicit code to short magical one. +4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone + using your code can quickly debug it by adding print statements or breaking points. +5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and + understandable than type annotations. + +### Overview of tokenizers + +Not quite ready yet :-( This section will be added soon! + +## Step-by-step recipe to add a model to 🤗 Transformers + +Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries +of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model: + +1. 
[Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf) +2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas) + +From experience, we can tell you that the most important things to keep in mind when adding a model are: + +- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist + somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy + from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your + friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and + your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code + is based on XLM. +- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an + efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper. +- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so we at Hugging Face are more + than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making + progress. + +In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. + +The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do +List: + +☐ (Optional) Understood the model's theoretical aspects
+☐ Prepared 🤗 Transformers dev environment
+☐ Set up debugging environment of the original repository
+☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
+☐ Successfully added the model skeleton to 🤗 Transformers
+☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
+☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
+☐ Finished model tests in 🤗 Transformers
+☐ Successfully added tokenizer in 🤗 Transformers
+☐ Run end-to-end integration tests
+☐ Finished docs
+☐ Uploaded model weights to the Hub
+☐ Submitted the pull request
+☐ (Optional) Added a demo notebook + +To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However, +if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive +into the `BrandNewBert`'s code-base. This option might suit you better if your engineering skills are better than +your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming +much more than reading scientific papers. + +### 1. (Optional) Theoretical aspects of BrandNewBert + +You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large +sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the +theoretical aspects, but rather focus on the practical ones, namely: + +- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like + encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those. +- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,* + summarization? +- What is the novel feature of the model that makes it different from BERT/GPT-2/BART? +- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most + similar to *brand_new_bert*? +- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used + for BERT or BART? + +After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the +Hugging Face team with any questions you might have. This might include questions regarding the model's architecture, +its attention layer, etc. We will be more than happy to help you. + +### 2. Next prepare your environment + +1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the + repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Set up a development environment, for instance by running the following command: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a +failure with this command. If that's the case make sure to install the Deep Learning framework you are working with +(PyTorch, TensorFlow and/or Flax) then do: + +```bash +pip install -e ".[quality]" +``` + +which should be enough for most use cases. You can then return to the parent directory + +```bash +cd .. +``` + +4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the + instructions on https://pytorch.org/get-started/locally/. + +**Note:** You don't need to have CUDA installed. 
Making the new model work on CPU is sufficient. + +5. To port *brand_new_bert*, you will also need access to its original repository: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers. + +### 3.-4. Run a pretrained checkpoint using the original repository + +At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very +“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should +be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people +stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make +it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement +models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**. + +You should start thereby by diving into the original repository. + +Successfully running the official pretrained model in the original repository is often **the most difficult** step. +From our experience, it is very important to spend some time getting familiar with the original code-base. You need to +figure out the following: + +- Where to find the pretrained weights? +- How to load the pretrained weights into the corresponding model? +- How to run the tokenizer independently from the model? +- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, + you only have to reimplement those functions. +- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes, + *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, + *e.g.* *self-attention*, *cross-attention*...? +- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you + work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm? + +It is very important that before you start the porting process, you can **efficiently** debug code in the original +repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or +even a pull request in the original repository. The maintainers of this repository are most likely very happy about +someone looking into their code! + +At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original +model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to +dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only +at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the +model also works as expected on GPU. + +In general, there are two possible debugging environments for running the original model + +- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) +- Local python scripts. 
+ +Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split +logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also, +notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging +Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them. + +The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend +some time adjusting to the new programming environment and you might not be able to use your known debugging tools +anymore, like `ipdb`. + +For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a +single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in +pseudocode): + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +Next, regarding the debugging strategy, there are generally a few from which to choose from: + +- Decompose the original model into many small testable components and run a forward pass on each of those for + verification +- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on + those, and use intermediate print statements or breakpoints for verification + +Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code +base. + +If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original +code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages +to taking the more difficult road in the beginning: + +- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically + for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead + of relying on visual comparison via print statements +- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting + individual components and thus structure your work better +- separating the model into logical meaningful components will help you to get a better overview of the model's design + and thus to better understand the model +- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue + changing your code + +[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA +gives a nice example of how this can be done. + +However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, +it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good +example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is +very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one +often relies on verifying print statements. 
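+If the original code base is written in PyTorch, forward hooks can be a lightweight middle ground between fully
+decomposing the model and sprinkling print statements everywhere. The snippet below is only a sketch: it assumes the
+original model is already loaded as `model`, and the module path `model.encoder.layers[0]` is hypothetical and has to
+be adapted to the structure of the model you are porting.
+
+```python
+import torch
+
+intermediate_outputs = {}
+
+
+def save_output(name):
+    # Returns a hook that stores the output of the hooked module under `name`.
+    def hook(module, inputs, output):
+        intermediate_outputs[name] = output
+
+    return hook
+
+
+# Hypothetical module path - adapt it to the original model's structure.
+model.encoder.layers[0].register_forward_hook(save_output("first_layer"))
+
+with torch.no_grad():
+    model(torch.tensor([[0, 4, 5, 2, 3, 7, 9]]))
+
+print(intermediate_outputs["first_layer"])
+```
+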
+ +No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the +starting layers first and the ending layers last. + +It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following +layers in the following order: + +1. Retrieve the input IDs passed to the model +2. Retrieve the word embeddings +3. Retrieve the input of the first Transformer layer +4. Retrieve the output of the first Transformer layer +5. Retrieve the output of the following n - 1 Transformer layers +6. Retrieve the output of the whole BrandNewBert Model + +Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` + +The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: + +``` +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], +``` + +We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original +model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! +Since it is normal that the exact same model written in different libraries can give a slightly different output +depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives +nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate +outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of +*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely +important. Here is some advice to make your debugging environment as efficient as possible. + +- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should + probably take the time to write a longer script that decomposes the original model into smaller sub-components to + retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on + TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output + intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when + running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196). +- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle + becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. + In case only very large checkpoints are available, it might make more sense to create a dummy model in the new + environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version + of your model +- Make sure you are using the easiest way of calling a forward pass in the original repository. 
Ideally, you want to + find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called + `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward` + multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`. +- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where + you have to input a string, then try to find out where in the forward call the string input is changed to input ids + and start from this point. This might mean that you have to possibly write a small script yourself or change the + original code so that you can directly input the ids instead of an input string. +- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield + random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging + environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed* + if the old and new implementations are in the same framework. + +The following section gives you more specific details/tips on how you can do this for *brand_new_bert*. + +### 5.-14. Port BrandNewBert to 🤗 Transformers + +Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork: + +```bash +cd transformers +``` + +In the special case that you are adding a model whose architecture exactly matches the model architecture of an +existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script). +In this case, you can just re-use the whole model architecture of the already existing model. + +Otherwise, let's start generating a new model. You have two choices here: + +- `transformers-cli add-new-model-like` to add a new model like an existing one +- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select) + +In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. The second command requires to install `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). + +**Open a Pull Request on the main huggingface/transformers repo** + +Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull +request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work +side-by-side on integrating the model into 🤗 Transformers. + +You should do the following: + +1. Create a branch with a descriptive name from your main branch + +```bash +git checkout -b add_brand_new_bert +``` + +2. Commit the automatically generated code: + +```bash +git add . +git commit +``` + +3. Fetch and rebase to current main + +```bash +git fetch upstream +git rebase upstream/main +``` + +4. Push the changes to your account using: + +```bash +git push -u origin a-descriptive-name-for-my-changes +``` + +5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. + +6. 
Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
+
+In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
+that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
+time to time by doing:
+
+```bash
+git fetch upstream
+git merge upstream/main
+```
+
+In general, all questions you might have regarding the model or your implementation should be asked in your PR and
+discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
+if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
+Face team can efficiently understand your problem or question.
+
+To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you
+want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved,
+you can click on the “Resolve” button of the created comment.
+
+In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
+on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
+Hugging Face team by Slack or email.
+
+**5. Adapt the generated model's code for brand_new_bert**
+
+At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
+found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
+`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.
+
+Now you can finally start coding :). The generated code in
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
+it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
+you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
+BART?* Implement those changes, which often means changing the *self-attention* layer, the order of the normalization
+layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to
+get a better feeling of how your model should be implemented.
+
+**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
+advised to add a first *unclean*, copy-pasted version of the original code to
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is
+added. From our experience, it is much more efficient to quickly add a first version of the required code and
+improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
+has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
+following command should work:
+
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+
+model = BrandNewBertModel(BrandNewBertConfig())
+```
+
+The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with
+random weights, thus making sure that the `init()` methods of all components work.
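+
+If you want a quicker check than the full default-sized model, a small sketch like the following can also help; the
+exact argument names (`hidden_size`, `num_hidden_layers`, `num_attention_heads`) are assumptions and depend on what you
+actually define in `BrandNewBertConfig`:
+
+```python
+from transformers import BrandNewBertModel, BrandNewBertConfig
+
+# Hypothetical tiny configuration for a fast instantiation check; adapt the
+# argument names to whatever your configuration class exposes.
+tiny_config = BrandNewBertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4)
+model = BrandNewBertModel(tiny_config)
+
+# Printing the module tree and the parameter count is a cheap way to verify
+# that all sub-components were built as expected.
+print(model)
+print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")
+```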
+
+Note that all random initialization should happen in the `_init_weights` method of your `BrandNewBertPreTrainedModel`
+class. It should initialize all leaf modules depending on the variables of the config. Here is an example with the
+BERT `_init_weights` method:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+```
+
+You can have some more custom schemes if you need a special initialization for some modules. For instance, in
+`Wav2Vec2ForPreTraining`, the last two linear layers need to have the initialization of the regular PyTorch `nn.Linear`
+but all the other ones should use an initialization as above. This is coded like this:
+
+```py
+def _init_weights(self, module):
+    """Initialize the weights"""
+    if isinstance(module, Wav2Vec2ForPreTraining):
+        module.project_hid.reset_parameters()
+        module.project_q.reset_parameters()
+        module.project_hid._is_hf_initialized = True
+        module.project_q._is_hf_initialized = True
+    elif isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+```
+
+The `_is_hf_initialized` flag is internally used to make sure we only initialize a submodule once. By setting it to
+`True` for `module.project_q` and `module.project_hid`, we make sure the custom initialization we did is not overridden
+later on, as the `_init_weights` function won't be applied to them.
+
+**6. Write a conversion script**
+
+Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
+the original repository to a checkpoint compatible with your newly created 🤗 Transformers implementation of
+*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
+existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
+the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
+slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
+existing conversion script for your model.
+
+- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
+- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+
+In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
+name of a layer is defined by the name of the class attribute you give the layer.
Let's define a dummy model in
+PyTorch, called `SimpleModel` as follows:
+
+```python
+from torch import nn
+
+
+class SimpleModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.dense = nn.Linear(10, 10)
+        self.intermediate = nn.Linear(10, 10)
+        self.layer_norm = nn.LayerNorm(10)
+```
+
+Now we can create an instance of this model definition, which will fill the `dense`, `intermediate`, and
+`layer_norm` layers with random weights. We can print the model to see its architecture:
+
+```python
+model = SimpleModel()
+
+print(model)
+```
+
+This will print out the following:
+
+```
+SimpleModel(
+  (dense): Linear(in_features=10, out_features=10, bias=True)
+  (intermediate): Linear(in_features=10, out_features=10, bias=True)
+  (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
+)
+```
+
+We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
+values of a specific layer:
+
+```python
+print(model.dense.weight.data)
+```
+
+to see that the weights were randomly initialized:
+
+```
+tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
+         -0.2077,  0.2157],
+        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
+          0.2166, -0.0212],
+        [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
+         -0.1023, -0.0447],
+        [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
+         -0.1876, -0.2467],
+        [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
+          0.2577,  0.0402],
+        [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
+          0.2132,  0.1680],
+        [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
+          0.2707, -0.2509],
+        [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
+          0.1829, -0.1568],
+        [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
+          0.0333, -0.0536],
+        [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
+          0.2220,  0.2358]])
+```
+
+In the conversion script, you should fill those randomly initialized weights with the exact weights of the
+corresponding layer in the checkpoint. *E.g.*
+
+```python
+# retrieve matching layer weights, e.g. by
+# recursive algorithm
+layer_name = "dense"
+pretrained_weight = array_of_dense_layer
+
+model_pointer = getattr(model, "dense")
+
+model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+```
+
+While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
+pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
+statements for the shape and print out the names of the checkpoint weights. E.g. you should add statements like:
+
+```python
+assert (
+    model_pointer.weight.shape == pretrained_weight.shape
+), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+```
+
+Besides, you should also print out the names of both weights to make sure they match, *e.g.*
+
+```python
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+```
+
+If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
+initialized layer of the 🤗 Transformers implementation.
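+
+Putting these checks together, a conversion loop often ends up looking roughly like the sketch below. The flat NumPy
+checkpoint dictionary and the `ORIGINAL_TO_HF` name mapping are illustrative assumptions; you will have to adapt both
+to how the original repository actually stores its weights:
+
+```python
+import torch
+
+# Hypothetical mapping from original checkpoint keys to 🤗 Transformers parameter
+# names; building this mapping is the core work of the conversion script.
+ORIGINAL_TO_HF = {
+    "encoder/dense/kernel": "dense.weight",
+    "encoder/dense/bias": "dense.bias",
+}
+
+
+def load_original_weights(model, original_state_dict):
+    hf_state_dict = model.state_dict()
+    for original_name, array in original_state_dict.items():
+        hf_name = ORIGINAL_TO_HF[original_name]
+        tensor = torch.from_numpy(array)
+        # Verify name and shape before assigning, as described above.
+        assert hf_name in hf_state_dict, f"{hf_name} is missing in the 🤗 Transformers model"
+        assert (
+            hf_state_dict[hf_name].shape == tensor.shape
+        ), f"Shape mismatch for {hf_name}: {hf_state_dict[hf_name].shape} vs {tensor.shape}"
+        print(f"Initialize PyTorch weight {hf_name} from {original_name}")
+        hf_state_dict[hf_name] = tensor
+    model.load_state_dict(hf_state_dict)
+    return model
+```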
+
+An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that
+do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
+PyTorch's implementation of a layer requires the weight to be transposed beforehand.
+
+Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
+were not used for initialization to make sure the model is correctly converted. It is completely normal that the
+conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because you
+either used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers
+implementation, have a bug in the `init()` functions of one of the components of the 🤗 Transformers
+implementation, or need to transpose one of the checkpoint weights.
+
+This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
+Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
+the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a
+`pytorch_model.bin` file and a `config.json` file:
+
+```python
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+**7. Implement the forward pass**
+
+Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
+sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
+pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
+implementation instead of the original one. It should look as follows:
+
+```python
+import torch
+
+model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])  # a dummy batch of size 1
+output = model(input_ids).last_hidden_state
+```
+
+It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
+same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
+you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
+used, leading to a *Dimensionality mismatch* error, or that the wrong data type object is used, *e.g.* `torch.long`
+instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help if you don't manage to solve
+certain errors.
+
+The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
+equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.*
+`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original
+implementation. Next, you should make sure that the output values are identical as well. This is one of the most
+difficult parts of adding a new model.
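+
+A minimal comparison sketch could look like this; it assumes `original_output` holds the final hidden states produced
+by the script you wrote for the original repository (converted to a `torch.Tensor`) and `output` holds the 🤗
+Transformers hidden states from the snippet above:
+
+```python
+import torch
+
+assert output.shape == original_output.shape, f"Shape mismatch: {output.shape} vs {original_output.shape}"
+
+# Printing the maximum absolute difference is often more informative than the
+# bare True/False returned by torch.allclose when hunting down a divergence.
+max_diff = (output - original_output).abs().max().item()
+print(f"Maximum absolute difference: {max_diff}")
+assert torch.allclose(output, original_output, atol=1e-3), "Outputs differ by more than 1e-3"
+```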
+Common reasons why the outputs are not identical are:
+
+- Some layers were not added, *e.g.* an *activation* layer was not added, or the residual connection was forgotten
+- The word embedding matrix was not tied
+- The wrong positional embeddings are used because the original implementation uses an offset
+- Dropout is applied during the forward pass. To fix this, make sure *model.training* is *False* and that no dropout
+  layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)
+
+The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
+Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
+intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗
+Transformers implementation shows a different output than the original implementation. First, make sure that the
+hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of
+the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
+network. At some point, you will notice a difference between the two implementations, which should point you to the bug
+in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
+in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
+respectively, and to successively remove print statements showing the same values for intermediate representations.
+
+Once you're confident that both implementations yield the same output, verify the outputs with
+`torch.allclose(original_output, output, atol=1e-3)`, and you're done with the most difficult part! Congratulations - the
+work left to be done should be a cakewalk 😊.
+
+**8. Adding all necessary model tests**
+
+At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
+fully comply with the required design. To make sure the implementation is fully compatible with 🤗 Transformers, all
+common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
+`tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Run this test file to verify that all common
+tests pass:
+
+```bash
+pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
+```
+
+Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that
+
+- a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
+- b) Future changes to your model will not break any important feature of the model.
+
+At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
+you used earlier to implement the model in 🤗 Transformers. A template of those model tests has already been added by the
+Cookiecutter, called `BrandNewBertModelIntegrationTests`, and only has to be filled out by you.
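+
+As a rough illustration, a filled-out integration test usually follows a pattern like the sketch below; the checkpoint
+name and the expected values are placeholders that you have to replace with the ones produced by the original
+implementation:
+
+```python
+import unittest
+
+import torch
+
+from transformers import BrandNewBertModel
+from transformers.testing_utils import require_torch, slow, torch_device
+
+
+@require_torch
+class BrandNewBertModelIntegrationTests(unittest.TestCase):
+    @slow
+    def test_inference_no_head(self):
+        model = BrandNewBertModel.from_pretrained("username/brand_new_bert").to(torch_device)
+        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]], device=torch_device)
+        with torch.no_grad():
+            output = model(input_ids).last_hidden_state
+
+        expected_shape = torch.Size((1, 9, model.config.hidden_size))
+        self.assertEqual(output.shape, expected_shape)
+
+        # Placeholder values; fill these in with the slice produced by the
+        # original implementation of brand_new_bert.
+        expected_slice = torch.tensor(
+            [[[-0.1465, -0.6501, 0.1993], [-0.4417, -0.5920, 0.3450], [-0.5009, -0.7122, 0.4548]]],
+            device=torch_device,
+        )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
+```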
+
+To ensure that those tests are passing, run
+
+```bash
+RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
+```
+
+
+
+In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1`
+
+
+
+Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under
+`BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two
+ways:
+
+- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
+  special features of *brand_new_bert* should work.
+- Future contributors can quickly test changes to the model by running those special tests.
+
+
+**9. Implement the tokenizer**
+
+Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an
+already existing tokenizer of 🤗 Transformers.
+
+It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗
+Transformers implementation of the tokenizer.
+
+To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
+that inputs a string and returns the `input_ids`. It could look similar to this (in pseudo-code):
+
+```python
+input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+input_ids = model.tokenize(input_str)
+```
+
+You might have to take a deeper look again into the original repository to find the correct tokenizer function, or you
+might even have to make changes to your clone of the original repository to only output the `input_ids`. Once you have
+written a functional tokenization script that uses the original repository, you should create an analogous script for
+🤗 Transformers. It should look similar to this:
+
+```python
+from transformers import BrandNewBertTokenizer
+
+input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+
+tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
+
+input_ids = tokenizer(input_str).input_ids
+```
+
+When both scripts yield the same `input_ids`, a tokenizer test file should also be added as a final step.
+
+Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
+contain a couple of hard-coded integration tests.
+
+**10. Run End-to-end integration tests**
+
+Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
+tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers.
+Such a test should show, on a meaningful
+text-to-text sample, that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
+include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
+of the ported checkpoints has been fine-tuned on a downstream task, it is enough to simply rely on the model tests. As a
+final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
+happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which would show up
+as an error in such a test. In case you have no access to a GPU, the Hugging Face team can take care of running those
+tests for you.
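+
+As a rough illustration of such an end-to-end test, and assuming *brand_new_bert* is a sequence-to-sequence model with
+a hypothetical `BrandNewBertForConditionalGeneration` head, it could look like this (the checkpoint name, input text
+and expected summary are placeholders):
+
+```python
+import unittest
+
+from transformers import BrandNewBertForConditionalGeneration, BrandNewBertTokenizer
+from transformers.testing_utils import require_torch, slow, torch_device
+
+
+@require_torch
+class BrandNewBertModelEndToEndTests(unittest.TestCase):
+    @slow
+    def test_summarization(self):
+        tokenizer = BrandNewBertTokenizer.from_pretrained("username/brand_new_bert")
+        model = BrandNewBertForConditionalGeneration.from_pretrained("username/brand_new_bert").to(torch_device)
+
+        article = "A placeholder article that you already summarized successfully with the original implementation."
+        inputs = tokenizer(article, return_tensors="pt").to(torch_device)
+        generated_ids = model.generate(**inputs, max_new_tokens=32)
+        summary = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # The expected string is a placeholder; use the summary produced by the
+        # original implementation for the same input.
+        self.assertEqual(summary, "A placeholder expected summary.")
+```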
+
+**11. Add Docstring**
+
+Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
+a nice docstring and a doc page. The Cookiecutter should have added a template file called
+`docs/source/model_doc/brand_new_bert.md` that you should fill out. Users of your model will usually first look at
+this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
+the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
+regarding the docstrings.
+
+Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is
+correct and includes all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always good to remind oneself that documentation should
+be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
+point of the community with the model.
+
+**Code refactor**
+
+Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
+incorrect code style by running:
+
+```bash
+make style
+```
+
+and verify that your coding style passes the quality check:
+
+```bash
+make quality
+```
+
+There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which will show up in
+the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
+naming. The Hugging Face team will surely help you if you're stuck here.
+
+Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
+tests passing, now it's a good time to go over the added code again and do some refactoring.
+
+You have now finished the coding part, congratulations! 🎉 You are awesome! 😎
+
+**12. Upload the models to the model hub**
+
+In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each
+uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading page](model_sharing). You should work alongside the Hugging Face team here to decide on a fitting name for each
+checkpoint and to get the required access rights to be able to upload the model under the author's organization of
+*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below:
+
+```python
+brand_new_bert.push_to_hub("brand_new_bert")
+# Uncomment the following line to push to an organization.
+# brand_new_bert.push_to_hub("<organization>/brand_new_bert")
+```
+
+It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
+specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint
+pretrained/fine-tuned? On what downstream task should the model be used? Also include some code showing how to
+correctly use the model.
+
+**13. (Optional) Add notebook**
+
+It is very helpful to add a notebook that showcases in detail how *brand_new_bert* can be used for inference and/or
+fine-tuned on a downstream task.
This is not mandatory to merge your PR, but very useful for the community. + +**14. Submit your finished PR** + +You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the +Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished +PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your +reviewer. + +### Share your work!! + +Now, it's time to get some credit from the community for your work! Having completed a model addition is a major +contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be +used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share +your achievements with the community. + +**You have made another model that is super easy to access for everyone in the community! 🤯** diff --git a/docs/source/en/add_new_model.mdx b/docs/source/en/add_new_model.mdx deleted file mode 100644 index d22f2326f852..000000000000 --- a/docs/source/en/add_new_model.mdx +++ /dev/null @@ -1,841 +0,0 @@ - - -# How to add a model to 🤗 Transformers? - -The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)). - - - -If you're interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide! - - - -Along the way, you'll: - -- get insights into open-source best practices -- understand the design principles behind one of the most popular deep learning libraries -- learn how to efficiently test large models -- learn how to integrate Python utilities like `black`, `isort`, and `make fix-copies` to ensure clean and readable code - -A Hugging Face team member will be available to help you along the way so you'll never be alone. 🤗 ❤️ - -To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on it. - -Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already! - -## General overview of 🤗 Transformers - -First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a -chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we -found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 -Transformers while keeping maintenance costs at a reasonable level. - -A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). 
As a result of our way of working, there are some choices that we try to apply to all models: - -- Composition is generally favored over-abstraction -- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model -- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only - have to look into the respective `modeling_....py` file. - -In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for -inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the -person that will use your model, but also everybody that will read, try to understand, and possibly tweak your code. - -With this in mind, let's go a bit deeper into the general library design. - -### Overview of models - -To successfully add a model, it is important to understand the interaction between your model and its config, -[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will -call the model to be added to 🤗 Transformers `BrandNewBert`. - -Let's take a look: - - - -As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute -minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` -inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and -that's it. As a general rule, we want to make sure that a new model only depends on -[`PreTrainedModel`]. The important functionalities that are automatically provided to every new -model are [`~PreTrainedModel.from_pretrained`] and -[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the -other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new -`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as -`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel` -as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a -configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in -[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes -inheriting from `BrandNewBertPreTrainedModel`: - -```python -model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") -model.config # model has access to its config -``` - -Similar to the model, the configuration inherits basic serialization and deserialization functionalities from -[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two -different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling -[`~PreTrainedModel.save_pretrained`] will automatically call -[`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved. - - -### Code style - -When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our -own regarding how code should be written :-) - -1. The forward pass of your model should be fully written in the modeling file while being fully independent of other - models in the library. 
If you want to reuse a block from another model, copy the code and paste it with a - `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) - for a good example). -2. The code should be fully understandable, even by a non-native English speaker. This means you should pick - descriptive variable names and avoid abbreviations. As an example, `activation` is preferred to `act`. - One-letter variable names are strongly discouraged unless it's an index in a for loop. -3. More generally we prefer longer explicit code to short magical one. -4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone - using your code can quickly debug it by adding print statements or breaking points. -5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and - understandable than type annotations. - -### Overview of tokenizers - -Not quite ready yet :-( This section will be added soon! - -## Step-by-step recipe to add a model to 🤗 Transformers - -Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries -of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model: - -1. [Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf) -2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas) - -From experience, we can tell you that the most important things to keep in mind when adding a model are: - -- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist - somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy - from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your - friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and - your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code - is based on XLM. -- It's more of an engineering challenge than a scientific challenge. You should spend more time on creating an - efficient debugging environment than trying to understand all theoretical aspects of the model in the paper. -- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so that we at Hugging Face are more - than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making - progress. - -In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. - -The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do -List: - -☐ (Optional) Understood the model's theoretical aspects
-☐ Prepared 🤗 Transformers dev environment
-☐ Set up debugging environment of the original repository
-☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
-☐ Successfully added the model skeleton to 🤗 Transformers
-☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
-☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
-☐ Finished model tests in 🤗 Transformers
-☐ Successfully added tokenizer in 🤗 Transformers
-☐ Run end-to-end integration tests
-☐ Finished docs
-☐ Uploaded model weights to the Hub
-☐ Submitted the pull request
-☐ (Optional) Added a demo notebook - -To begin with, we usually recommend to start by getting a good theoretical understanding of `BrandNewBert`. However, -if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive -into the `BrandNewBert`'s code-base. This option might suit you better, if your engineering skills are better than -your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming -much more than reading scientific papers. - -### 1. (Optional) Theoretical aspects of BrandNewBert - -You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large -sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is -not to get a deep theoretical understanding of the paper, but to extract the necessary information required to -effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the -theoretical aspects, but rather focus on the practical ones, namely: - -- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like - encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those. -- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,* - summarization? -- What is the novel feature of the model making it different from BERT/GPT-2/BART? -- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most - similar to *brand_new_bert*? -- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used - for BERT or BART? - -After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the -Hugging Face team with any questions you might have. This might include questions regarding the model's architecture, -its attention layer, etc. We will be more than happy to help you. - -### 2. Next prepare your environment - -1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the - repository's page. This creates a copy of the code under your GitHub user account. - -2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: - -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` - -3. Set up a development environment, for instance by running the following command: - -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` - -and return to the parent directory - -```bash -cd .. -``` - -4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the - instructions on https://pytorch.org/get-started/locally/. - -**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. - -5. To port *brand_new_bert*, you will also need access to its original repository: - -```bash -git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git -cd brand_new_bert -pip install -e . -``` - -Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers. - -### 3.-4. 
Run a pretrained checkpoint using the original repository - -At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very -“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should -be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people -stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make -it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement -models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**. - -You should start thereby by diving into the original repository. - -Successfully running the official pretrained model in the original repository is often **the most difficult** step. -From our experience, it is very important to spend some time getting familiar with the original code-base. You need to -figure out the following: - -- Where to find the pretrained weights? -- How to load the pretrained weights into the corresponding model? -- How to run the tokenizer independently from the model? -- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, - you only have to reimplement those functions. -- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes, - *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, - *e.g.* *self-attention*, *cross-attention*...? -- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you - work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm? - -It is very important that before you start the porting process, that you can **efficiently** debug code in the original -repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or -even a pull request in the original repository. The maintainers of this repository are most likely very happy about -someone looking into their code! - -At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original -model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to -dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only -at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the -model also works as expected on GPU. - -In general, there are two possible debugging environments for running the original model - -- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) -- Local python scripts. - -Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split -logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also, -notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging -Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you to work with them. 
- -The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend -some time adjusting to the new programming environment and that you might not be able to use your known debugging tools -anymore, like `ipdb`. - -For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a -single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in -pseudocode): - -```python -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids -original_output = model.predict(input_ids) -``` - -Next, regarding the debugging strategy, there are generally a few from which to choose from: - -- Decompose the original model into many small testable components and run a forward pass on each of those for - verification -- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on - those, and use intermediate print statements or breakpoints for verification - -Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code -base. - -If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original -code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages -to taking the more difficult road in the beginning: - -- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically - for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead - of relying on visual comparison via print statements -- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting - individual components and thus structure your work better -- separating the model into logical meaningful components will help you to get a better overview of the model's design - and thus to better understand the model -- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue - changing your code - -[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA -gives a nice example of how this can be done. - -However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, -it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good -example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is -very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one -often relies on verifying print statements. - -No matter which strategy you choose, the recommended procedure is often the same in that you should start to debug the -starting layers first and the ending layers last. - -It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following -layers in the following order: - -1. Retrieve the input IDs passed to the model -2. Retrieve the word embeddings -3. Retrieve the input of the first Transformer layer -4. Retrieve the output of the first Transformer layer -5. 
Retrieve the output of the following n - 1 Transformer layers -6. Retrieve the output of the whole BrandNewBert Model - -Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` - -The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: - -``` -[[ - [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], - [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], - [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], - ..., - [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], - [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], - [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], -``` - -We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original -model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! -Since it is normal that the exact same model written in different libraries can give a slightly different output -depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives -nearly the same output, they have to be the almost identical. Therefore, you will certainly compare the intermediate -outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of -*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely -important. Here is some advice is to make your debugging environment as efficient as possible. - -- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should - probably take the time to write a longer script that decomposes the original model into smaller sub-components to - retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on - TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output - intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when - running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196). -- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle - becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. - In case only very large checkpoints are available, it might make more sense to create a dummy model in the new - environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version - of your model -- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to - find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called - `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward` - multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`. -- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where - you have to input a string, then try to find out where in the forward call the string input is changed to input ids - and start from this point. 
This might mean that you have to possibly write a small script yourself or change the - original code so that you can directly input the ids instead of an input string. -- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield - random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging - environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed* - if the old and new implementations are in the same framework. - -The following section gives you more specific details/tips on how you can do this for *brand_new_bert*. - -### 5.-14. Port BrandNewBert to 🤗 Transformers - -Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork: - -```bash -cd transformers -``` - -In the special case that you are adding a model whose architecture exactly matches the model architecture of an -existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script). -In this case, you can just re-use the whole model architecture of the already existing model. - -Otherwise, let's start generating a new model. You have two choices here: - -- `transformers-cli add-new-model-like` to add a new model like an existing one -- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select) - -In both cases, you will be prompted with a questionnaire to fill the basic information of your model. The second command requires to install `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). - -**Open a Pull Request on the main huggingface/transformers repo** - -Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull -request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work -side-by-side on integrating the model into 🤗 Transformers. - -You should do the following: - -1. Create a branch with a descriptive name from your main branch - -```bash -git checkout -b add_brand_new_bert -``` - -2. Commit the automatically generated code: - -```bash -git add . -git commit -``` - -3. Fetch and rebase to current main - -```bash -git fetch upstream -git rebase upstream/main -``` - -4. Push the changes to your account using: - -```bash -git push -u origin a-descriptive-name-for-my-changes -``` - -5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the - GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for - future changes. - -6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. - -In the following, whenever you have done some progress, don't forget to commit your work and push it to your account so -that it shows in the pull request. Additionally, you should make sure to update your work with the current main from -time to time by doing: - -```bash -git fetch upstream -git merge upstream/main -``` - -In general, all questions you might have regarding the model or your implementation should be asked in your PR and -discussed/solved in the PR. 
This way, the Hugging Face team will always be notified when you are committing new code or -if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging -Face team can efficiently understand your problem or question. - -To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you -want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved, -you can click on the “Resolve” button of the created comment. - -In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions -on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the -Hugging Face team by Slack or email. - -**5. Adapt the generated models code for brand_new_bert** - -At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be -found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and -`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. - -Now you can finally start coding :). The generated code in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if -it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what -you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or -BART?*". Implement those changes which often means to change the *self-attention* layer, the order of the normalization -layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to -get a better feeling of how your model should be implemented. - -**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is -advised to add a first *unclean*, copy-pasted version of the original code to -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is -added. From our experience, it is much more efficient to quickly add a first version of the required code and -improve/correct the code iteratively with the conversion script as described in the next section. The only thing that -has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the -following command should work: - -```python -from transformers import BrandNewBertModel, BrandNewBertConfig - -model = BrandNewBertModel(BrandNewBertConfig()) -``` - -The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with -random weights, thus making sure that the `init()` methods of all components works. - -**6. Write a conversion script** - -Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in -the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of -*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already -existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in -the same framework as *brand_new_bert*. 
Usually, it is enough to copy an already existing conversion script and -slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already -existing conversion script for your model. - -- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91) -- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py) - -In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the -name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in -PyTorch, called `SimpleModel` as follows: - -```python -from torch import nn - - -class SimpleModel(nn.Module): - def __init__(self): - super().__init__() - self.dense = nn.Linear(10, 10) - self.intermediate = nn.Linear(10, 10) - self.layer_norm = nn.LayerNorm(10) -``` - -Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`, -`layer_norm` with random weights. We can print the model to see its architecture - -```python -model = SimpleModel() - -print(model) -``` - -This will print out the following: - -``` -SimpleModel( - (dense): Linear(in_features=10, out_features=10, bias=True) - (intermediate): Linear(in_features=10, out_features=10, bias=True) - (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) -) -``` - -We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight -values of a specific layer: - -```python -print(model.dense.weight.data) -``` - -to see that the weights were randomly initialized - -``` -tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, - -0.2077, 0.2157], - [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, - 0.2166, -0.0212], - [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, - -0.1023, -0.0447], - [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, - -0.1876, -0.2467], - [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, - 0.2577, 0.0402], - [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, - 0.2132, 0.1680], - [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, - 0.2707, -0.2509], - [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, - 0.1829, -0.1568], - [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, - 0.0333, -0.0536], - [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, - 0.2220, 0.2358]]). -``` - -In the conversion script, you should fill those randomly initialized weights with the exact weights of the -corresponding layer in the checkpoint. *E.g.* - -```python -# retrieve matching layer weights, e.g. 
by -# recursive algorithm -layer_name = "dense" -pretrained_weight = array_of_dense_layer - -model_pointer = getattr(model, "dense") - -model_pointer.weight.data = torch.from_numpy(pretrained_weight) -``` - -While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding -pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert -statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like: - -```python -assert ( - model_pointer.weight.shape == pretrained_weight.shape -), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" -``` - -Besides, you should also print out the names of both weights to make sure they match, *e.g.* - -```python -logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") -``` - -If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly -initialized layer of the 🤗 Transformers implementation. - -An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that -do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that -PyTorch's implementation of a layer requires the weight to be transposed beforehand. - -Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that -were not used for initialization to make sure the model is correctly converted. It is completely normal, that the -conversion trials fail with either a wrong shape statement or wrong name assignment. This is most likely because either -you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers -implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers -implementation or you need to transpose one of the checkpoint weights. - -This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the -Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save -the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a -`pytorch_model.bin` file and a `config.json` file: - -```python -model.save_pretrained("/path/to/converted/checkpoint/folder") -``` - -**7. Implement the forward pass** - -Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make -sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward -pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers -implementation instead of the original one. It should look as follows: - -```python -model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") -input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] -output = model(input_ids).last_hidden_states -``` - -It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact -same output the very first time or that the forward pass throws an error. 
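A minimal sketch of such a side-by-side check could look as follows. Note that `original_output.pt` is only an assumption here - it stands in for however you decide to export the output of the script you wrote for the original repository (for instance with `torch.save`):

```python
import torch

from transformers import BrandNewBertModel

model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
model.eval()  # make sure dropout is disabled

# use exactly the same hard-coded ids as in the script for the original repository
input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])

with torch.no_grad():
    output = model(input_ids).last_hidden_state

# output of the original implementation, saved beforehand from the original script
original_output = torch.load("original_output.pt")

print("Shapes:", original_output.shape, output.shape)
print("Max absolute difference:", (original_output - output).abs().max().item())
print("Outputs match:", torch.allclose(original_output, output, atol=1e-3))
```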
Don't be disappointed - it's expected! First, -you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are -used leading to a *Dimensionality mismatch* error or that the wrong data type object is used, *e.g.* `torch.long` -instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve -certain errors. - -The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are -equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.* -`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original -implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult -parts of adding a new model. Common mistakes why the outputs are not identical are: - -- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten -- The word embedding matrix was not tied -- The wrong positional embeddings are used because the original implementation uses on offset -- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout - layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout) - -The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗 -Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out -intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗 -Transformers implementation shows a different output than the original implementation. First, make sure that the -hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of -the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the -network. At some point, you will notice a difference between the two implementations, which should point you to the bug -in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements -in both the original implementation and 🤗 Transformers implementation, at the same positions in the network -respectively, and to successively remove print statements showing the same values for intermediate presentations. - -When you're confident that both implementations yield the same output, verifying the outputs with -`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the -work left to be done should be a cakewalk 😊. - -**8. Adding all necessary model tests** - -At this point, you have successfully added a new model. However, it is very much possible that the model does not yet -fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all -common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under -the same `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. 
Run this test file to verify that all common -tests pass: - -```bash -pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py -``` - -Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that - -- a) The community can easily understand your work by looking at specific tests of *brand_new_bert* -- b) Future changes to your model will not break any important feature of the model. - -At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts -you used earlier to implement the model to 🤗 Transformers. A template of those model tests is already added by the -Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those -tests are passing, run - -```bash -RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests -``` - - - -In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1` - - - -Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under -`BrandNewBertModelTester`/``BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two -ways: - -- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the - special features of *brand_new_bert* should work. -- Future contributors can quickly test changes to the model by running those special tests. - - -**9. Implement the tokenizer** - -Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent or very similar to an -already existing tokenizer of 🤗 Transformers. - -It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗 -Transformers' implementation of the tokenizer. - -To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository -that inputs a string and returns the `input_ids``. It could look similar to this (in pseudo-code): - -```python -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = model.tokenize(input_str) -``` - -You might have to take a deeper look again into the original repository to find the correct tokenizer function or you -might even have to do changes to your clone of the original repository to only output the `input_ids`. Having written -a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be -created. It should look similar to this: - -```python -from transformers import BrandNewBertTokenizer - -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." - -tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") - -input_ids = tokenizer(input_str).input_ids -``` - -When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added. - -Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should -contain a couple of hard-coded integration tests. - -**10. 
Run End-to-end integration tests** - -Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the -tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers. -Such a test should show on a meaningful -text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can -include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none -of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a -final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can -happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a -test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those -tests for you. - -**11. Add Docstring** - -Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is -a nice docstring and a doc page. The Cookiecutter should have added a template file called -`docs/source/model_doc/brand_new_bert.mdx` that you should fill out. Users of your model will usually first look at -this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for -the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team -regarding the docstrings. - -Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is -correct and included all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always to good to remind oneself that documentation should -be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact -point of the community with the model. - -**Code refactor** - -Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential -incorrect code style by running: - -```bash -make style -``` - -and verify that your coding style passes the quality check: - -```bash -make quality -``` - -There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in -the tests of your pull request. This is often because of some missing information in the docstring or some incorrect -naming. The Hugging Face team will surely help you if you're stuck here. - -Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all -tests passing, now it's a good time to go over the added code again and do some refactoring. - -You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎 - -**12. Upload the models to the model hub** - -In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each -uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading Page](model_sharing). 
You should work alongside the Hugging Face team here to decide on a fitting name for each -checkpoint and to get the required access rights to be able to upload the model under the author's organization of -*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below: - -```python -brand_new_bert.push_to_hub("brand_new_bert") -# Uncomment the following line to push to an organization. -# brand_new_bert.push_to_hub("/brand_new_bert") -``` - -It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the -specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint -pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to -correctly use the model. - -**13. (Optional) Add notebook** - -It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or -fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community. - -**14. Submit your finished PR** - -You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the -Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished -PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your -reviewer. - -### Share your work!! - -Now, it's time to get some credit from the community for your work! Having completed a model addition is a major -contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be -used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share -your achievement with the community. - -**You have made another model that is super easy to access for everyone in the community! 🤯** diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md new file mode 100644 index 000000000000..70f62bf9909e --- /dev/null +++ b/docs/source/en/add_new_pipeline.md @@ -0,0 +1,258 @@ + + +# How to create a custom pipeline? + +In this guide, we will see how to create a custom pipeline and share it on the [Hub](hf.co/models) or add it to the +🤗 Transformers library. + +First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, +dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible +as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the +pipeline (`preprocess`). + +Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of +`postprocess` method. + +Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, +`_forward`, `postprocess`, and `_sanitize_parameters`. 
+ + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing +pre/postprocessing on the CPU on different threads + +`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might +contain more information and is usually a `Dict`. + +`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred +called method as it contains safeguards to make sure everything is working on the expected device. If anything is +linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess. + +`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided +earlier. + +`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization +time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. + +The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, +`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That +allows to keep the default arguments in the function definition which is always more "natural". + +A classic example would be a `top_k` argument in the post processing in classification tasks. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit +`_sanitize_parameters` to allow this new parameter. + + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy +without requiring users to understand new kinds of objects. 
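For instance, the `top_k` logic above could be completed so that `postprocess` returns plain Python types only, which keeps the output JSON-serializable. This is just one possible sketch - it assumes a single input (batch index 0) and a model config that defines `id2label`:

```python
def postprocess(self, model_outputs, top_k=5):
    probs = model_outputs["logits"][0].softmax(-1)
    scores, ids = probs.topk(top_k)
    # plain Python floats and strings only, so the output is JSON-serializable
    return [
        {"label": self.model.config.id2label[int(i)], "score": float(s)}
        for s, i in zip(scores, ids)
    ]
```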
It's also relatively common to support many different types +of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes) + + + +## Adding it to the list of supported tasks + +To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +``` + +## Share your pipeline on the Hub + +To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a +python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in +a file named `pair_classification.py`, we can then import it and register it like this: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been +fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. 
+ +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Then we can share it on the Hub by using the `save_pretrained` method in a `Repository`: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, +along with saving the model and tokenizer of the pipeline, before pushing everything into the repository +`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option +`trust_remote_code=True`: + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Add the pipeline to 🤗 Transformers + +If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule +with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`. + +Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other tests. + +The `run_pipeline_test` function will be very generic and run on small random models on every possible +architecture as defined by `model_mapping` and `tf_model_mapping`. + +This is very important to test future compatibility, meaning if someone adds a new model for +`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's +impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the +output of the pipeline TYPE. + +You also *need* to implement 2 (ideally 4) tests. + +- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. The results should be the same as `test_small_model_tf`. +- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. The results should be the same as `test_small_model_pt`. +- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to + make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. +- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to + make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. diff --git a/docs/source/en/add_new_pipeline.mdx b/docs/source/en/add_new_pipeline.mdx deleted file mode 100644 index b0cc2cd0ff72..000000000000 --- a/docs/source/en/add_new_pipeline.mdx +++ /dev/null @@ -1,254 +0,0 @@ - - -# How to create a custom pipeline? - -In this guide, we will see how to create a custom pipeline and share it on the [Hub](hf.co/models) or add it to the -🤗 Transformers library. - -First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, -dictionaries or whatever seems to be the most likely desired input. 
Try to keep these inputs as pure Python as possible -as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the -pipeline (`preprocess`). - -Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of -`postprocess` method. - -Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, -`_forward`, `postprocess`, and `_sanitize_parameters`. - - -```python -from transformers import Pipeline - - -class MyPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} - - def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Maybe {"logits": Tensor(...)} - return outputs - - def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class -``` - -The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing -pre/postprocessing on the CPU on different threads - -`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might -contain more information and is usually a `Dict`. - -`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred -called method as it contains safeguards to make sure everything is working on the expected device. If anything is -linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess. - -`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided -earlier. - -`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization -time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - -The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, -`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That -allows to keep the default arguments in the function definition which is always more "natural". - -A classic example would be a `top_k` argument in the post processing in classification tasks. - -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] - ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] -``` - -In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit -`_sanitize_parameters` to allow this new parameter. 
- - -```python -def postprocess(self, model_outputs, top_k=5): - best_class = model_outputs["logits"].softmax(-1) - # Add logic to handle top_k - return best_class - - -def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - - postprocess_kwargs = {} - if "top_k" in kwargs: - postprocess_kwargs["top_k"] = kwargs["top_k"] - return preprocess_kwargs, {}, postprocess_kwargs -``` - -Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy -without requiring users to understand new kind of objects. It's also relatively common to support many different types -of arguments for ease of use (audio files, can be filenames, URLs or pure bytes) - - - -## Adding it to the list of supported tasks - -To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: - -```python -from transformers.pipelines import PIPELINE_REGISTRY - -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, -) -``` - -You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: - -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # current support type: text, audio, image, multimodal -) -``` - -## Share your pipeline on the Hub - -To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a -python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: - -```py -import numpy as np - -from transformers import Pipeline - - -def softmax(outputs): - maxes = np.max(outputs, axis=-1, keepdims=True) - shifted_exp = np.exp(outputs - maxes) - return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - - -class PairClassificationPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "second_text" in kwargs: - preprocess_kwargs["second_text"] = kwargs["second_text"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, second_text=None): - return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) - - def _forward(self, model_inputs): - return self.model(**model_inputs) - - def postprocess(self, model_outputs): - logits = model_outputs.logits[0].numpy() - probabilities = softmax(logits) - - best_class = np.argmax(probabilities) - label = self.model.config.id2label[best_class] - score = probabilities[best_class].item() - logits = logits.tolist() - return {"label": label, "score": score, "logits": logits} -``` - -The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. 
If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this: - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been -fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. - -```py -from transformers import pipeline - -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Then we can share it on the Hub by using the `save_pretrained` method in a `Repository`: - -```py -from huggingface_hub import Repository - -repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") -classifier.save_pretrained("test-dynamic-pipeline") -repo.push_to_hub() -``` - -This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, -along with saving the model and tokenizer of the pipeline, before pushing everything in the repository -`{your_username}/test-dynamic-pipeline`. After that anyone can use it as long as they provide the option -`trust_remote_code=True`: - -```py -from transformers import pipeline - -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) -``` - -## Add the pipeline to 🤗 Transformers - -If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule -with the code of your pipeline, then add it in the list of tasks defined in `pipelines/__init__.py`. - -Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with example with the other tests. - -The `run_pipeline_test` function will be very generic and run on small random models on every possible -architecture as defined by `model_mapping` and `tf_model_mapping`. - -This is very important to test future compatibility, meaning if someone adds a new model for -`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's -impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the -output of the pipeline TYPE. - -You also *need* to implement 2 (ideally 4) tests. - -- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_tf`. -- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. -- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. 
Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. diff --git a/docs/source/en/add_tensorflow_model.md b/docs/source/en/add_tensorflow_model.md new file mode 100644 index 000000000000..7ea81a9fe976 --- /dev/null +++ b/docs/source/en/add_tensorflow_model.md @@ -0,0 +1,356 @@ + + +# How to convert a 🤗 Transformers model to TensorFlow? + +Having multiple frameworks available to use with 🤗 Transformers gives you flexibility to play their strengths when +designing your application, but it implies that compatibility must be added on a per-model basis. The good news is that +adding TensorFlow compatibility to an existing model is simpler than [adding a new model from scratch](add_new_model)! +Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or +enable TensorFlow for your model of choice, this guide is for you. + +This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or +architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model +is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶. +Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we +highly encourage that you suggest improvements to this guide! + +Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers: +- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) +- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy) + +In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the +procedure to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML +frameworks. Let's get started! + + + +Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture? + +  + +Check the `model_type` field of the `config.json` of your model of choice +([example](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in +🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow +architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). + + + + +## Step-by-step guide to add TensorFlow model architecture code + +There are many ways to design a large model architecture, and multiple ways of implementing said design. However, +you might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) +that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. From +experience, we can tell you a few important things about adding TensorFlow models: + +- Don't reinvent the wheel! More often than not, there are at least two reference implementations you should check: the +PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems. +- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather +because the code is clear, easy to debug and build upon. 
If you make the life of the maintainers easy with your +TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch +to the PyTorch implementation, you ensure your contribution will be long lived. +- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the same +problems you're facing. + +Here's an overview of the steps needed to add a TensorFlow model architecture: +1. Select the model you wish to convert +2. Prepare transformers dev environment +3. (Optional) Understand theoretical aspects and the existing implementation +4. Implement the model architecture +5. Implement model tests +6. Submit the pull request +7. (Optional) Build demos and share with the world + +### 1.-3. Prepare your model contribution + +**1. Select the model you wish to convert** + +Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you +don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to +maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow +side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in +🤗 Transformers but is lacking weights, feel free to jump straight into the +[weight conversion section](#adding-tensorflow-weights-to-hub) +of this page. + +For simplicity, the remainder of this guide assumes you've decided to contribute with the TensorFlow version of +*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch). + + + +Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort to do so. +You can search for `BrandNewBert` on the +[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no +TensorFlow-related pull request. + + + + +**2. Prepare transformers dev environment** + +Having selected the model architecture, open a draft PR to signal your intention to work on it. Follow the +instructions below to set up your environment and open a draft PR. + +1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the + repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. Set up a development environment, for instance by running the following command: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a +failure with this command. If that's the case make sure to install TensorFlow then do: + +```bash +pip install -e ".[quality]" +``` + +**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. + +4. Create a branch with a descriptive name from your main branch + +```bash +git checkout -b add_tf_brand_new_bert +``` + +5. Fetch and rebase to current main + +```bash +git fetch upstream +git rebase upstream/main +``` + +6. 
Add an empty `.py` file in `transformers/src/models/brandnewbert/` named `modeling_tf_brandnewbert.py`. This will +be your TensorFlow model file. + +7. Push the changes to your account using: + +```bash +git add . +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert +``` + +8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. + +9. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. + + +Now you have set up a development environment to port *BrandNewBert* to TensorFlow in 🤗 Transformers. + + +**3. (Optional) Understand theoretical aspects and the existing implementation** + +You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large +sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too +much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation +page (e.g. [model docs for BERT](model_doc/bert)). + +After you've grasped the basics of the models you are about to implement, it's important to understand the existing +implementation. This is a great chance to confirm that a working implementation matches your expectations for the +model, as well as to foresee technical challenges on the TensorFlow side. + +It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is +definitely not a requirement that you understand all facets of the model at this stage. Nevertheless, we highly +encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/). + + +### 4. Model implementation + +Now it's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of +`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into +`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of +🤗 Transformers such that you can import `TFBrandNewBert` and +`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model. + +Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of +tips to make the process as smooth as possible: +- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`). +- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to + `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. If you're not sure + about a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf) + or the [PyTorch documentation](https://pytorch.org/docs/stable/). +- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct + replacement, the odds are that someone else already had the same problem. 
+- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, track + issues, and add fixes down the line. +- Some layers have different default values in each framework. A notable example is the batch normalization layer's + epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) + and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)). + Double-check the documentation! +- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()`. See the following + example: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / + [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) +- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also + borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture. +- Assigning the `name` attribute correctly in TensorFlow functions is critical to do the `from_pt=True` weight + cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not + properly set, you will see it in the error message when loading the model weights. +- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras + layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). + `TFBrandNewBertModel` will simply be a wrapper around this layer. +- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel` + will need to hold an example of inputs to the model, the `dummy_inputs` + ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)). +- If you get stuck, ask for help - we're here to help you! 🤗 + +In addition to the model file itself, you will also need to add the pointers to the model classes and related +documentation pages. You can complete this part entirely following the patterns in other PRs +([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual +changes: +- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py` +- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py` +- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py` +- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py` +- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md` +- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.md` +- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/en/index.md` + +When you're happy with your implementation, run the following checklist to confirm that your model architecture is +ready: +1. 
All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is +propagated all the way from the top-level classes +2. You have used `#copied from ...` whenever possible +3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs` +4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable` +5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` +6. You can call the TensorFlow model using the expected input format + + +### 5. Add model tests + +Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as +expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in +`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary +TensorFlow replacements. For now, in all `.from_pretrained()` calls, you should use the `from_pt=True` flag to load +the existing PyTorch weights. + +After you're done, it's time for the moment of truth: run the tests! 😬 + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is +notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest +problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. +In other cases, a general test might not be directly applicable to your model, in which case we suggest an override +at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if +you're stuck. + +When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! 🎉 + +### 6.-7. Ensure everyone can use your model + +**6. Submit the pull request** + +Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code, +run our code formatting utility, `make fixup` 🪄. This will automatically fix any formatting issues, which would cause +our automatic checks to fail. + +It's now time to convert your draft pull request into a real pull request. To do so, click on the "Ready for +review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need +at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model. + +After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag in +`.from_pretrained()` calls. Since there are no TensorFlow weights, you will have to add them! Check the section +below for instructions on how to do it. + +Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are +green, double-check the tests locally one last time + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +and we will merge your PR! Congratulations on the milestone 🎉 + +**7. (Optional) Build demos and share with the world** + +One of the hardest parts about open-source is discovery. 
How can the other users learn about the existence of your +fabulous TensorFlow contribution? With proper communication, of course! 📣 + +There are two main ways to share your model with the community: +- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly + encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community). +- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share + your achievement with the community - your model can now be used by thousands of engineers and researchers around + the world 🌍! We will be happy to retweet your posts and help you share your work with the community. + + +## Adding TensorFlow weights to 🤗 Hub + +Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into +TensorFlow weights is a breeze! + +Here's how to do it: +1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command + `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens)) +2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository + containing the PyTorch weights you want to convert +3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created + +That's it! 🎉 + + +## Debugging mismatches across ML frameworks 🐛 + +At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you +might come across errors complaining about mismatches between PyTorch and TensorFlow. You might even decide to open the +model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔 + +First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗 +Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch +between the two frameworks, it implies that the model is not following the reference implementation for at least one +of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is +arguably worse than a model that fails to run at all! To that end, we aim at having a framework mismatch smaller than +`1e-5` at all stages of the model. + +As in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret +ingredient here is patience. Here is our suggested workflow for when you come across this type of issues: +1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a + certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the + numerical variables in a top-down fashion until you find the source of the problems. +2. Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. It is possible + that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages + like StackOverflow and GitHub issues. +3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the + issue, so you can focus on the problematic instruction, abstracting away the rest of the model! 
The bad news is + that you'll have to venture into the source implementation of said instruction. In some cases, you might find an + issue with a reference implementation - don't abstain from opening an issue in the upstream repository. + +In some cases, in discussion with the 🤗 Transformers team, we might find that fixing the mismatch is infeasible. +When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we +might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error` +flag to override the error message at weight conversion time. diff --git a/docs/source/en/add_tensorflow_model.mdx b/docs/source/en/add_tensorflow_model.mdx deleted file mode 100644 index e145a7d00184..000000000000 --- a/docs/source/en/add_tensorflow_model.mdx +++ /dev/null @@ -1,346 +0,0 @@ - - -# How to convert a 🤗 Transformers model to TensorFlow? - -Having multiple frameworks available to use with 🤗 Transformers gives you flexibility to play their strengths when -designing your application, but it implies that compatibility must be added on a per-model basis. The good news is that -adding TensorFlow compatibility to an existing model is simpler than [adding a new model from scratch](add_new_model)! -Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or -enable TensorFlow for your model of choice, this guide is for you. - -This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or -architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model -is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶. -Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we -highly encourage that you suggest improvements to this guide! - -Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers: -- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) -- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy) - -In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the -procedure to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML -frameworks. Let's get started! - - - -Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture? - -  - -Check the `model_type` field of the `config.json` of your model of choice -([example](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in -🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow -architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). - - - - -## Step-by-step guide to add TensorFlow model architecture code - -There are many ways to design a large model architecture, and multiple ways of implementing said design. However, -you might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) -that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. 
From -experience, we can tell you a few important things about adding TensorFlow models: - -- Don't reinvent the wheel! More often that not, there are at least two reference implementations you should check: the -PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems. -- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather -because the code is clear, easy to debug and build upon. If you make the life of the maintainers easy with your -TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch -to the PyTorch implementation, you ensure your contribution will be long lived. -- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the same -problems you're facing. - -Here's an overview of the steps needed to add a TensorFlow model architecture: -1. Select the model you wish to convert -2. Prepare transformers dev environment -3. (Optional) Understand theoretical aspects and the existing implementation -4. Implement the model architecture -5. Implement model tests -6. Submit the pull request -7. (Optional) Build demos and share with the world - -### 1.-3. Prepare your model contribution - -**1. Select the model you wish to convert** - -Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you -don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to -maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow -side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in -🤗 Transformers but is lacking weights, feel free to jump straight into the -[weight conversion section](#adding-tensorflow-weights-to-hub) -of this page. - -For simplicity, the remainder of this guide assumes you've decided to contribute with the TensorFlow version of -*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch). - - - -Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort to do so. -You can search for `BrandNewBert` on the -[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no -TensorFlow-related pull request. - - - - -**2. Prepare transformers dev environment** - -Having selected the model architecture, open an draft PR to signal your intention to work on it. Follow the -instructions below to set up your environment and open a draft PR. - -1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the - repository's page. This creates a copy of the code under your GitHub user account. - -2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: - -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` - -3. Set up a development environment, for instance by running the following command: - -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` - -**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. - -4. 
Create a branch with a descriptive name from your main branch - -```bash -git checkout -b add_tf_brand_new_bert -``` - -5. Fetch and rebase to current main - -```bash -git fetch upstream -git rebase upstream/main -``` - -6. Add an empty `.py` file in `transformers/src/models/brandnewbert/` named `modeling_tf_brandnewbert.py`. This will -be your TensorFlow model file. - -7. Push the changes to your account using: - -```bash -git add . -git commit -m "initial commit" -git push -u origin add_tf_brand_new_bert -``` - -8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the - GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for - future changes. - -9. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. - - -Now you have set up a development environment to port *BrandNewBert* to TensorFlow in 🤗 Transformers. - - -**3. (Optional) Understand theoretical aspects and the existing implementation** - -You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large -sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is -not to get a deep theoretical understanding of the paper, but to extract the necessary information required to -effectively re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too -much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation -page (e.g. [model docs for BERT](model_doc/bert)). - -After you've grasped the basics of the models you are about to implement, it's important to understand the existing -implementation. This is a great chance to confirm that a working implementation matches your expectations for the -model, as well as to foresee technical challenges on the TensorFlow side. - -It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is -definitely not a requirement that you understand all facets of the model at this stage. Nevertheless, we highly -encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/). - - -### 4. Model implementation - -Now it's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of -`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into -`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of -🤗 Transformers such that you can import `TFBrandNewBert` and -`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert* model. - -Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of -tips to make the process as smooth as possible: -- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`). -- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to - `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. 
If you're not sure - about a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf) - or the [PyTorch documentation](https://pytorch.org/docs/stable/). -- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct - replacement, the odds are that someone else already had the same problem. -- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, track - issues, and add fixes down the line. -- Some layers have different default values in each framework. A notable example is the batch normalization layer's - epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d) - and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)). - Double-check the documentation! -- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()`. See the following - example: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / - [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) -- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also - borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture. -- Assigning the `name` attribute correctly in TensorFlow functions is critical to do the `from_pt=True` weight - cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not - properly set, you will see it in the error message when loading the model weights. -- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras - layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)). - `TFBrandNewBertModel` will simply be a wrapper around this layer. -- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel` - will need to hold an example of inputs to the model, the `dummy_inputs` - ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)). -- If you get stuck, ask for help - we're here to help you! 🤗 - -In addition to the model file itself, you will also need to add the pointers to the model classes and related -documentation pages. You can complete this part entirely following the patterns in other PRs -([example](https://github.com/huggingface/transformers/pull/18020/files)). 
Here's a list of the needed manual -changes: -- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py` -- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py` -- Include the modeling file in the documentation test file list in `utils/documentation_tests.txt` -- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py` -- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py` -- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.mdx` -- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/en/model_doc/brand_new_bert.mdx` -- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/en/index.mdx` - -When you're happy with your implementation, run the following checklist to confirm that your model architecture is -ready: -1. All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is -propagated all the way from the top-level classes -2. You have used `#copied from ...` whenever possible -3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs` -4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable` -5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` -6. You can call the TensorFlow model using the expected input format - - -### 5. Add model tests - -Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as -expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in -`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary -TensorFlow replacements. For now, in all `.from_pretrained()` calls, you should use the `from_pt=True` flag to load -the existing PyTorch weights. - -After you're done, it's time for the moment of truth: run the tests! 😬 - -```bash -NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ -py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py -``` - -The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is -notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest -problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. -In other cases, a general test might not be directly applicable to your model, in which case we suggest an override -at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull request if -you're stuck. - -When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! 🎉 - -### 6.-7. Ensure everyone can use your model - -**6. Submit the pull request** - -Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code, -run our code formatting utility, `make fixup` 🪄. This will automatically fix any formatting issues, which would cause -our automatic checks to fail. - -It's now time to convert your draft pull request into a real pull request. 
To do so, click on the "Ready for -review" button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need -at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model. - -After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag in -`.from_pretrained()` calls. Since there are no TensorFlow weights, you will have to add them! Check the section -below for instructions on how to do it. - -Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are -green, double-check the tests locally one last time - -```bash -NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ -py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py -``` - -and we will merge your PR! Congratulations on the milestone 🎉 - -**7. (Optional) Build demos and share with the world** - -One of the hardest parts about open-source is discovery. How can the other users learn about the existence of your -fabulous TensorFlow contribution? With proper communication, of course! 📣 - -There are two main ways to share your model with the community: -- Build demos. These include Gradio demos, notebooks, and other fun ways to show off your model. We highly - encourage you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community). -- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share - your achievement with the community - your model can now be used by thousands of engineers and researchers around - the world 🌍! We will be happy to retweet your posts and help you share your work with the community. - - -## Adding TensorFlow weights to 🤗 Hub - -Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into -TensorFlow weights is a breeze! - -Here's how to do it: -1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command - `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens)) -2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository - containing the PyTorch weights you want to convert -3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created - -That's it! 🎉 - - -## Debugging mismatches across ML frameworks 🐛 - -At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you -might come across errors compaining about mismatches between PyTorch and TensorFlow. You might even decide to open the -model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔 - -First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗 -Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch -between the two frameworks, it implies that the model is not following the reference implementation for at least one -of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is -arguably worse than a model that fails to run at all! To that end, we aim at having a framework mismatch smaller than -`1e-5` at all stages of the model. 
- -As in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret -ingredient here is patience. Here is our suggested workflow for when you come across this type of issues: -1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a - certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the - numerical variables in a top-down fashion until you find the source of the problems. -2. Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. It is possible - that we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages - like StackOverflow and GitHub issues. -3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the - issue, so you can focus on the problematic instruction, abstracting away the rest of the model! The bad news is - that you'll have to venture into the source implementation of said instruction. In some cases, you might find an - issue with a reference implementation - don't abstain from opening an issue in the upstream repository. - -In some cases, in dicussion with the 🤗 Transformers team, we might find that the fixing the mismatch is infeasible. -When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we -might decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error` -flag to override the error message at weight conversion time. diff --git a/docs/source/en/attention.md b/docs/source/en/attention.md new file mode 100644 index 000000000000..3a4f93b33ff2 --- /dev/null +++ b/docs/source/en/attention.md @@ -0,0 +1,61 @@ + + +# Attention mechanisms + +Most transformer models use full attention in the sense that the attention matrix is square. It can be a big +computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and +use a sparse version of the attention matrix to speed up training. + +## LSH attention + +[Reformer](#reformer) uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax +dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only +the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is +modified to mask the current token (except at the first position), because it will give a query and a key equal (so +very similar to each other). Since the hash can be a bit random, several hash functions are used in practice +(determined by a n_rounds parameter) and then are averaged together. + +## Local attention + +[Longformer](#longformer) uses local attention: often, the local context (e.g., what are the two tokens to the +left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small +window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a +representation of the whole sentence. + +Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access +all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in +their local window). 
This is shown in Figure 2d of the paper, see below for a sample attention mask:
+
+<!-- figure: sample local attention mask -->
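+
+For intuition, here is a toy sketch (not the actual Longformer implementation; the helper name, window size and
+choice of global token are purely illustrative) of how such a combined local + global attention mask can be built:
+
+```py
+import numpy as np
+
+def local_global_attention_mask(seq_len, window, global_idx=()):
+    # True where query token i is allowed to attend to key token j
+    i = np.arange(seq_len)[:, None]
+    j = np.arange(seq_len)[None, :]
+    mask = np.abs(i - j) <= window  # local sliding window around each token
+    for g in global_idx:
+        mask[g, :] = True  # a global token attends to every position...
+        mask[:, g] = True  # ...and every position attends to it (symmetric)
+    return mask
+
+print(local_global_attention_mask(seq_len=8, window=1, global_idx=(0,)).astype(int))
+```
+
+With `window=1` and token 0 marked as global, each token only sees itself and its immediate neighbours, while row
+and column 0 are fully attended.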
+ +Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence +length. + +## Other tricks + +### Axial positional encodings + +[Reformer](#reformer) uses axial positional encodings: in traditional transformer models, the positional encoding +E is a matrix of size \\(l\\) by \\(d\\), \\(l\\) being the sequence length and \\(d\\) the dimension of the +hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate +that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with +dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and +\\(d_{1} + d_{2} = d\\) (with the product for the lengths, this ends up being way smaller). The embedding for time +step \\(j\\) in E is obtained by concatenating the embeddings for timestep \\(j \% l1\\) in E1 and \\(j // l1\\) +in E2. diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md new file mode 100644 index 000000000000..833882aa713e --- /dev/null +++ b/docs/source/en/autoclass_tutorial.md @@ -0,0 +1,143 @@ + + +# Load pretrained instances with an AutoClass + +With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. + + + +Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. + + + +In this tutorial, learn to: + +* Load a pretrained tokenizer. +* Load a pretrained image processor +* Load a pretrained feature extractor. +* Load a pretrained processor. +* Load a pretrained model. + +## AutoTokenizer + +Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model. + +Load a tokenizer with [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +Then tokenize your input as shown below: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoImageProcessor + +For vision tasks, an image processor processes the image into the correct input format. 
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```
+
+
+## AutoFeatureExtractor
+
+For audio tasks, a feature extractor processes the audio signal into the correct input format.
+
+Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
+...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+... )
+```
+
+## AutoProcessor
+
+Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them.
+
+Load a processor with [`AutoProcessor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+```
+
+## AutoModel
+
+
+Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+```
+
+Easily reuse the same checkpoint to load an architecture for a different task:
+
+```py
+>>> from transformers import AutoModelForTokenClassification
+
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+```
+
+
+For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG.
+
+TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue.
+
+
+Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning.
+
+
+Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks).
For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Easily reuse the same checkpoint to load an architecture for a different task: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. + + diff --git a/docs/source/en/autoclass_tutorial.mdx b/docs/source/en/autoclass_tutorial.mdx deleted file mode 100644 index 6b44e41a856c..000000000000 --- a/docs/source/en/autoclass_tutorial.mdx +++ /dev/null @@ -1,139 +0,0 @@ - - -# Load pretrained instances with an AutoClass - -With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infer and load the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. - - - -Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. - - - -In this tutorial, learn to: - -* Load a pretrained tokenizer. -* Load a pretrained image processor -* Load a pretrained feature extractor. -* Load a pretrained processor. -* Load a pretrained model. - -## AutoTokenizer - -Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model. - -Load a tokenizer with [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -Then tokenize your input as shown below: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoImageProcessor - -For vision tasks, an image processor processes the image into the correct input format. - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -``` - - -## AutoFeatureExtractor - -For audio tasks, a feature extractor processes the audio signal the correct input format. 
- -Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them. - -Load a processor with [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse the same checkpoint to load an architecture for a different task: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - - - -For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG. - -TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue. - - - -Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. - - -Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse the same checkpoint to load an architecture for a different task: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. 
This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. - - diff --git a/docs/source/en/benchmarks.md b/docs/source/en/benchmarks.md new file mode 100644 index 000000000000..5023d2486979 --- /dev/null +++ b/docs/source/en/benchmarks.md @@ -0,0 +1,387 @@ + + +# Benchmarks + + + +Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed +and memory complexity of Transformer models. + + + +[[open-in-colab]] + +Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks. + +A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb). + +## How to benchmark 🤗 Transformers models + +The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_. + + + +Hereby, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and +backward pass. + + + +The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an object of type [`PyTorchBenchmarkArguments`] and +[`TensorFlowBenchmarkArguments`], respectively, for instantiation. [`PyTorchBenchmarkArguments`] and [`TensorFlowBenchmarkArguments`] are data classes and contain all relevant configurations for their corresponding benchmark class. In the following example, it is shown how a BERT model of type _bert-base-cased_ can be benchmarked. + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments + +>>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +>>> benchmark = PyTorchBenchmark(args) +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments + +>>> args = TensorFlowBenchmarkArguments( +... models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> benchmark = TensorFlowBenchmark(args) +``` + + + +Here, three arguments are given to the benchmark argument data classes, namely `models`, `batch_sizes`, and +`sequence_lengths`. The argument `models` is required and expects a `list` of model identifiers from the +[model hub](https://huggingface.co/models) The `list` arguments `batch_sizes` and `sequence_lengths` define +the size of the `input_ids` on which the model is benchmarked. There are many more parameters that can be configured +via the benchmark argument data classes. For more detail on these one can either directly consult the files +`src/transformers/benchmark/benchmark_args_utils.py`, `src/transformers/benchmark/benchmark_args.py` (for PyTorch) +and `src/transformers/benchmark/benchmark_args_tf.py` (for Tensorflow). Alternatively, running the following shell +commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow +respectively. + + + +```bash +python examples/pytorch/benchmarking/run_benchmark.py --help +``` + +An instantiated benchmark object can then simply be run by calling `benchmark.run()`. 
+ +```py +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 0.006 +bert-base-uncased 8 32 0.006 +bert-base-uncased 8 128 0.018 +bert-base-uncased 8 512 0.088 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 1227 +bert-base-uncased 8 32 1281 +bert-base-uncased 8 128 1307 +bert-base-uncased 8 512 1539 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 08:58:43.371351 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help +``` + +An instantiated benchmark object can then simply be run by calling `benchmark.run()`. + +```py +>>> results = benchmark.run() +>>> print(results) +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 0.005 +bert-base-uncased 8 32 0.008 +bert-base-uncased 8 128 0.022 +bert-base-uncased 8 512 0.105 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base-uncased 8 8 1330 +bert-base-uncased 8 32 1330 +bert-base-uncased 8 128 1330 +bert-base-uncased 8 512 1770 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:26:35.617317 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +By default, the _time_ and the _required memory_ for _inference_ are benchmarked. 
In the example output above the first +two sections show the result corresponding to _inference time_ and _inference memory_. In addition, all relevant +information about the computing environment, _e.g._ the GPU type, the system, the library versions, etc... are printed +out in the third section under _ENVIRONMENT INFORMATION_. This information can optionally be saved in a _.csv_ file +when adding the argument `save_to_csv=True` to [`PyTorchBenchmarkArguments`] and +[`TensorFlowBenchmarkArguments`] respectively. In this case, every section is saved in a separate +_.csv_ file. The path to each _.csv_ file can optionally be defined via the argument data classes. + +Instead of benchmarking pre-trained models via their model identifier, _e.g._ `bert-base-uncased`, the user can +alternatively benchmark an arbitrary configuration of any available model class. In this case, a `list` of +configurations must be inserted with the benchmark args as follows. + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig + +>>> args = PyTorchBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1277 +bert-base 8 32 1281 +bert-base 8 128 1307 +bert-base 8 512 1539 +bert-384-hid 8 8 1005 +bert-384-hid 8 32 1027 +bert-384-hid 8 128 1035 +bert-384-hid 8 512 1255 +bert-6-lay 8 8 1097 +bert-6-lay 8 32 1101 +bert-6-lay 8 128 1127 +bert-6-lay 8 512 1359 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:35:25.143267 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +>>> args = TensorFlowBenchmarkArguments( +... 
models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 8 0.005 +bert-base 8 32 0.008 +bert-base 8 128 0.022 +bert-base 8 512 0.106 +bert-384-hid 8 8 0.005 +bert-384-hid 8 32 0.007 +bert-384-hid 8 128 0.018 +bert-384-hid 8 512 0.064 +bert-6-lay 8 8 0.002 +bert-6-lay 8 32 0.003 +bert-6-lay 8 128 0.0011 +bert-6-lay 8 512 0.074 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1330 +bert-base 8 32 1330 +bert-base 8 128 1330 +bert-base 8 512 1770 +bert-384-hid 8 8 1330 +bert-384-hid 8 32 1330 +bert-384-hid 8 128 1330 +bert-384-hid 8 512 1540 +bert-6-lay 8 8 1330 +bert-6-lay 8 32 1330 +bert-6-lay 8 128 1330 +bert-6-lay 8 512 1540 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:38:15.487125 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +Again, _inference time_ and _required memory_ for _inference_ are measured, but this time for customized configurations +of the `BertModel` class. This feature can especially be helpful when deciding for which configuration the model +should be trained. + + +## Benchmark best practices + +This section lists a couple of best practices one should be aware of when benchmarking a model. + +- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user + specifies on which device the code should be run by setting the `CUDA_VISIBLE_DEVICES` environment variable in the + shell, _e.g._ `export CUDA_VISIBLE_DEVICES=0` before running the code. +- The option `no_multi_processing` should only be set to `True` for testing and debugging. To ensure accurate + memory measurement it is recommended to run each memory benchmark in a separate process by making sure + `no_multi_processing` is set to `True`. +- One should always state the environment information when sharing the results of a model benchmark. Results can vary + heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very + useful for the community. 
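+
+As a minimal sketch tying the recommendations above together (assuming a single-GPU machine on which
+`export CUDA_VISIBLE_DEVICES=0` has already been run; the exact `.csv` output paths can be configured via the
+argument data classes):
+
+```py
+>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
+
+>>> args = PyTorchBenchmarkArguments(
+...     models=["bert-base-uncased"],
+...     batch_sizes=[8],
+...     sequence_lengths=[8, 32, 128, 512],
+...     save_to_csv=True,  # write each result section to a separate .csv file
+... )
+>>> benchmark = PyTorchBenchmark(args)
+>>> results = benchmark.run()
+```
+
+Remember to share the printed _ENVIRONMENT INFORMATION_ section along with the numbers whenever you publish
+benchmark results.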
+ + +## Sharing your benchmark + +Previously all available core models (10 at the time) have been benchmarked for _inference time_, across many different +settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. All of those tests were +done across CPUs (except for TensorFlow XLA) and GPUs. + +The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) and the results are +available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). + +With the new _benchmark_ tools, it is easier than ever to share your benchmark results with the community + +- [PyTorch Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). +- [TensorFlow Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/en/benchmarks.mdx b/docs/source/en/benchmarks.mdx deleted file mode 100644 index 244112001f5c..000000000000 --- a/docs/source/en/benchmarks.mdx +++ /dev/null @@ -1,383 +0,0 @@ - - -# Benchmarks - - - -Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed -and memory complexity of Transformer models. - - - -[[open-in-colab]] - -Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks. - -A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb). - -## How to benchmark 🤗 Transformers models - -The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_. - - - -Hereby, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and -backward pass. - - - -The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an object of type [`PyTorchBenchmarkArguments`] and -[`TensorFlowBenchmarkArguments`], respectively, for instantiation. [`PyTorchBenchmarkArguments`] and [`TensorFlowBenchmarkArguments`] are data classes and contain all relevant configurations for their corresponding benchmark class. In the following example, it is shown how a BERT model of type _bert-base-cased_ can be benchmarked. - - - -```py ->>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments - ->>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) ->>> benchmark = PyTorchBenchmark(args) -``` - - -```py ->>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments - ->>> args = TensorFlowBenchmarkArguments( -... models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... ) ->>> benchmark = TensorFlowBenchmark(args) -``` - - - -Here, three arguments are given to the benchmark argument data classes, namely `models`, `batch_sizes`, and -`sequence_lengths`. The argument `models` is required and expects a `list` of model identifiers from the -[model hub](https://huggingface.co/models) The `list` arguments `batch_sizes` and `sequence_lengths` define -the size of the `input_ids` on which the model is benchmarked. 
There are many more parameters that can be configured -via the benchmark argument data classes. For more detail on these one can either directly consult the files -`src/transformers/benchmark/benchmark_args_utils.py`, `src/transformers/benchmark/benchmark_args.py` (for PyTorch) -and `src/transformers/benchmark/benchmark_args_tf.py` (for Tensorflow). Alternatively, running the following shell -commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow -respectively. - - - -```bash -python examples/pytorch/benchmarking/run_benchmark.py --help -``` - -An instantiated benchmark object can then simply be run by calling `benchmark.run()`. - -```py ->>> results = benchmark.run() ->>> print(results) -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base-uncased 8 8 0.006 -bert-base-uncased 8 32 0.006 -bert-base-uncased 8 128 0.018 -bert-base-uncased 8 512 0.088 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base-uncased 8 8 1227 -bert-base-uncased 8 32 1281 -bert-base-uncased 8 128 1307 -bert-base-uncased 8 512 1539 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: PyTorch -- use_torchscript: False -- framework_version: 1.4.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 08:58:43.371351 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - -```bash -python examples/tensorflow/benchmarking/run_benchmark_tf.py --help -``` - -An instantiated benchmark object can then simply be run by calling `benchmark.run()`. 
- -```py ->>> results = benchmark.run() ->>> print(results) ->>> results = benchmark.run() ->>> print(results) -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base-uncased 8 8 0.005 -bert-base-uncased 8 32 0.008 -bert-base-uncased 8 128 0.022 -bert-base-uncased 8 512 0.105 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base-uncased 8 8 1330 -bert-base-uncased 8 32 1330 -bert-base-uncased 8 128 1330 -bert-base-uncased 8 512 1770 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: Tensorflow -- use_xla: False -- framework_version: 2.2.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:26:35.617317 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - - -By default, the _time_ and the _required memory_ for _inference_ are benchmarked. In the example output above the first -two sections show the result corresponding to _inference time_ and _inference memory_. In addition, all relevant -information about the computing environment, _e.g._ the GPU type, the system, the library versions, etc... are printed -out in the third section under _ENVIRONMENT INFORMATION_. This information can optionally be saved in a _.csv_ file -when adding the argument `save_to_csv=True` to [`PyTorchBenchmarkArguments`] and -[`TensorFlowBenchmarkArguments`] respectively. In this case, every section is saved in a separate -_.csv_ file. The path to each _.csv_ file can optionally be defined via the argument data classes. - -Instead of benchmarking pre-trained models via their model identifier, _e.g._ `bert-base-uncased`, the user can -alternatively benchmark an arbitrary configuration of any available model class. In this case, a `list` of -configurations must be inserted with the benchmark args as follows. - - - -```py ->>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig - ->>> args = PyTorchBenchmarkArguments( -... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... 
) ->>> config_base = BertConfig() ->>> config_384_hid = BertConfig(hidden_size=384) ->>> config_6_lay = BertConfig(num_hidden_layers=6) - ->>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) ->>> benchmark.run() -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base 8 128 0.006 -bert-base 8 512 0.006 -bert-base 8 128 0.018 -bert-base 8 512 0.088 -bert-384-hid 8 8 0.006 -bert-384-hid 8 32 0.006 -bert-384-hid 8 128 0.011 -bert-384-hid 8 512 0.054 -bert-6-lay 8 8 0.003 -bert-6-lay 8 32 0.004 -bert-6-lay 8 128 0.009 -bert-6-lay 8 512 0.044 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base 8 8 1277 -bert-base 8 32 1281 -bert-base 8 128 1307 -bert-base 8 512 1539 -bert-384-hid 8 8 1005 -bert-384-hid 8 32 1027 -bert-384-hid 8 128 1035 -bert-384-hid 8 512 1255 -bert-6-lay 8 8 1097 -bert-6-lay 8 32 1101 -bert-6-lay 8 128 1127 -bert-6-lay 8 512 1359 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: PyTorch -- use_torchscript: False -- framework_version: 1.4.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:35:25.143267 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - -```py ->>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig - ->>> args = TensorFlowBenchmarkArguments( -... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... 
) ->>> config_base = BertConfig() ->>> config_384_hid = BertConfig(hidden_size=384) ->>> config_6_lay = BertConfig(num_hidden_layers=6) - ->>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) ->>> benchmark.run() -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base 8 8 0.005 -bert-base 8 32 0.008 -bert-base 8 128 0.022 -bert-base 8 512 0.106 -bert-384-hid 8 8 0.005 -bert-384-hid 8 32 0.007 -bert-384-hid 8 128 0.018 -bert-384-hid 8 512 0.064 -bert-6-lay 8 8 0.002 -bert-6-lay 8 32 0.003 -bert-6-lay 8 128 0.0011 -bert-6-lay 8 512 0.074 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base 8 8 1330 -bert-base 8 32 1330 -bert-base 8 128 1330 -bert-base 8 512 1770 -bert-384-hid 8 8 1330 -bert-384-hid 8 32 1330 -bert-384-hid 8 128 1330 -bert-384-hid 8 512 1540 -bert-6-lay 8 8 1330 -bert-6-lay 8 32 1330 -bert-6-lay 8 128 1330 -bert-6-lay 8 512 1540 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: Tensorflow -- use_xla: False -- framework_version: 2.2.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:38:15.487125 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - - -Again, _inference time_ and _required memory_ for _inference_ are measured, but this time for customized configurations -of the `BertModel` class. This feature can especially be helpful when deciding for which configuration the model -should be trained. - - -## Benchmark best practices - -This section lists a couple of best practices one should be aware of when benchmarking a model. - -- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user - specifies on which device the code should be run by setting the `CUDA_VISIBLE_DEVICES` environment variable in the - shell, _e.g._ `export CUDA_VISIBLE_DEVICES=0` before running the code. -- The option `no_multi_processing` should only be set to `True` for testing and debugging. To ensure accurate - memory measurement it is recommended to run each memory benchmark in a separate process by making sure - `no_multi_processing` is set to `True`. -- One should always state the environment information when sharing the results of a model benchmark. Results can vary - heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very - useful for the community. - - -## Sharing your benchmark - -Previously all available core models (10 at the time) have been benchmarked for _inference time_, across many different -settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. 
All of those tests were -done across CPUs (except for TensorFlow XLA) and GPUs. - -The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) and the results are -available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). - -With the new _benchmark_ tools, it is easier than ever to share your benchmark results with the community - -- [PyTorch Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). -- [TensorFlow Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/en/bertology.md b/docs/source/en/bertology.md new file mode 100644 index 000000000000..ba1b4bd4002b --- /dev/null +++ b/docs/source/en/bertology.md @@ -0,0 +1,41 @@ + + +# BERTology + +There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT +(that some call "BERTology"). Some good examples of this field are: + + +- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to +help people access the inner representations, mainly adapted from the great work of Paul Michel +(https://arxiv.org/abs/1905.10650): + + +- accessing all the hidden-states of BERT/GPT/GPT-2, +- accessing all the attention weights for each head of BERT/GPT/GPT-2, +- retrieving head output values and gradients to be able to compute head importance scores and prune heads as explained + in https://arxiv.org/abs/1905.10650. + +To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) which extracts information and prunes a model pre-trained on +GLUE. diff --git a/docs/source/en/bertology.mdx b/docs/source/en/bertology.mdx deleted file mode 100644 index e64379d6580d..000000000000 --- a/docs/source/en/bertology.mdx +++ /dev/null @@ -1,36 +0,0 @@ - - -# BERTology - -There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT -(that some call "BERTology"). Some good examples of this field are: - - -- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: - https://arxiv.org/abs/1905.05950 -- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 -- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
- Manning: https://arxiv.org/abs/1906.04341 - -In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to -help people access the inner representations, mainly adapted from the great work of Paul Michel -(https://arxiv.org/abs/1905.10650): - - -- accessing all the hidden-states of BERT/GPT/GPT-2, -- accessing all the attention weights for each head of BERT/GPT/GPT-2, -- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained - in https://arxiv.org/abs/1905.10650. - -To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) while extract information and prune a model pre-trained on -GLUE. diff --git a/docs/source/en/big_models.md b/docs/source/en/big_models.md new file mode 100644 index 000000000000..9b57e4331760 --- /dev/null +++ b/docs/source/en/big_models.md @@ -0,0 +1,123 @@ + + +# Instantiating a big model + +When you want to use a very big pretrained model, one challenge is to minimize the use of the RAM. The usual workflow +from PyTorch is: + +1. Create your model with random weights. +2. Load your pretrained weights. +3. Put those pretrained weights in your random model. + +Step 1 and 2 both require a full version of the model in memory, which is not a problem in most cases, but if your model starts weighing several GigaBytes, those two copies can make you get out of RAM. Even worse, if you are using `torch.distributed` to launch a distributed training, each process will load the pretrained model and store these two copies in RAM. + + + +Note that the randomly created model is initialized with "empty" tensors, which take the space in memory without filling it (thus the random values are whatever was in this chunk of memory at a given time). The random initialization following the appropriate distribution for the kind of model/parameters instantiated (like a normal distribution for instance) is only performed after step 3 on the non-initialized weights, to be as fast as possible! + + + +In this guide, we explore the solutions Transformers offer to deal with this issue. Note that this is an area of active development, so the APIs explained here may change slightly in the future. + +## Sharded checkpoints + +Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. In terms of having one single checkpoint when you do `model.save_pretrained(save_dir)`, you will end up with several partial checkpoints (each of which being of size < 10GB) and an index that maps parameter names to the files they are stored in. + +You can control the maximum size before sharding with the `max_shard_size` parameter, so for the sake of an example, we'll use a normal-size models with a small shard size: let's take a traditional BERT model. + +```py +from transformers import AutoModel + +model = AutoModel.from_pretrained("bert-base-cased") +``` + +If you save it using [`~PreTrainedModel.save_pretrained`], you will get a new folder with two files: the config of the model and its weights: + +```py +>>> import os +>>> import tempfile + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir) +... 
print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model.bin'] +``` + +Now let's use a maximum shard size of 200MB: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] +``` + +On top of the configuration of the model, we see three different weights files, and an `index.json` file which is our index. A checkpoint like this can be fully reloaded using the [`~PreTrainedModel.from_pretrained`] method: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... new_model = AutoModel.from_pretrained(tmp_dir) +``` + +The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard. + +Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary: + +```py +>>> import json + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: +... index = json.load(f) + +>>> print(index.keys()) +dict_keys(['metadata', 'weight_map']) +``` + +The metadata just consists of the total size of the model for now. We plan to add other information in the future: + +```py +>>> index["metadata"] +{'total_size': 433245184} +``` + +The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model `state_dict`) to the file it's stored in: + +```py +>>> index["weight_map"] +{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', + 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', + ... +``` + +If you want to directly load such a sharded checkpoint inside a model without using [`~PreTrainedModel.from_pretrained`] (like you would do `model.load_state_dict()` for a full checkpoint) you should use [`~modeling_utils.load_sharded_checkpoint`]: + +```py +>>> from transformers.modeling_utils import load_sharded_checkpoint + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... load_sharded_checkpoint(model, tmp_dir) +``` + +## Low memory loading + +Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library. + +Please read the following guide for more information: [Large model loading using Accelerate](./main_classes/model#large-model-loading) diff --git a/docs/source/en/big_models.mdx b/docs/source/en/big_models.mdx deleted file mode 100644 index 971403b62d4a..000000000000 --- a/docs/source/en/big_models.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - -# Instantiating a big model - -When you want to use a very big pretrained model, one challenge is to minimize the use of the RAM. The usual workflow -from PyTorch is: - -1. Create your model with random weights. -2. Load your pretrained weights. -3. Put those pretrained weights in your random model. 
- -Step 1 and 2 both require a full version of the model in memory, which is not a problem in most cases, but if your model starts weighing several GigaBytes, those two copies can make you got our of RAM. Even worse, if you are using `torch.distributed` to launch a distributed training, each process will load the pretrained model and store these two copies in RAM. - - - -Note that the randomly created model is initialized with "empty" tensors, which take the space in memory without filling it (thus the random values are whatever was in this chunk of memory at a given time). The random initialization following the appropriate distribution for the kind of model/parameters instatiated (like a normal distribution for instance) is only performed after step 3 on the non-initialized weights, to be as fast as possible! - - - -In this guide, we explore the solutions Transformers offer to deal with this issue. Note that this is an area of active development, so the APIs explained here may change slightly in the future. - -## Sharded checkpoints - -Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. In terms of having one single checkpoint when you do `model.save_pretrained(save_dir)`, you will end up with several partial checkpoints (each of which being of size < 10GB) and an index that maps parameter names to the files they are stored in. - -You can control the maximum size before sharding with the `max_shard_size` parameter, so for the sake of an example, we'll use a normal-size models with a small shard size: let's take a traditional BERT model. - -```py -from transformers import AutoModel - -model = AutoModel.from_pretrained("bert-base-cased") -``` - -If you save it using [`~PreTrainedModel.save_pretrained`], you will get a new folder with two files: the config of the model and its weights: - -```py ->>> import os ->>> import tempfile - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir) -... print(sorted(os.listdir(tmp_dir))) -['config.json', 'pytorch_model.bin'] -``` - -Now let's use a maximum shard size of 200MB: - -```py ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... print(sorted(os.listdir(tmp_dir))) -['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] -``` - -On top of the configuration of the model, we see three different weights files, and an `index.json` file which is our index. A checkpoint like this can be fully reloaded using the [`~PreTrainedModel.from_pretrained`] method: - -```py ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... new_model = AutoModel.from_pretrained(tmp_dir) -``` - -The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard. - -Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary: - -```py ->>> import json - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... 
with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: -... index = json.load(f) - ->>> print(index.keys()) -dict_keys(['metadata', 'weight_map']) -``` - -The metadata just consists of the total size of the model for now. We plan to add other information in the future: - -```py ->>> index["metadata"] -{'total_size': 433245184} -``` - -The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model `state_dict`) to the file it's stored in: - -```py ->>> index["weight_map"] -{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', - 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', - ... -``` - -If you want to directly load such a sharded checkpoint inside a model without using [`~PreTrainedModel.from_pretrained`] (like you would do `model.load_state_dict()` for a full checkpoint) you should use [`~modeling_utils.load_sharded_checkpoint`]: - -```py ->>> from transformers.modeling_utils import load_sharded_checkpoint - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="200MB") -... load_sharded_checkpoint(model, tmp_dir) -``` - -## Low memory loading - -Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library. - -Please read the following guide for more information: [Large model loading using Accelerate](./main_classes/model#large-model-loading) \ No newline at end of file diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md new file mode 100644 index 000000000000..115c9f51677d --- /dev/null +++ b/docs/source/en/chat_templating.md @@ -0,0 +1,367 @@ + + +# Templates for Chat Models + +## Introduction + +An increasingly common use case for LLMs is **chat**. In a chat context, rather than continuing a single string +of text (as is the case with a standard language model), the model instead continues a conversation that consists +of one or more **messages**, each of which includes a **role** as well as message text. + +Most commonly, these roles are "user" for messages sent by the user, and "assistant" for messages sent by the model. +Some models also support a "system" role. System messages are usually sent at the beginning of the conversation +and include directives about how the model should behave in the subsequent chat. + +All language models, including models fine-tuned for chat, operate on linear sequences of tokens and do not intrinsically +have special handling for roles. This means that role information is usually injected by adding control tokens +between messages, to indicate both the message boundary and the relevant roles. + +Unfortunately, there isn't (yet!) a standard for which tokens to use, and so different models have been trained +with wildly different formatting and control tokens for chat. This can be a real problem for users - if you use the +wrong format, then the model will be confused by your input, and your performance will be a lot worse than it should be. +This is the problem that **chat templates** aim to resolve. + +Chat conversations are typically represented as a list of dictionaries, where each dictionary contains `role` +and `content` keys, and represents a single chat message. Chat templates are strings containing a Jinja template that +specifies how to format a conversation for a given model into a single tokenizable sequence. 
By storing this information +with the tokenizer, we can ensure that models get input data in the format they expect. + +Let's make this concrete with a quick example using the `BlenderBot` model. BlenderBot has an extremely simple default +template, which mostly just adds whitespace between rounds of dialogue: + +```python +>>> from transformers import AutoTokenizer +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + +>>> chat = [ +... {"role": "user", "content": "Hello, how are you?"}, +... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, +... {"role": "user", "content": "I'd like to show off how chat templating works!"}, +... ] + +>>> tokenizer.apply_chat_template(chat, tokenize=False) +" Hello, how are you? I'm doing great. How can I help you today? I'd like to show off how chat templating works!" +``` + +Notice how the entire chat is condensed into a single string. If we use `tokenize=True`, which is the default setting, +that string will also be tokenized for us. To see a more complex template in action, though, let's use the +`meta-llama/Llama-2-7b-chat-hf` model. Note that this model has gated access, so you will have to +[request access on the repo](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) if you want to run this code yourself: + +```python +>> from transformers import AutoTokenizer +>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + +>> chat = [ +... {"role": "user", "content": "Hello, how are you?"}, +... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, +... {"role": "user", "content": "I'd like to show off how chat templating works!"}, +... ] + +>> tokenizer.use_default_system_prompt = False +>> tokenizer.apply_chat_template(chat, tokenize=False) +"[INST] Hello, how are you? [/INST] I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" +``` + +Note that this time, the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of +user messages (but not assistant messages!) + +## How do chat templates work? + +The chat template for a model is stored on the `tokenizer.chat_template` attribute. If no chat template is set, the +default template for that model class is used instead. Let's take a look at the template for `BlenderBot`: + +```python + +>>> from transformers import AutoTokenizer +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + +>>> tokenizer.default_chat_template +"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" +``` + +That's kind of intimidating. Let's add some newlines and indentation to make it more readable. Note that the first +newline after each block as well as any preceding whitespace before a block are ignored by default, using the +Jinja `trim_blocks` and `lstrip_blocks` flags. However, be cautious - although leading whitespace on each +line is stripped, spaces between blocks on the same line are not. We strongly recommend checking that your template +isn't printing extra spaces where it shouldn't be! 
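+
+One quick way to perform that check is to render a short conversation and inspect the `repr()` of the result, which makes stray spaces and newlines easy to spot. Here is a minimal sketch reusing the BlenderBot example from above:
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
+
+chat = [
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+
+# repr() shows the exact string, so stray spaces and newlines are easier to spot
+print(repr(tokenizer.apply_chat_template(chat, tokenize=False)))
+```
+
+With that check in mind, here is the template again with the newlines and indentation added: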
+ +``` +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ ' ' }} + {% endif %} + {{ message['content'] }} + {% if not loop.last %} + {{ ' ' }} + {% endif %} +{% endfor %} +{{ eos_token }} +``` + +If you've never seen one of these before, this is a [Jinja template](https://jinja.palletsprojects.com/en/3.1.x/templates/). +Jinja is a templating language that allows you to write simple code that generates text. In many ways, the code and +syntax resembles Python. In pure Python, this template would look something like this: + +```python +for idx, message in enumerate(messages): + if message['role'] == 'user': + print(' ') + print(message['content']) + if not idx == len(messages) - 1: # Check for the last message in the conversation + print(' ') +print(eos_token) +``` + +Effectively, the template does three things: +1. For each message, if the message is a user message, add a blank space before it, otherwise print nothing. +2. Add the message content +3. If the message is not the last message, add two spaces after it. After the final message, print the EOS token. + +This is a pretty simple template - it doesn't add any control tokens, and it doesn't support "system" messages, which +are a common way to give the model directives about how it should behave in the subsequent conversation. +But Jinja gives you a lot of flexibility to do those things! Let's see a Jinja template that can format inputs +similarly to the way LLaMA formats them (note that the real LLaMA template includes handling for default system +messages and slightly different system message handling in general - don't use this one in your actual code!) + +``` +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'] + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<>\\n' + message['content'] + '\\n<>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ ' ' + message['content'] + ' ' + eos_token }} + {% endif %} +{% endfor %} +``` + +Hopefully if you stare at this for a little bit you can see what this template is doing - it adds specific tokens based +on the "role" of each message, which represents who sent it. User, assistant and system messages are clearly +distinguishable to the model because of the tokens they're wrapped in. + +## How do I create a chat template? + +Simple, just write a jinja template and set `tokenizer.chat_template`. You may find it easier to start with an +existing template from another model and simply edit it for your needs! For example, we could take the LLaMA template +above and add "[ASST]" and "[/ASST]" to assistant messages: + +``` +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {% endif %} +{% endfor %} +``` + +Now, simply set the `tokenizer.chat_template` attribute. Next time you use [`~PreTrainedTokenizer.apply_chat_template`], it will +use your new template! This attribute will be saved in the `tokenizer_config.json` file, so you can use +[`~utils.PushToHubMixin.push_to_hub`] to upload your new template to the Hub and make sure everyone's using the right +template for your model! 
+ +```python +template = tokenizer.chat_template +template = template.replace("SYS", "SYSTEM") # Change the system token +tokenizer.chat_template = template # Set the new template +tokenizer.push_to_hub("model_name") # Upload your new template to the Hub! +``` + +The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`ConversationalPipeline`] class, so +once you set the correct chat template, your model will automatically become compatible with [`ConversationalPipeline`]. + +## What are "default" templates? + +Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards +compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a +model does not have a chat template set, but there is a default template for its model class, the `ConversationalPipeline` +class and methods like `apply_chat_template` will use the class template instead. You can find out what the default +template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute. + +This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when +the class template is appropriate for your model, we strongly recommend overriding the default template by +setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured +for chat, and to future-proof in case the default templates are ever altered or deprecated. + +## What template should I use? + +When setting the template for a model that's already been trained for chat, you should ensure that the template +exactly matches the message formatting that the model saw during training, or else you will probably experience +performance degradation. This is true even if you're training the model further - you will probably get the best +performance if you keep the chat tokens constant. This is very analogous to tokenization - you generally get the +best performance for inference or fine-tuning when you precisely match the tokenization used during training. + +If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, +you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different +input formats. Our default template for models that don't have a class-specific template follows the +[ChatML format](https://github.com/openai/openai-python/blob/main/chatml.md), and this is a good, flexible choice for many use-cases. It looks like this: + +``` +{% for message in messages %} + {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}} +{% endfor %} +``` + +If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes +handy support for "generation prompts" - see the next section for more! + +``` +tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +``` + +This template wraps each message in `<|im_start|>` and `<|im_end|>` tokens, and simply writes the role as a string, which +allows for flexibility in the roles you train with. 
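+
+To see this template in action, here is a minimal sketch (the `gpt2` checkpoint is only a placeholder for whichever tokenizer you are configuring) that sets the ChatML one-liner above as the chat template and renders a short conversation with [`~PreTrainedTokenizer.apply_chat_template`]:
+
+```python
+from transformers import AutoTokenizer
+
+# Placeholder checkpoint - any tokenizer works once its chat template is set
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+
+messages = [
+    {"role": "system", "content": "You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it."},
+    {"role": "user", "content": "How are you?"},
+    {"role": "assistant", "content": "I'm doing great!"},
+]
+
+# tokenize=False returns the formatted string so it can be inspected directly
+print(tokenizer.apply_chat_template(messages, tokenize=False))
+```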
The output looks like this: + +``` +<|im_start|>system +You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> +<|im_start|>user +How are you?<|im_end|> +<|im_start|>assistant +I'm doing great!<|im_end|> +``` + +The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense, +particularly if you want your model to operate well with [`ConversationalPipeline`]. However, you are not limited +to these roles - templating is extremely flexible, and any string can be a role. + +## What are "generation prompts"? + +You may notice that the `apply_chat_template` method has an `add_generation_prompt` argument. This argument tells +the template to add tokens that indicate the start of a bot response. For example, consider the following chat: + +```python +messages = [ + {"role": "user", "content": "Hi there!"}, + {"role": "assistant", "content": "Nice to meet you!"}, + {"role": "user", "content": "Can I ask a question?"} +] +``` + +Here's what this will look like without a generation prompt, using the ChatML template we described above: + +```python +>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +""" +``` + +And here's what it looks like **with** a generation prompt: + +```python +>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +<|im_start|>assistant +""" +``` + +Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model +generates text it will write a bot response instead of doing something unexpected, like continuing the user's +message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a +special kind of text to them! You need to guide them with the appropriate control tokens so they know what they're +supposed to be doing. + +Not all models require generation prompts. Some models, like BlenderBot and LLaMA, don't have any +special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact +effect that `add_generation_prompt` has will depend on the template being used. + +## I want to use chat templates! How should I get started? + +If you have any chat models, you should set their `tokenizer.chat_template` attribute and test it using +[`~PreTrainedTokenizer.apply_chat_template`]. This applies even if you're not the model owner - if you're using a model +with an empty chat template, or one that's still using the default class template, please open a [pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) to +the model repository so that this attribute can be set properly! + +Once the attribute is set, that's it, you're done! `tokenizer.apply_chat_template` will now work correctly for that +model, which means it is also automatically supported in places like `ConversationalPipeline`! + +By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of +open-source models. 
Formatting mismatches have been haunting the field and silently harming performance for too long - +it's time to put an end to them! + +## Template writing tips + +If you're unfamiliar with Jinja, we generally find that the easiest way to write a chat template is to first +write a short Python script that formats messages the way you want, and then convert that script into a template. + +Remember that the template handler will receive the conversation history as a variable called `messages`. Each +message is a dictionary with two keys, `role` and `content`. You will be able to access `messages` in your template +just like you can in Python, which means you can loop over it with `{% for message in messages %}` or access +individual messages with, for example, `{{ messages[0] }}`. + +You can also use the following tips to convert your code to Jinja: + +### For loops + +For loops in Jinja look like this: + +``` +{% for message in messages %} +{{ message['content'] }} +{% endfor %} +``` + +Note that whatever's inside the {{ expression block }} will be printed to the output. You can use operators like +`+` to combine strings inside expression blocks. + +### If statements + +If statements in Jinja look like this: + +``` +{% if message['role'] == 'user' %} +{{ message['content'] }} +{% endif %} +``` + +Note how where Python uses whitespace to mark the beginnings and ends of `for` and `if` blocks, Jinja requires you +to explicitly end them with `{% endfor %}` and `{% endif %}`. + +### Special variables + +Inside your template, you will have access to the list of `messages`, but you can also access several other special +variables. These include special tokens like `bos_token` and `eos_token`, as well as the `add_generation_prompt` +variable that we discussed above. You can also use the `loop` variable to access information about the current loop +iteration, for example using `{% if loop.last %}` to check if the current message is the last message in the +conversation. Here's an example that puts these ideas together to add a generation prompt at the end of the +conversation if add_generation_prompt is `True`: + +``` +{% if loop.last and add_generation_prompt %} +{{ bos_token + 'Assistant:\n' }} +{% endif %} +``` + +### Notes on whitespace + +As much as possible, we've tried to get Jinja to ignore whitespace outside of {{ expressions }}. However, be aware +that Jinja is a general-purpose templating engine, and it may treat whitespace between blocks on the same line +as significant and print it to the output. We **strongly** recommend checking that your template isn't printing extra +spaces where it shouldn't be before you upload it! \ No newline at end of file diff --git a/docs/source/en/community.md b/docs/source/en/community.md new file mode 100644 index 000000000000..0305844a1be8 --- /dev/null +++ b/docs/source/en/community.md @@ -0,0 +1,69 @@ + + +# Community + +This page regroups resources around 🤗 Transformers developed by the community. + +## Community resources: + +| Resource | Description | Author | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](glossary) that has been put into a form which can be easily learned/revised using [Anki ](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention. 
See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## Community notebooks: + +| Notebook | Description | Author | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | How to train on sequences as long as 500,000 tokens with Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on 
anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | How to build a "long" version of existing pretrained models | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine-tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding 
dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia 
QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | How to evaluate *LukeForEntityClassification* on the Open Entity dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on the PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/en/community.mdx b/docs/source/en/community.mdx deleted file mode 100644 index 808b16779dd9..000000000000 --- a/docs/source/en/community.mdx +++ /dev/null @@ -1,65 +0,0 @@ -# Community - -This page regroups resources around 🤗 Transformers developed by the community. - -## Community resources: - -| Resource | Description | Author | -|:----------|:-------------|------:| -| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](glossary) that has been put into a form which can be easily learnt/revised using [Anki ](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention.
See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | - -## Community notebooks: - -| Notebook | Description | Author | | -|:----------|:-------------|:-------------|------:| -| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | -| [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | -| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | -| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | -| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | -| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | How to train on sequences as long as 500,000 tokens with Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | -| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | -| [Fine-tune a pre-trained Transformer on anyone's 
tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | -| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | -| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | How to build a "long" version of existing pretrained models | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | -| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | -| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | -| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | -| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| -|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| -|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| -|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| -|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| -|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| -|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| -|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| -|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| -|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| -|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| -|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| -|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| -|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| -|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| -|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| -|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| -|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| -|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| -|[Fine-tune LayoutLM on FUNSD (a form understanding 
dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| -|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| -|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| -|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| -|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| -|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| -|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| -|[Evaluate Big Bird on Trivia 
QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| -| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | -| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | -| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | -| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | How to evaluate *LukeForEntityClassification* on the Open Entity dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | -| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | -| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | -| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | -| [Speech Emotion Classification with Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | -| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | -| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | -| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/en/converting_tensorflow_models.mdx b/docs/source/en/converting_tensorflow_models.mdx deleted file mode 100644 index 8dc51dd61670..000000000000 --- a/docs/source/en/converting_tensorflow_models.mdx +++ /dev/null @@ -1,162 +0,0 @@ - - -# Converting From Tensorflow Checkpoints - -A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models -that can be loaded using the `from_pretrained` methods of the library. - - - -Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any -transformers >= 2.3.0 installation. - -The documentation below reflects the **transformers-cli convert** command format. 
- - - -## BERT - -You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the -[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script. - -This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated -configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from -the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can -be imported using `from_pretrained()` (see example in [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ). - -You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow -checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (\ -`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too. - -To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch. - -Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model: - -```bash -export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin -``` - -You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models). - -## ALBERT - -Convert TensorFlow model checkpoints of ALBERT to PyTorch using the -[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script. - -The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying -configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will -need to have TensorFlow and PyTorch installed. - -Here is an example of the conversion process for the pre-trained `ALBERT Base` model: - -```bash -export ALBERT_BASE_DIR=/path/to/albert/albert_base - -transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin -``` - -You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models). 
- -## OpenAI GPT - -Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint -save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm)\ -) - -```bash -export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights - -transformers-cli convert --model_type gpt \ - --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT_CONFIG] \ - [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ -``` - -## OpenAI GPT-2 - -Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2)) - -```bash -export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights - -transformers-cli convert --model_type gpt2 \ - --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT2_CONFIG] \ - [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] -``` - -## Transformer-XL - -Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)) - -```bash -export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint - -transformers-cli convert --model_type transfo_xl \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config TRANSFO_XL_CONFIG] \ - [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] -``` - -## XLNet - -Here is an example of the conversion process for a pre-trained XLNet model: - -```bash -export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint -export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config - -transformers-cli convert --model_type xlnet \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ - --config $TRANSFO_XL_CONFIG_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--finetuning_task_name XLNET_FINETUNED_TASK] \ -``` - -## XLM - -Here is an example of the conversion process for a pre-trained XLM model: - -```bash -export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint - -transformers-cli convert --model_type xlm \ - --tf_checkpoint $XLM_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT - [--config XML_CONFIG] \ - [--finetuning_task_name XML_FINETUNED_TASK] -``` - -## T5 - -Here is an example of the conversion process for a pre-trained T5 model: - -```bash -export T5=/path/to/t5/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type t5 \ - --tf_checkpoint $T5/t5_model.ckpt \ - --config $T5/t5_config.json \ - --pytorch_dump_output $T5/pytorch_model.bin -``` diff --git a/docs/source/en/create_a_model.md b/docs/source/en/create_a_model.md new file mode 100644 index 000000000000..ba384d437b80 --- /dev/null +++ b/docs/source/en/create_a_model.md @@ -0,0 +1,389 @@ + + +# Create a custom architecture + +An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. 
Learn how to: + +- Load and customize a model configuration. +- Create a model architecture. +- Create a slow and fast tokenizer for text. +- Create an image processor for vision tasks. +- Create a feature extractor for audio tasks. +- Create a processor for multimodal tasks. + +## Configuration + +A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with. + +Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect its attributes: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to: + +- Try a different activation function with the `activation` parameter. +- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +``` + + + +You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details. + + + +## Model + +The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture.
Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. This means models can be used like any other module in their respective framework. + + + +Load your custom configuration attributes into the model: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> model = DistilBertModel(my_config) +``` + +This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. + +Create a pretrained model with [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + +Load your custom configuration attributes into the model: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. + +Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +### Model heads + +At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). + + + +For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs.
+ +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + +For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## Tokenizer + +The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers: + +- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer. +- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters. + +Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. + + + +Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support. + + + +If you trained your own tokenizer, you can create one from your *vocabulary* file: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. 
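+Whichever tokenizer you end up with, the common methods mentioned earlier (encoding, decoding, and adding new tokens) work the same way. Here is a minimal sketch using the custom tokenizer from the snippet above; `my_vocab_file.txt` is a placeholder, so the exact token IDs depend on its contents:
+
+```py
+>>> encoding = my_tokenizer("Using a Transformer network is simple")
+>>> encoding.input_ids  # token IDs, only meaningful with this tokenizer's vocabulary
+>>> my_tokenizer.decode(encoding.input_ids)  # back to text, special tokens included
+>>> my_tokenizer.add_tokens(["[NEW_TOKEN]"])  # returns the number of tokens added to the vocabulary
+```
+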
Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`. + + + +## Image Processor + +An image processor processes vision inputs. It inherits from the base [`~image_processing_utils.ImageProcessingMixin`] class. + +To use, create an image processor associated with the model you're using. For example, create a default [`ViTImageProcessor`] if you are using [ViT](model_doc/vit) for image classification: + +```py +>>> from transformers import ViTImageProcessor + +>>> vit_extractor = ViTImageProcessor() +>>> print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default image processor parameters. + + + +Modify any of the [`ViTImageProcessor`] parameters to create your custom image processor: + +```py +>>> from transformers import ViTImageProcessor + +>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +## Feature Extractor + +A feature extractor processes audio inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`SequenceFeatureExtractor`] class for processing audio inputs. + +To use, create a feature extractor associated with the model you're using. For example, create a default [`Wav2Vec2FeatureExtractor`] if you are using [Wav2Vec2](model_doc/wav2vec2) for audio classification: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + + + +If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters. 
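+For instance, a checkpoint's preconfigured feature extractor can be loaded in one line. This is only a minimal sketch; `facebook/wav2vec2-base-960h` is just one example checkpoint:
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> pretrained_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+```
+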
+ + + +Modify any of the [`Wav2Vec2FeatureExtractor`] parameters to create your custom feature extractor: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} +``` + + +## Processor + +For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. + +Create a feature extractor to handle the audio inputs: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +Create a tokenizer to handle the text inputs: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes is configurable, allowing you to use the specific attributes you want. You can easily set up a model for training or modify an existing pretrained model to fine-tune. diff --git a/docs/source/en/create_a_model.mdx b/docs/source/en/create_a_model.mdx deleted file mode 100644 index b0bafa4589b2..000000000000 --- a/docs/source/en/create_a_model.mdx +++ /dev/null @@ -1,385 +0,0 @@ - - -# Create a custom architecture - -An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. Learn how to: - -- Load and customize a model configuration. -- Create a model architecture. -- Create a slow and fast tokenizer for text. -- Create an image processor for vision tasks. -- Create a feature extractor for audio tasks. -- Create a processor for multimodal tasks. - -## Configuration - -A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with.
- -Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect it's attributes: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to: - -- Try a different activation function with the `activation` parameter. -- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory: - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") -``` - - - -You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details. - - - -## Model - -The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture. Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. This means models are compatible with each of their respective framework's usage. 
- - - -Load your custom configuration attributes into the model: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> model = DistilBertModel(my_config) -``` - -This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. - -Create a pretrained model with [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - -Load your custom configuration attributes into the model: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - -This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. - -Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - -### Model heads - -At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). - - - -For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. 
- -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - -For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -## Tokenizer - -The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers: - -- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer. -- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to it's Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters. - -Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. - - - -Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support. - - - -If you trained your own tokenizer, you can create one from your *vocabulary* file: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class: - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") -``` - - - -By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`. - - - -## Image Processor - -An image processor processes vision inputs. It inherits from the base [`~image_processing_utils.ImageProcessingMixin`] class. - -To use, create an image processor associated with the model you're using. 
For example, create a default [`ViTImageProcessor`] if you are using [ViT](model_doc/vit) for image classification: - -```py ->>> from transformers import ViTImageProcessor - ->>> vit_extractor = ViTImageProcessor() ->>> print(vit_extractor) -ViTImageProcessor { - "do_normalize": true, - "do_resize": true, - "feature_extractor_type": "ViTImageProcessor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default image processor parameters. - - - -Modify any of the [`ViTImageProcessor`] parameters to create your custom image processor: - -```py ->>> from transformers import ViTImageProcessor - ->>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTImageProcessor { - "do_normalize": false, - "do_resize": true, - "feature_extractor_type": "ViTImageProcessor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -## Feature Extractor - -A feature extractor processes audio inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`SequenceFeatureExtractor`] class for processing audio inputs. - -To use, create a feature extractor associated with the model you're using. For example, create a default [`Wav2Vec2FeatureExtractor`] if you are using [Wav2Vec2](model_doc/wav2vec2) for audio classification: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - - - -If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters. - - - -Modify any of the [`Wav2Vec2FeatureExtractor`] parameters to create your custom feature extractor: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": false, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 8000 -} -``` - - -## Processor - -For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. 
- -Create a feature extractor to handle the audio inputs: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Create a tokenizer to handle the text inputs: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]: - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` - -With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes are configurable, allowing you to use the specific attributes you want. You can easily setup a model for training or modify an existing pretrained model to fine-tune. diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md new file mode 100644 index 000000000000..d709772eed06 --- /dev/null +++ b/docs/source/en/custom_models.md @@ -0,0 +1,356 @@ + + +# Sharing custom models + +The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder +of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs. + +If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you +how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it +with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗 +Transformers library. + +We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the +[timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`]. + +## Writing a custom configuration + +Before we dive into the model, let's first write its configuration. The configuration of a model is an object that +will contain all the necessary information to build the model. As we will see in the next section, the model can only +take a `config` to be initialized, so we really need that object to be as complete as possible. + +In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different +configurations will then give us the different types of ResNets that are possible. We then just store those arguments, +after checking the validity of a few of them. 
+ +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +The three important things to remember when writing your own configuration are the following: +- you have to inherit from `PretrainedConfig`, +- the `__init__` of your `PretrainedConfig` must accept any kwargs, +- those `kwargs` need to be passed to the superclass `__init__`. + +The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other +constraints come from the fact that a `PretrainedConfig` has more fields than the ones you are setting. When reloading a +config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the +superclass. + +Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to +register your model with the auto classes (see last section). + +With this done, you can easily create and save your configuration like you would do with any other model config of the +library. Here is how we can create a resnet50d config and save it: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the +`from_pretrained` method: + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to +directly upload your config to the Hub. + +## Writing a custom model + +Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that +extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image +classification (like [`BertForSequenceClassification`]). + +As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only +thing we need to do before writing this class is a map between the block types and actual block classes. 
Then the +model is defined from the configuration by passing everything to the `ResNet` class: + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +For the model that will classify images, we just change the forward method: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` + +In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config` +(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless +you want to register your model with the auto classes (see last section). + + + +If your model is very similar to a model inside the library, you can re-use the same configuration as this model. + + + +You can have your model return anything you want, but returning a dictionary like we did for +`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly +usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own +training loop or another library for training. + +Now that we have our model class, let's create one: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or +[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights +with the code of our model. But first, let's load some pretrained weights inside our model. + +In your own use case, you will probably be training your custom model on your own data. To keep this tutorial quick, +we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be +easy to transfer those weights: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the +code of the model is saved. 
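Before moving on, here is an optional sanity check (not part of the original guide) that the weight transfer above worked: run a dummy batch through the wrapped model. The shapes below assume the usual 3x224x224 image resolution and the default 1000-class configuration created earlier.

```py
import torch

# Two random 224x224 RGB images, just to exercise the forward pass.
dummy_batch = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    outputs = resnet50d(dummy_batch)
print(outputs["logits"].shape)  # expected: torch.Size([2, 1000])
```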
+ +## Sending the code to the Hub + + + +This API is experimental and may have some slight breaking changes in the next releases. + + + +First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as +long as all the files are in the same directory (we don't support submodules for this feature yet). For our example, +we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working +directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file +contains the code of `ResnetModel` and `ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be use as a module. + + + +If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file +to import from the `transformers` package. + + + +Note that you can re-use (or subclass) an existing configuration/model. + +To share your model with the community, follow those steps: first import the ResNet model and config from the newly +created files: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained` +method and properly register them with a given Auto class (especially for models), just run: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, +[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you +have to specify which one of the auto classes is the correct one for your model. + +Next, let's create the config and models as we did before: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Now to send the model to the Hub, make sure you are logged in. Either run in your terminal: + +```bash +huggingface-cli login +``` + +or from a notebook: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +You can then push to your own namespace (or an organization you are a member of) like this: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +On top of the modeling weights and the configuration in json format, this also copied the modeling and +configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result +in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). + +See the [sharing tutorial](model_sharing) for more information on the push to Hub method. + +## Using a model with custom code + +You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and +the `from_pretrained` method. 
All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still +review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use +a model with custom code: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not +update the code with some malicious new lines (unless you fully trust the authors of the models). + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit +hash of any commit. + +## Registering a model with custom code to the auto classes + +If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own +model. This is different from pushing the code to the Hub in the sense that users will need to import your library to +get the custom models (contrarily to automatically downloading the model code from the Hub). + +As long as your config has a `model_type` attribute that is different from existing model types, and that your model +classes have the right `config_class` attributes, you can just add them to the auto classes like this: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type` +of your custom config, and the first argument used when registering your custom models to any auto model class needs +to match the `config_class` of those models. diff --git a/docs/source/en/custom_models.mdx b/docs/source/en/custom_models.mdx deleted file mode 100644 index f5ad55856243..000000000000 --- a/docs/source/en/custom_models.mdx +++ /dev/null @@ -1,352 +0,0 @@ - - -# Sharing custom models - -The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder -of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs. - -If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you -how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it -with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗 -Transformers library. - -We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the -[timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`]. - -## Writing a custom configuration - -Before we dive into the model, let's first write its configuration. The configuration of a model is an object that -will contain all the necessary information to build the model. 
As we will see in the next section, the model can only -take a `config` to be initialized, so we really need that object to be as complete as possible. - -In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different -configurations will then give us the different types of ResNets that are possible. We then just store those arguments, -after checking the validity of a few of them. - -```python -from transformers import PretrainedConfig -from typing import List - - -class ResnetConfig(PretrainedConfig): - model_type = "resnet" - - def __init__( - self, - block_type="bottleneck", - layers: List[int] = [3, 4, 6, 3], - num_classes: int = 1000, - input_channels: int = 3, - cardinality: int = 1, - base_width: int = 64, - stem_width: int = 64, - stem_type: str = "", - avg_down: bool = False, - **kwargs, - ): - if block_type not in ["basic", "bottleneck"]: - raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") - if stem_type not in ["", "deep", "deep-tiered"]: - raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") - - self.block_type = block_type - self.layers = layers - self.num_classes = num_classes - self.input_channels = input_channels - self.cardinality = cardinality - self.base_width = base_width - self.stem_width = stem_width - self.stem_type = stem_type - self.avg_down = avg_down - super().__init__(**kwargs) -``` - -The three important things to remember when writing you own configuration are the following: -- you have to inherit from `PretrainedConfig`, -- the `__init__` of your `PretrainedConfig` must accept any kwargs, -- those `kwargs` need to be passed to the superclass `__init__`. - -The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other -constraints come from the fact a `PretrainedConfig` has more fields than the ones you are setting. When reloading a -config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the -superclass. - -Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to -register your model with the auto classes (see last section). - -With this done, you can easily create and save your configuration like you would do with any other model config of the -library. Here is how we can create a resnet50d config and save it: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d_config.save_pretrained("custom-resnet") -``` - -This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the -`from_pretrained` method: - -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` - -You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to -directly upload your config to the Hub. - -## Writing a custom model - -Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that -extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image -classification (like [`BertForSequenceClassification`]). - -As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only -thing we need to do before writing this class is a map between the block types and actual block classes. 
Then the -model is defined from the configuration by passing everything to the `ResNet` class: - -```py -from transformers import PreTrainedModel -from timm.models.resnet import BasicBlock, Bottleneck, ResNet -from .configuration_resnet import ResnetConfig - - -BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - - -class ResnetModel(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor): - return self.model.forward_features(tensor) -``` - -For the model that will classify images, we just change the forward method: - -```py -import torch - - -class ResnetModelForImageClassification(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor, labels=None): - logits = self.model(tensor) - if labels is not None: - loss = torch.nn.cross_entropy(logits, labels) - return {"loss": loss, "logits": logits} - return {"logits": logits} -``` - -In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config` -(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless -you want to register your model with the auto classes (see last section). - - - -If your model is very similar to a model inside the library, you can re-use the same configuration as this model. - - - -You can have your model return anything you want, but returning a dictionary like we did for -`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly -usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own -training loop or another library for training. - -Now that we have our model class, let's create one: - -```py -resnet50d = ResnetModelForImageClassification(resnet50d_config) -``` - -Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or -[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights -with the code of our model. But first, let's load some pretrained weights inside our model. - -In your own use case, you will probably be training your custom model on your own data. To go fast for this tutorial, -we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be -easy to transfer those weights: - -```py -import timm - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the -code of the model is saved. 
- -## Sending the code to the Hub - - - -This API is experimental and may have some slight breaking changes in the next releases. - - - -First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as -long as all the files are in the same directory (we don't support submodules for this feature yet). For our example, -we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working -directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file -contains the code of `ResnetModel` and `ResnetModelForImageClassification`. - -``` -. -└── resnet_model - ├── __init__.py - ├── configuration_resnet.py - └── modeling_resnet.py -``` - -The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be use as a module. - - - -If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file -to import from the `transformers` package. - - - -Note that you can re-use (or subclass) an existing configuration/model. - -To share your model with the community, follow those steps: first import the ResNet model and config from the newly -created files: - -```py -from resnet_model.configuration_resnet import ResnetConfig -from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification -``` - -Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained` -method and properly register them with a given Auto class (especially for models), just run: - -```py -ResnetConfig.register_for_auto_class() -ResnetModel.register_for_auto_class("AutoModel") -ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") -``` - -Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, -[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you -have to specify which one of the auto classes is the correct one for your model. - -Next, let's create the config and models as we did before: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d = ResnetModelForImageClassification(resnet50d_config) - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Now to send the model to the Hub, make sure you are logged in. Either run in your terminal: - -```bash -huggingface-cli login -``` - -or from a notebook: - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -You can then push to your own namespace (or an organization you are a member of) like this: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -On top of the modeling weights and the configuration in json format, this also copied the modeling and -configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result -in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). - -See the [sharing tutorial](model_sharing) for more information on the push to Hub method. - -## Using a model with custom code - -You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and -the `from_pretrained` method. 
All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still -review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use -a model with custom code: - -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not -update the code with some malicious new lines (unless you fully trust the authors of the models). - -```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) -``` - -Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit -hash of any commit. - -## Registering a model with custom code to the auto classes - -If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own -model. This is different from pushing the code to the Hub in the sense that users will need to import your library to -get the custom models (contrarily to automatically downloading the model code from the Hub). - -As long as your config has a `model_type` attribute that is different from existing model types, and that your model -classes have the right `config_class` attributes, you can just add them to the auto classes likes this: - -```py -from transformers import AutoConfig, AutoModel, AutoModelForImageClassification - -AutoConfig.register("resnet", ResnetConfig) -AutoModel.register(ResnetConfig, ResnetModel) -AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) -``` - -Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type` -of your custom config, and the first argument used when registering your custom models to any auto model class needs -to match the `config_class` of those models. diff --git a/docs/source/en/custom_tools.md b/docs/source/en/custom_tools.md new file mode 100644 index 000000000000..86183a80752e --- /dev/null +++ b/docs/source/en/custom_tools.md @@ -0,0 +1,789 @@ + + +# Custom Tools and Prompts + + + +If you are not aware of what tools and agents are in the context of transformers, we recommend you read the +[Transformers Agents](transformers_agents) page first. + + + + + +Transformers Agents is an experimental API that is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks. +In this guide we'll take a look at: + +- How to customize the prompt +- How to use custom tools +- How to create custom tools + +## Customizing the prompt + +As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode. +Both the `run` and `chat` modes underlie the same logic. The language model powering the agent is conditioned on a long +prompt and completes the prompt by generating the next tokens until the stop token is reached. 
+The only difference between the two modes is that during the `chat` mode the prompt is extended with +previous user inputs and model generations. This allows the agent to have access to past interactions, +seemingly giving the agent some kind of memory. + +### Structure of the prompt + +Let's take a closer look at how the prompt is structured to understand how it can be best customized. +The prompt is structured broadly into four parts. + +- 1. Introduction: how the agent should behave, explanation of the concept of tools. +- 2. Description of all the tools. This is defined by a `<>` token that is dynamically replaced at runtime with the tools defined/chosen by the user. +- 3. A set of examples of tasks and their solution +- 4. Current example, and request for solution. + +To better understand each part, let's look at a shortened version of how the `run` prompt can look like: + +````text +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. +[...] +You can print intermediate results if it makes sense to do so. + +Tools: +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English. +[...] + +Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French." + +I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + +Answer: +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +``` + +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +[...] + +Task: "Draw me a picture of rivers and lakes" + +I will use the following +```` + +The introduction (the text before *"Tools:"*) explains precisely how the model shall behave and what it should do. +This part most likely does not need to be customized as the agent shall always behave the same way. + +The second part (the bullet points below *"Tools"*) is dynamically added upon calling `run` or `chat`. There are +exactly as many bullet points as there are tools in `agent.toolbox` and each bullet point consists of the name +and description of the tool: + +```text +- : +``` + +Let's verify this quickly by loading the document_qa tool and printing out the name and description. 
+ +```py +from transformers import load_tool + +document_qa = load_tool("document-question-answering") +print(f"- {document_qa.name}: {document_qa.description}") +``` + +which gives: +```text +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +``` + +We can see that the tool name is short and precise. The description includes two parts, the first explaining +what the tool does and the second states what input arguments and return values are expected. + +A good tool name and tool description are very important for the agent to correctly use it. Note that the only +information the agent has about the tool is its name and description, so one should make sure that both +are precisely written and match the style of the existing tools in the toolbox. In particular make sure the description +mentions all the arguments expected by name in code-style, along with the expected type and a description of what they +are. + + + +Check the naming and description of the curated Transformers tools to better understand what name and +description a tool is expected to have. You can see all tools with the [`Agent.toolbox`] property. + + + +The third part includes a set of curated examples that show the agent exactly what code it should produce +for what kind of user request. The large language models empowering the agent are extremely good at +recognizing patterns in a prompt and repeating the pattern with new data. Therefore, it is very important +that the examples are written in a way that maximizes the likelihood of the agent to generating correct, +executable code in practice. + +Let's have a look at one example: + +````text +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +```` + +The pattern the model is prompted to repeat has three parts: The task statement, the agent's explanation of +what it intends to do, and finally the generated code. Every example that is part of the prompt has this exact +pattern, thus making sure that the agent will reproduce exactly the same pattern when generating new tokens. + +The prompt examples are curated by the Transformers team and rigorously evaluated on a set of +[problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py) +to ensure that the agent's prompt is as good as possible to solve real use cases of the agent. + +The final part of the prompt corresponds to: +```text +Task: "Draw me a picture of rivers and lakes" + +I will use the following +``` + +is a final and unfinished example that the agent is tasked to complete. The unfinished example +is dynamically created based on the actual user input. For the above example, the user ran: + +```py +agent.run("Draw me a picture of rivers and lakes") +``` + +The user input - *a.k.a* the task: *"Draw me a picture of rivers and lakes"* is cast into the +prompt template: "Task: \n\n I will use the following". 
This sentence makes up the final lines of the +prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example +exactly in the same way it was previously done in the examples. + +Without going into too much detail, the chat template has the same prompt structure with the +examples having a slightly different style, *e.g.*: + +````text +[...] + +===== + +Human: Answer the question in the variable `question` about the image stored in the variable `image`. + +Assistant: I will use the tool `image_qa` to answer the question on the input image. + +```py +answer = image_qa(text=question, image=image) +print(f"The answer is {answer}") +``` + +Human: I tried this code, it worked but didn't give me a good result. The question is in French + +Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this. + +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(text=translated_question, image=image) +print(f"The answer is {answer}") +``` + +===== + +[...] +```` + +Contrary, to the examples of the `run` prompt, each `chat` prompt example has one or more exchanges between the +*Human* and the *Assistant*. Every exchange is structured similarly to the example of the `run` prompt. +The user's input is appended to behind *Human:* and the agent is prompted to first generate what needs to be done +before generating code. An exchange can be based on previous exchanges, therefore allowing the user to refer +to past exchanges as is done *e.g.* above by the user's input of "I tried **this** code" refers to the +previously generated code of the agent. + +Upon running `.chat`, the user's input or *task* is cast into an unfinished example of the form: +```text +Human: \n\nAssistant: +``` +which the agent completes. Contrary to the `run` command, the `chat` command then appends the completed example +to the prompt, thus giving the agent more context for the next `chat` turn. + +Great now that we know how the prompt is structured, let's see how we can customize it! + +### Writing good user inputs + +While large language models are getting better and better at understanding users' intentions, it helps +enormously to be as precise as possible to help the agent pick the correct task. What does it mean to be +as precise as possible? + +The agent sees a list of tool names and their description in its prompt. The more tools are added the +more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose +the correct sequences of tools to run. Let's look at a common failure case, here we will only return +the code to analyze it. + +```py +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") + +agent.run("Show me a tree", return_code=True) +``` + +gives: + +```text +==Explanation from the agent== +I will use the following tool: `image_segmenter` to create a segmentation mask for the image. + + +==Code generated by the agent== +mask = image_segmenter(image, prompt="tree") +``` + +which is probably not what we wanted. Instead, it is more likely that we want an image of a tree to be generated. +To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that +are present in the tool's name and description. Let's have a look. 
+```py +agent.toolbox["image_generator"].description +``` + +```text +'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image. +``` + +The name and description make use of the keywords "image", "prompt", "create" and "generate". Using these words will most likely work better here. Let's refine our prompt a bit. + +```py +agent.run("Create an image of a tree", return_code=True) +``` + +gives: +```text +==Explanation from the agent== +I will use the following tool `image_generator` to generate an image of a tree. + + +==Code generated by the agent== +image = image_generator(prompt="tree") +``` + +Much better! That looks more like what we want. In short, when you notice that the agent struggles to +correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name +and description and try refining your task request with it. + +### Customizing the tool descriptions + +As we've seen before the agent has access to each of the tools' names and descriptions. The base tools +should have very precise names and descriptions, however, you might find that it could help to change the +the description or name of a tool for your specific use case. This might become especially important +when you've added multiple tools that are very similar or if you want to use your agent only for a certain +domain, *e.g.* image generation and transformations. + +A common problem is that the agent confuses image generation with image transformation/modification when +used a lot for image generation tasks, *e.g.* +```py +agent.run("Make an image of a house and a car", return_code=True) +``` +returns +```text +==Explanation from the agent== +I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house. + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +house_car_image = image_transformer(image=car_image, prompt="A house") +``` + +which is probably not exactly what we want here. It seems like the agent has a difficult time +to understand the difference between `image_generator` and `image_transformer` and often uses the two together. + +We can help the agent here by changing the tool name and description of `image_transformer`. Let's instead call it `modifier` +to disassociate it a bit from "image" and "prompt": +```py +agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer") +agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace( + "transforms an image according to a prompt", "modifies an image" +) +``` + +Now "modify" is a strong cue to use the new image processor which should help with the above prompt. Let's run it again. + +```py +agent.run("Make an image of a house and a car", return_code=True) +``` + +Now we're getting: +```text +==Explanation from the agent== +I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car. + + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +``` + +which is definitely closer to what we had in mind! However, we want to have both the house and car in the same image. 
Steering the task more toward single image generation should help:

```py
agent.run("Create image: 'A house and car'", return_code=True)
```

```text
==Explanation from the agent==
I will use the following tool: `image_generator` to generate an image.


==Code generated by the agent==
image = image_generator(prompt="A house and car")
```

Agents are still brittle for many use cases, especially when it comes to
slightly more complex use cases like generating an image of multiple objects.
Both the agent itself and the underlying prompt will be further improved in the coming
months, making sure that agents become more robust to a variety of user inputs.

### Customizing the whole prompt

To give the user maximum flexibility, the whole prompt template, as explained [above](#structure-of-the-prompt),
can be overwritten by the user. In this case, make sure that your custom prompt includes an introduction section,
a tool section, an example section, and an unfinished example section. If you want to overwrite the `run` prompt template,
you can do so as follows:

```py
template = """ [...] """

agent = HfAgent(your_endpoint, run_prompt_template=template)
```

Please make sure to have the `<<all_tools>>` string and the `<<prompt>>` string defined somewhere in the `template` so that the agent is aware
of the tools it has available to it and can correctly insert the user's prompt.

Similarly, one can overwrite the `chat` prompt template. Note that the `chat` mode always uses the following format for the exchanges:
```text
Human: <<task>>

Assistant:
```

Therefore, it is important that the examples of the custom `chat` prompt template also make use of this format.
You can overwrite the `chat` template at instantiation as follows.

```py
template = """ [...] """

agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
```

Please make sure to have the `<<all_tools>>` string defined somewhere in the `template` so that the agent is aware
of the tools it has available to it.

In both cases, you can pass a repo ID instead of the prompt template if you would like to use a template hosted by someone in the community. The default prompts live in [this repo](https://huggingface.co/datasets/huggingface-tools/default-prompts) as an example.

To upload your custom prompt to a repo on the Hub and share it with the community, just make sure:
- to use a dataset repository
- to put the prompt template for the `run` command in a file named `run_prompt_template.txt`
- to put the prompt template for the `chat` command in a file named `chat_prompt_template.txt`

## Using custom tools

In this section, we'll be leveraging two existing custom tools that are specific to image generation:

- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation)
  with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)
  to allow for more image modifications.
- We add a new tool for image upscaling to the default toolbox:
  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool).
+ +We'll start by loading the custom tools with the convenient [`load_tool`] function: + +```py +from transformers import load_tool + +controlnet_transformer = load_tool("diffusers/controlnet-canny-tool") +upscaler = load_tool("diffusers/latent-upscaler-tool") +``` + +Upon adding custom tools to an agent, the tools' descriptions and names are automatically +included in the agents' prompts. Thus, it is imperative that custom tools have +a well-written description and name in order for the agent to understand how to use them. +Let's take a look at the description and name of `controlnet_transformer`: + +```py +print(f"Description: '{controlnet_transformer.description}'") +print(f"Name: '{controlnet_transformer.name}'") +``` + +gives +```text +Description: 'This is a tool that transforms an image with ControlNet according to a prompt. +It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.' +Name: 'image_transformer' +``` + +The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools). +Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`: + +```py +tools = [controlnet_transformer, upscaler] +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools) +``` + +This command should give you the following info: + +```text +image_transformer has been replaced by as provided in `additional_tools` +``` + +The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool. + + + +Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool +because the agent is well-versed in using the specific task. Beware that the custom tool should follow the exact same API +as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that +tool are updated. + + + +The upscaler tool was given the name `image_upscaler` which is not yet present in the default toolbox and is therefore simply added to the list of tools. +You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute: + +```py +print("\n".join([f"- {a}" for a in agent.toolbox.keys()])) +``` + +```text +- document_qa +- image_captioner +- image_qa +- image_segmenter +- transcriber +- summarizer +- text_classifier +- text_qa +- text_reader +- translator +- image_transformer +- text_downloader +- image_generator +- video_generator +- image_upscaler +``` + +Note how `image_upscaler` is now part of the agents' toolbox. + +Let's now try out the new tools! We will re-use the image we generated in [Transformers Agents Quickstart](./transformers_agents#single-execution-run). + +```py +from diffusers.utils import load_image + +image = load_image( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" +) +``` + + + +Let's transform the image into a beautiful winter landscape: + +```py +image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_transformer` to transform the image. 
+ + +==Code generated by the agent== +image = image_transformer(image, prompt="A frozen lake and snowy forest") +``` + + + +The new image processing tool is based on ControlNet which can make very strong modifications to the image. +By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it. + +```py +image = agent.run("Upscale the image", image) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_upscaler` to upscale the image. + + +==Code generated by the agent== +upscaled_image = image_upscaler(image) +``` + + + +The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool +and was able to correctly run it. + +Next, let's have a look at how you can create a new custom tool. + +### Adding new tools + +In this section, we show how to create a new tool that can be added to the agent. + +#### Creating a new tool + +We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face +Hub with the most downloads for a given task. + +We can do that with the following code: + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base`. + +How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the +main attributes necessary. We'll create a class that inherits from it: + +```python +from transformers import Tool + + +class HFModelDownloadsTool(Tool): + pass +``` + +This class has a few needs: +- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a + performative name, we'll name it `model_download_counter`. +- An attribute `description`, which will be used to populate the prompt of the agent. +- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types, + and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected + values, which can be `text`, `image`, or `audio`. +- A `__call__` method which contains the inference code. This is the code we've played with above! + +Here's what our class looks like now: + +```python +from transformers import Tool +from huggingface_hub import list_models + + +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " + "It takes the name of the category (such as text-classification, depth-estimation, etc), and " + "returns the name of the checkpoint." + ) + + inputs = ["text"] + outputs = ["text"] + + def __call__(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id +``` + +We now have our tool handy. Save it in a file and import it from your main script. Let's name this file +`model_downloads.py`, so the resulting import code looks like this: + +```python +from model_downloads import HFModelDownloadsTool + +tool = HFModelDownloadsTool() +``` + +In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your +namespace. 
To do so, just call `push_to_hub` on the `tool` variable: + +```python +tool.push_to_hub("hf-model-downloads") +``` + +You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it. + +#### Having the agent use the tool + +We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool): + +```python +from transformers import load_tool + +tool = load_tool("lysandre/hf-model-downloads") +``` + +In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method: + +```python +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) + +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" +) +``` +which outputs the following: +```text +==Code generated by the agent== +model = model_download_counter(task="text-to-video") +print(f"The model with the most downloads is {model}.") +audio_model = text_reader(model) + + +==Result== +The model with the most downloads is damo-vilab/text-to-video-ms-1.7b. +``` + +and generates the following audio. + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +| ` to the beginning of every prompt. **Note**: Make sure to pass `use_fast=False` when loading OPT's tokenizer with [`AutoTokenizer`] to get the correct tokenizer. - -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten). -The original code can be found [here](https://github.com/facebookresearch/metaseq). - - -## OPTConfig - -[[autodoc]] OPTConfig - -## OPTModel - -[[autodoc]] OPTModel - - forward - -## OPTForCausalLM - -[[autodoc]] OPTForCausalLM - - forward - -## TFOPTModel - -[[autodoc]] TFOPTModel - - call - -## TFOPTForCausalLM - -[[autodoc]] TFOPTForCausalLM - - call - -## OPTForSequenceClassification - -[[autodoc]] OPTForSequenceClassification - - forward - -## OPTForQuestionAnswering - -[[autodoc]] OPTForQuestionAnswering - - forward - -## FlaxOPTModel - -[[autodoc]] FlaxOPTModel - - __call__ - - -## FlaxOPTForCausalLM - -[[autodoc]] FlaxOPTForCausalLM - - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md new file mode 100644 index 000000000000..b18b80b40511 --- /dev/null +++ b/docs/source/en/model_doc/owlvit.md @@ -0,0 +1,118 @@ + + +# OWL-ViT + +## Overview + +The OWL-ViT (short for Vision Transformer for Open-World Localization) was proposed in [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. OWL-ViT is an open-vocabulary object detection network trained on a variety of (image, text) pairs. It can be used to query an image with one or multiple text queries to search for and detect target objects described in text. + +The abstract from the paper is the following: + +*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. 
For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.* + +## Usage + +OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. + +[`OwlViTImageProcessor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`OwlViTProcessor`] wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`OwlViTProcessor`] and [`OwlViTForObjectDetection`]. + + +```python +>>> import requests +>>> from PIL import Image +>>> import torch + +>>> from transformers import OwlViTProcessor, OwlViTForObjectDetection + +>>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") +>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> texts = [["a photo of a cat", "a photo of a dog"]] +>>> inputs = processor(text=texts, images=image, return_tensors="pt") +>>> outputs = model(**inputs) + +>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] +>>> target_sizes = torch.Tensor([image.size[::-1]]) +>>> # Convert outputs (bounding boxes and class logits) to COCO API +>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) +>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries +>>> text = texts[i] +>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] +>>> for box, score, label in zip(boxes, scores, labels): +... box = [round(i, 2) for i in box.tolist()] +... 
print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] +Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] +``` + +This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). + +## OwlViTConfig + +[[autodoc]] OwlViTConfig + - from_text_vision_configs + +## OwlViTTextConfig + +[[autodoc]] OwlViTTextConfig + +## OwlViTVisionConfig + +[[autodoc]] OwlViTVisionConfig + +## OwlViTImageProcessor + +[[autodoc]] OwlViTImageProcessor + - preprocess + - post_process_object_detection + - post_process_image_guided_detection + +## OwlViTFeatureExtractor + +[[autodoc]] OwlViTFeatureExtractor + - __call__ + - post_process + - post_process_image_guided_detection + +## OwlViTProcessor + +[[autodoc]] OwlViTProcessor + +## OwlViTModel + +[[autodoc]] OwlViTModel + - forward + - get_text_features + - get_image_features + +## OwlViTTextModel + +[[autodoc]] OwlViTTextModel + - forward + +## OwlViTVisionModel + +[[autodoc]] OwlViTVisionModel + - forward + +## OwlViTForObjectDetection + +[[autodoc]] OwlViTForObjectDetection + - forward + - image_guided_detection diff --git a/docs/source/en/model_doc/owlvit.mdx b/docs/source/en/model_doc/owlvit.mdx deleted file mode 100644 index f13ad4a540e1..000000000000 --- a/docs/source/en/model_doc/owlvit.mdx +++ /dev/null @@ -1,118 +0,0 @@ - - -# OWL-ViT - -## Overview - -The OWL-ViT (short for Vision Transformer for Open-World Localization) was proposed in [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. OWL-ViT is an open-vocabulary object detection network trained on a variety of (image, text) pairs. It can be used to query an image with one or multiple text queries to search for and detect target objects described in text. - -The abstract from the paper is the following: - -*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.* - -## Usage - -OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. 
To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. - -[`OwlViTFeatureExtractor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`OwlViTProcessor`] wraps [`OwlViTFeatureExtractor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`OwlViTProcessor`] and [`OwlViTForObjectDetection`]. - - -```python ->>> import requests ->>> from PIL import Image ->>> import torch - ->>> from transformers import OwlViTProcessor, OwlViTForObjectDetection - ->>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") ->>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") - ->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) ->>> texts = [["a photo of a cat", "a photo of a dog"]] ->>> inputs = processor(text=texts, images=image, return_tensors="pt") ->>> outputs = model(**inputs) - ->>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] ->>> target_sizes = torch.Tensor([image.size[::-1]]) ->>> # Convert outputs (bounding boxes and class logits) to COCO API ->>> results = processor.post_process(outputs=outputs, target_sizes=target_sizes) - ->>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ->>> text = texts[i] ->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] - ->>> score_threshold = 0.1 ->>> for box, score, label in zip(boxes, scores, labels): -... box = [round(i, 2) for i in box.tolist()] -... if score >= score_threshold: -... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") -Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] -Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] -``` - -This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). 
- -## OwlViTConfig - -[[autodoc]] OwlViTConfig - - from_text_vision_configs - -## OwlViTTextConfig - -[[autodoc]] OwlViTTextConfig - -## OwlViTVisionConfig - -[[autodoc]] OwlViTVisionConfig - -## OwlViTImageProcessor - -[[autodoc]] OwlViTImageProcessor - - preprocess - - post_process_object_detection - - post_process_image_guided_detection - -## OwlViTFeatureExtractor - -[[autodoc]] OwlViTFeatureExtractor - - __call__ - - post_process - - post_process_image_guided_detection - -## OwlViTProcessor - -[[autodoc]] OwlViTProcessor - -## OwlViTModel - -[[autodoc]] OwlViTModel - - forward - - get_text_features - - get_image_features - -## OwlViTTextModel - -[[autodoc]] OwlViTTextModel - - forward - -## OwlViTVisionModel - -[[autodoc]] OwlViTVisionModel - - forward - -## OwlViTForObjectDetection - -[[autodoc]] OwlViTForObjectDetection - - forward - - image_guided_detection diff --git a/docs/source/en/model_doc/pegasus.md b/docs/source/en/model_doc/pegasus.md new file mode 100644 index 000000000000..14608aae31c9 --- /dev/null +++ b/docs/source/en/model_doc/pegasus.md @@ -0,0 +1,166 @@ + + +# Pegasus + +
**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title)
and assign @patrickvonplaten.


## Overview

The Pegasus model was proposed in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.

According to the abstract,

- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an
  input document and are generated together as one output sequence from the remaining sentences, similar to an
  extractive summary.
- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.

This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/google-research/pegasus).

Tips:

- Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization-specific pretraining objective, called Gap Sentence Generation (GSG).

  * MLM: encoder input tokens are randomly replaced by mask tokens and have to be predicted by the encoder (like in BERT)
  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, which has a causal mask to hide future words, like a regular auto-regressive transformer decoder.

## Checkpoints

All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tuned for summarization, besides
*pegasus-large*, from which the other checkpoints are fine-tuned:

- Each checkpoint is 2.2 GB on disk and has 568M parameters.
- FP16 is not supported (help/ideas on this appreciated!).
- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
- Full replication results and correctly pre-processed data can be found in this [Issue](https://github.com/huggingface/transformers/issues/6844#issue-689259666).
- [Distilled checkpoints](https://huggingface.co/models?search=distill-pegasus) are described in this [paper](https://arxiv.org/abs/2010.13002).

### Examples

- [Script](https://github.com/huggingface/transformers/tree/main/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus
  on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).
- FP16 is not supported (help/ideas on this appreciated!).
- The adafactor optimizer is recommended for pegasus fine-tuning.


## Implementation Notes

- All models are transformer encoder-decoders with 16 layers in each component.
- The implementation is completely inherited from [`BartForConditionalGeneration`].
- Some key configuration differences:

  - static, sinusoidal position embeddings
  - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
  - more beams are used (`num_beams=8`)
- All pretrained pegasus checkpoints are the same besides three attributes: `tokenizer.model_max_length` (maximum
  input size), `max_length` (the maximum number of tokens to generate) and `length_penalty`.
+- The code to convert checkpoints trained in the author's [repo](https://github.com/google-research/pegasus) can be + found in `convert_pegasus_tf_to_pytorch.py`. + + +## Usage Example + +```python +>>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer +>>> import torch + +>>> src_text = [ +... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" +... ] + +... model_name = "google/pegasus-xsum" +... device = "cuda" if torch.cuda.is_available() else "cpu" +... tokenizer = PegasusTokenizer.from_pretrained(model_name) +... model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device) +... batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device) +... translated = model.generate(**batch) +... tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) +... assert ( +... tgt_text[0] +... == "California's largest electricity provider has turned off power to hundreds of thousands of customers." +... ) +``` + +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## PegasusConfig + +[[autodoc]] PegasusConfig + +## PegasusTokenizer + +warning: `add_tokens` does not work at the moment. + +[[autodoc]] PegasusTokenizer + +## PegasusTokenizerFast + +[[autodoc]] PegasusTokenizerFast + +## PegasusModel + +[[autodoc]] PegasusModel + - forward + +## PegasusForConditionalGeneration + +[[autodoc]] PegasusForConditionalGeneration + - forward + +## PegasusForCausalLM + +[[autodoc]] PegasusForCausalLM + - forward + +## TFPegasusModel + +[[autodoc]] TFPegasusModel + - call + +## TFPegasusForConditionalGeneration + +[[autodoc]] TFPegasusForConditionalGeneration + - call + +## FlaxPegasusModel + +[[autodoc]] FlaxPegasusModel + - __call__ + - encode + - decode + +## FlaxPegasusForConditionalGeneration + +[[autodoc]] FlaxPegasusForConditionalGeneration + - __call__ + - encode + - decode diff --git a/docs/source/en/model_doc/pegasus.mdx b/docs/source/en/model_doc/pegasus.mdx deleted file mode 100644 index 52dd10b9bfa7..000000000000 --- a/docs/source/en/model_doc/pegasus.mdx +++ /dev/null @@ -1,141 +0,0 @@ - - -# Pegasus - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title) -and assign @patrickvonplaten. - - -## Overview - -The Pegasus model was proposed in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. - -According to the abstract, - -- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an - input document and are generated together as one output sequence from the remaining sentences, similar to an - extractive summary. -- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval. - -This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The Authors' code can be found [here](https://github.com/google-research/pegasus). 
- - -## Checkpoints - -All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tuned for summarization, besides -*pegasus-large*, whence the other checkpoints are fine-tuned: - -- Each checkpoint is 2.2 GB on disk and 568M parameters. -- FP16 is not supported (help/ideas on this appreciated!). -- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU. -- Full replication results and correctly pre-processed data can be found in this [Issue](https://github.com/huggingface/transformers/issues/6844#issue-689259666). -- [Distilled checkpoints](https://huggingface.co/models?search=distill-pegasus) are described in this [paper](https://arxiv.org/abs/2010.13002). - -### Examples - -- [Script](https://github.com/huggingface/transformers/tree/main/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh) to fine-tune pegasus - on the XSUM dataset. Data download instructions at [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). -- FP16 is not supported (help/ideas on this appreciated!). -- The adafactor optimizer is recommended for pegasus fine-tuning. - - -## Implementation Notes - -- All models are transformer encoder-decoders with 16 layers in each component. -- The implementation is completely inherited from [`BartForConditionalGeneration`] -- Some key configuration differences: - - - static, sinusoidal position embeddings - - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. - - more beams are used (`num_beams=8`) -- All pretrained pegasus checkpoints are the same besides three attributes: `tokenizer.model_max_length` (maximum - input size), `max_length` (the maximum number of tokens to generate) and `length_penalty`. -- The code to convert checkpoints trained in the author's [repo](https://github.com/google-research/pegasus) can be - found in `convert_pegasus_tf_to_pytorch.py`. - - -## Usage Example - -```python ->>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer ->>> import torch - ->>> src_text = [ -... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" -... ] - -... model_name = "google/pegasus-xsum" -... device = "cuda" if torch.cuda.is_available() else "cpu" -... tokenizer = PegasusTokenizer.from_pretrained(model_name) -... model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device) -... batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device) -... translated = model.generate(**batch) -... tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) -... assert ( -... tgt_text[0] -... == "California's largest electricity provider has turned off power to hundreds of thousands of customers." -... ) -``` - -## PegasusConfig - -[[autodoc]] PegasusConfig - -## PegasusTokenizer - -warning: `add_tokens` does not work at the moment. 
- -[[autodoc]] PegasusTokenizer - -## PegasusTokenizerFast - -[[autodoc]] PegasusTokenizerFast - -## PegasusModel - -[[autodoc]] PegasusModel - - forward - -## PegasusForConditionalGeneration - -[[autodoc]] PegasusForConditionalGeneration - - forward - -## PegasusForCausalLM - -[[autodoc]] PegasusForCausalLM - - forward - -## TFPegasusModel - -[[autodoc]] TFPegasusModel - - call - -## TFPegasusForConditionalGeneration - -[[autodoc]] TFPegasusForConditionalGeneration - - call - -## FlaxPegasusModel - -[[autodoc]] FlaxPegasusModel - - __call__ - - encode - - decode - -## FlaxPegasusForConditionalGeneration - -[[autodoc]] FlaxPegasusForConditionalGeneration - - __call__ - - encode - - decode diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md new file mode 100644 index 000000000000..a0fd670fc7c9 --- /dev/null +++ b/docs/source/en/model_doc/pegasus_x.md @@ -0,0 +1,54 @@ + + +# PEGASUS-X + +## Overview + +The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu. + +PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder. + +The abstract from the paper is the following: + +*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.* + +Tips: + +* PEGASUS-X uses the same tokenizer as PEGASUS. + +This model was contributed by [zphang]( - -# PEGASUS-X - -## Overview - -The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu. - -PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder. - -The abstract from the paper is the following: - -*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. 
Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.* - -Tips: - -* PEGASUS-X uses the same tokenizer as PEGASUS. - -This model was contributed by [zphang]( + +# Perceiver + +## Overview + +The Perceiver IO model was proposed in [Perceiver IO: A General Architecture for Structured Inputs & +Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, +Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. +Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. + +Perceiver IO is a generalization of [Perceiver](https://arxiv.org/abs/2103.03206) to handle arbitrary outputs in +addition to arbitrary inputs. The original Perceiver only produced a single classification label. In addition to +classification labels, Perceiver IO can produce (for example) language, optical flow, and multimodal videos with audio. +This is done using the same building blocks as the original Perceiver. The computational complexity of Perceiver IO is +linear in the input and output size and the bulk of the processing occurs in the latent space, allowing us to process +inputs and outputs that are much larger than can be handled by standard Transformers. This means, for example, +Perceiver IO can do BERT-style masked language modeling directly using bytes instead of tokenized inputs. + +The abstract from the paper is the following: + +*The recently-proposed Perceiver model obtains good results on several domains (images, audio, multimodal, point +clouds) while scaling linearly in compute and memory with the input size. While the Perceiver supports many kinds of +inputs, it can only produce very simple outputs such as class scores. Perceiver IO overcomes this limitation without +sacrificing the original's appealing properties by learning to flexibly query the model's latent space to produce +outputs of arbitrary size and semantics. Perceiver IO still decouples model depth from data size and still scales +linearly with data size, but now with respect to both input and output sizes. The full Perceiver IO model achieves +strong results on tasks with highly structured output spaces, such as natural language and visual understanding, +StarCraft II, and multi-task and multi-modal domains. As highlights, Perceiver IO matches a Transformer-based BERT +baseline on the GLUE language benchmark without the need for input tokenization and achieves state-of-the-art +performance on Sintel optical flow estimation.* + +Here's a TLDR explaining how Perceiver works: + +The main problem with the self-attention mechanism of the Transformer is that the time and memory requirements scale +quadratically with the sequence length. 
Hence, models like BERT and RoBERTa are limited to a max sequence length of 512 +tokens. Perceiver aims to solve this issue by, instead of performing self-attention on the inputs, perform it on a set +of latent variables, and only use the inputs for cross-attention. In this way, the time and memory requirements don't +depend on the length of the inputs anymore, as one uses a fixed amount of latent variables, like 256 or 512. These are +randomly initialized, after which they are trained end-to-end using backpropagation. + +Internally, [`PerceiverModel`] will create the latents, which is a tensor of shape `(batch_size, num_latents, +d_latents)`. One must provide `inputs` (which could be text, images, audio, you name it!) to the model, which it will +use to perform cross-attention with the latents. The output of the Perceiver encoder is a tensor of the same shape. One +can then, similar to BERT, convert the last hidden states of the latents to classification logits by averaging along +the sequence dimension, and placing a linear layer on top of that to project the `d_latents` to `num_labels`. + +This was the idea of the original Perceiver paper. However, it could only output classification logits. In a follow-up +work, PerceiverIO, they generalized it to let the model also produce outputs of arbitrary size. How, you might ask? The +idea is actually relatively simple: one defines outputs of an arbitrary size, and then applies cross-attention with the +last hidden states of the latents, using the outputs as queries, and the latents as keys and values. + +So let's say one wants to perform masked language modeling (BERT-style) with the Perceiver. As the Perceiver's input +length will not have an impact on the computation time of the self-attention layers, one can provide raw bytes, +providing `inputs` of length 2048 to the model. If one now masks out certain of these 2048 tokens, one can define the +`outputs` as being of shape: `(batch_size, 2048, 768)`. Next, one performs cross-attention with the final hidden states +of the latents to update the `outputs` tensor. After cross-attention, one still has a tensor of shape `(batch_size, +2048, 768)`. One can then place a regular language modeling head on top, to project the last dimension to the +vocabulary size of the model, i.e. creating logits of shape `(batch_size, 2048, 262)` (as Perceiver uses a vocabulary +size of 262 byte IDs). + + + + Perceiver IO architecture. Taken from the original paper + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found +[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver). + +Tips: + +- The quickest way to get started with the Perceiver is by checking the [tutorial + notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver). +- Refer to the [blog post](https://huggingface.co/blog/perceiver) if you want to fully understand how the model works and +is implemented in the library. Note that the models available in the library only showcase some examples of what you can do +with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, +audio classification, video classification, etc. 
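
To make the byte-level masked language modeling workflow described above a bit more tangible, here is a minimal sketch using the `deepmind/language-perceiver` checkpoint; the masked byte positions below are chosen purely for illustration:

```python
from transformers import PerceiverTokenizer, PerceiverForMaskedLM

tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver")

text = "This is an incomplete sentence where some words are missing."
# The tokenizer operates on raw UTF-8 bytes and pads up to the model's input length of 2048
encoding = tokenizer(text, padding="max_length", return_tensors="pt")

# Mask a span of byte positions (indices are illustrative)
encoding.input_ids[0, 52:61] = tokenizer.mask_token_id

outputs = model(inputs=encoding.input_ids, attention_mask=encoding.attention_mask)
print(outputs.logits.shape)  # torch.Size([1, 2048, 262]): one distribution over 262 byte IDs per input position
```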
+ +**Note**: + +- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Image classification task guide](../tasks/image_classification) + +## Perceiver specific outputs + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverDecoderOutput + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMaskedLMOutput + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassifierOutput + +## PerceiverConfig + +[[autodoc]] PerceiverConfig + +## PerceiverTokenizer + +[[autodoc]] PerceiverTokenizer + - __call__ + +## PerceiverFeatureExtractor + +[[autodoc]] PerceiverFeatureExtractor + - __call__ + +## PerceiverImageProcessor + +[[autodoc]] PerceiverImageProcessor + - preprocess + +## PerceiverTextPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverTextPreprocessor + +## PerceiverImagePreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverImagePreprocessor + +## PerceiverOneHotPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOneHotPreprocessor + +## PerceiverAudioPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor + +## PerceiverMultimodalPreprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor + +## PerceiverProjectionDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionDecoder + +## PerceiverBasicDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicDecoder + +## PerceiverClassificationDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationDecoder + +## PerceiverOpticalFlowDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder + +## PerceiverBasicVideoAutoencodingDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicVideoAutoencodingDecoder + +## PerceiverMultimodalDecoder + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder + +## PerceiverProjectionPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor + +## PerceiverAudioPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor + +## PerceiverClassificationPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor + +## PerceiverMultimodalPostprocessor + +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor + +## PerceiverModel + +[[autodoc]] PerceiverModel + - forward + +## PerceiverForMaskedLM + +[[autodoc]] PerceiverForMaskedLM + - forward + +## PerceiverForSequenceClassification + +[[autodoc]] PerceiverForSequenceClassification + - forward + +## PerceiverForImageClassificationLearned + +[[autodoc]] PerceiverForImageClassificationLearned + - forward + +## PerceiverForImageClassificationFourier + +[[autodoc]] PerceiverForImageClassificationFourier + - forward + +## PerceiverForImageClassificationConvProcessing + +[[autodoc]] PerceiverForImageClassificationConvProcessing + - forward + +## PerceiverForOpticalFlow + +[[autodoc]] PerceiverForOpticalFlow + - forward + +## PerceiverForMultimodalAutoencoding + +[[autodoc]] PerceiverForMultimodalAutoencoding + - 
forward diff --git a/docs/source/en/model_doc/perceiver.mdx b/docs/source/en/model_doc/perceiver.mdx deleted file mode 100644 index 52a928472c0d..000000000000 --- a/docs/source/en/model_doc/perceiver.mdx +++ /dev/null @@ -1,220 +0,0 @@ - - -# Perceiver - -## Overview - -The Perceiver IO model was proposed in [Perceiver IO: A General Architecture for Structured Inputs & -Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, -Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. -Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. - -Perceiver IO is a generalization of [Perceiver](https://arxiv.org/abs/2103.03206) to handle arbitrary outputs in -addition to arbitrary inputs. The original Perceiver only produced a single classification label. In addition to -classification labels, Perceiver IO can produce (for example) language, optical flow, and multimodal videos with audio. -This is done using the same building blocks as the original Perceiver. The computational complexity of Perceiver IO is -linear in the input and output size and the bulk of the processing occurs in the latent space, allowing us to process -inputs and outputs that are much larger than can be handled by standard Transformers. This means, for example, -Perceiver IO can do BERT-style masked language modeling directly using bytes instead of tokenized inputs. - -The abstract from the paper is the following: - -*The recently-proposed Perceiver model obtains good results on several domains (images, audio, multimodal, point -clouds) while scaling linearly in compute and memory with the input size. While the Perceiver supports many kinds of -inputs, it can only produce very simple outputs such as class scores. Perceiver IO overcomes this limitation without -sacrificing the original's appealing properties by learning to flexibly query the model's latent space to produce -outputs of arbitrary size and semantics. Perceiver IO still decouples model depth from data size and still scales -linearly with data size, but now with respect to both input and output sizes. The full Perceiver IO model achieves -strong results on tasks with highly structured output spaces, such as natural language and visual understanding, -StarCraft II, and multi-task and multi-modal domains. As highlights, Perceiver IO matches a Transformer-based BERT -baseline on the GLUE language benchmark without the need for input tokenization and achieves state-of-the-art -performance on Sintel optical flow estimation.* - -Here's a TLDR explaining how Perceiver works: - -The main problem with the self-attention mechanism of the Transformer is that the time and memory requirements scale -quadratically with the sequence length. Hence, models like BERT and RoBERTa are limited to a max sequence length of 512 -tokens. Perceiver aims to solve this issue by, instead of performing self-attention on the inputs, perform it on a set -of latent variables, and only use the inputs for cross-attention. In this way, the time and memory requirements don't -depend on the length of the inputs anymore, as one uses a fixed amount of latent variables, like 256 or 512. These are -randomly initialized, after which they are trained end-to-end using backpropagation. - -Internally, [`PerceiverModel`] will create the latents, which is a tensor of shape `(batch_size, num_latents, -d_latents)`. One must provide `inputs` (which could be text, images, audio, you name it!) 
to the model, which it will -use to perform cross-attention with the latents. The output of the Perceiver encoder is a tensor of the same shape. One -can then, similar to BERT, convert the last hidden states of the latents to classification logits by averaging along -the sequence dimension, and placing a linear layer on top of that to project the `d_latents` to `num_labels`. - -This was the idea of the original Perceiver paper. However, it could only output classification logits. In a follow-up -work, PerceiverIO, they generalized it to let the model also produce outputs of arbitrary size. How, you might ask? The -idea is actually relatively simple: one defines outputs of an arbitrary size, and then applies cross-attention with the -last hidden states of the latents, using the outputs as queries, and the latents as keys and values. - -So let's say one wants to perform masked language modeling (BERT-style) with the Perceiver. As the Perceiver's input -length will not have an impact on the computation time of the self-attention layers, one can provide raw bytes, -providing `inputs` of length 2048 to the model. If one now masks out certain of these 2048 tokens, one can define the -`outputs` as being of shape: `(batch_size, 2048, 768)`. Next, one performs cross-attention with the final hidden states -of the latents to update the `outputs` tensor. After cross-attention, one still has a tensor of shape `(batch_size, -2048, 768)`. One can then place a regular language modeling head on top, to project the last dimension to the -vocabulary size of the model, i.e. creating logits of shape `(batch_size, 2048, 262)` (as Perceiver uses a vocabulary -size of 262 byte IDs). - - - - Perceiver IO architecture. Taken from the original paper - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found -[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver). - -Tips: - -- The quickest way to get started with the Perceiver is by checking the [tutorial - notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver). -- Refer to the [blog post](https://huggingface.co/blog/perceiver) if you want to fully understand how the model works and -is implemented in the library. Note that the models available in the library only showcase some examples of what you can do -with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, -audio classification, video classification, etc. 
- -**Note**: - -- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - -## Perceiver specific outputs - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverDecoderOutput - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMaskedLMOutput - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassifierOutput - -## PerceiverConfig - -[[autodoc]] PerceiverConfig - -## PerceiverTokenizer - -[[autodoc]] PerceiverTokenizer - - __call__ - -## PerceiverFeatureExtractor - -[[autodoc]] PerceiverFeatureExtractor - - __call__ - -## PerceiverImageProcessor - -[[autodoc]] PerceiverImageProcessor - - preprocess - -## PerceiverTextPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverTextPreprocessor - -## PerceiverImagePreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverImagePreprocessor - -## PerceiverOneHotPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOneHotPreprocessor - -## PerceiverAudioPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor - -## PerceiverMultimodalPreprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor - -## PerceiverProjectionDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionDecoder - -## PerceiverBasicDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicDecoder - -## PerceiverClassificationDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationDecoder - -## PerceiverOpticalFlowDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder - -## PerceiverBasicVideoAutoencodingDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicVideoAutoencodingDecoder - -## PerceiverMultimodalDecoder - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder - -## PerceiverProjectionPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor - -## PerceiverAudioPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor - -## PerceiverClassificationPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor - -## PerceiverMultimodalPostprocessor - -[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor - -## PerceiverModel - -[[autodoc]] PerceiverModel - - forward - -## PerceiverForMaskedLM - -[[autodoc]] PerceiverForMaskedLM - - forward - -## PerceiverForSequenceClassification - -[[autodoc]] PerceiverForSequenceClassification - - forward - -## PerceiverForImageClassificationLearned - -[[autodoc]] PerceiverForImageClassificationLearned - - forward - -## PerceiverForImageClassificationFourier - -[[autodoc]] PerceiverForImageClassificationFourier - - forward - -## PerceiverForImageClassificationConvProcessing - -[[autodoc]] PerceiverForImageClassificationConvProcessing - - forward - -## PerceiverForOpticalFlow - -[[autodoc]] PerceiverForOpticalFlow - - forward - -## PerceiverForMultimodalAutoencoding - -[[autodoc]] PerceiverForMultimodalAutoencoding - - forward diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md new file mode 100644 index 000000000000..cf13d070c622 --- /dev/null +++ b/docs/source/en/model_doc/persimmon.md @@ -0,0 +1,96 @@ + + +# Persimmon + +## 
Overview
+
+The Persimmon model was created by [ADEPT](https://www.adept.ai/blog/persimmon-8b) and authored by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
+
+The authors introduced Persimmon-8B, a decoder model based on the classic transformers architecture, with query and key normalization. Persimmon-8B is a fully permissively-licensed model with approximately 8 billion parameters, released under the Apache license. Some of the key attributes of Persimmon-8B are its long context size (16K), performance, and capabilities for multimodal extensions.
+
+The authors showcase their approach to model evaluation, focusing on practical text generation, mirroring how users interact with language models. The work also includes a comparative analysis, pitting Persimmon-8B against other prominent models (MPT 7B Instruct and Llama 2 Base 7B 1-Shot) across various evaluation tasks. The results demonstrate Persimmon-8B's competitive performance, even with limited training data.
+
+In terms of model details, the work outlines the architecture and training methodology of Persimmon-8B, providing insights into its design choices, sequence length, and dataset composition. The authors present fast inference code that outperforms traditional implementations through operator fusion and CUDA graph utilization while maintaining code coherence. They express their anticipation of how the community will leverage this contribution to drive innovation, hinting at further upcoming releases as part of an ongoing series of developments.
+
+
+
+The `Persimmon` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the hub use `torch_dtype = 'float16'`, which will be used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
+
+The `dtype` of the online weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model with `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype="auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online) and then cast to the default `dtype` of `torch` (`torch.float32`). Users should specify the `torch_dtype` they want; if they don't, it will be `torch.float32`.
+
+Fine-tuning the model in `float16` is not recommended and is known to produce `nan`; the model should therefore be fine-tuned in `bfloat16`.
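As a minimal sketch of how the `torch_dtype` options above behave (here `"/output/path"` is an illustrative placeholder for the directory produced by the conversion script described in the tips below):

```python
>>> import torch
>>> from transformers import AutoModelForCausalLM

>>> # Inference: request float16 explicitly, matching the original inference setup
>>> model = AutoModelForCausalLM.from_pretrained("/output/path", torch_dtype=torch.float16)

>>> # Or keep whatever dtype is stored in the checkpoint's config
>>> model = AutoModelForCausalLM.from_pretrained("/output/path", torch_dtype="auto")

>>> # Fine-tuning: load in bfloat16 to avoid `nan` losses
>>> model = AutoModelForCausalLM.from_pretrained("/output/path", torch_dtype=torch.bfloat16)
```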
+
+
+Tips:
+
+- To convert the model, you need to clone the original repository using `git clone https://github.com/persimmon-ai-labs/adept-inference`, then get the checkpoints:
+
+```bash
+git clone https://github.com/persimmon-ai-labs/adept-inference
+wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar
+tar -xvf 8b_base_model_release.tar
+python src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py --input_dir /path/to/downloaded/persimmon/weights/ --output_dir /output/path \
+    --pt_model_path /path/to/8b_chat_model_release/iter_0001251/mp_rank_00/model_optim_rng.pt \
+    --ada_lib_path /path/to/adept-inference
+```
+
+For the chat model:
+```bash
+wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar
+tar -xvf 8b_chat_model_release.tar
+```
+
+Thereafter, models can be loaded via:
+
+```py
+from transformers import LlamaTokenizer, PersimmonForCausalLM
+
+model = PersimmonForCausalLM.from_pretrained("/output/path")
+tokenizer = LlamaTokenizer.from_pretrained("/output/path")
+```
+
+This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference).
+
+- Persimmon uses a `sentencepiece`-based tokenizer with a `Unigram` model. It supports byte fallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer. The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. The `chat` template will be updated with the templating functions in a follow-up PR!
+
+- The authors suggest using the following prompt format for chat mode: `f"human: {prompt}\n\nadept:"`
+
+
+## PersimmonConfig
+
+[[autodoc]] PersimmonConfig
+
+## PersimmonModel
+
+[[autodoc]] PersimmonModel
+    - forward
+
+## PersimmonForCausalLM
+
+[[autodoc]] PersimmonForCausalLM
+    - forward
+
+## PersimmonForSequenceClassification
+
+[[autodoc]] PersimmonForSequenceClassification
+    - forward
diff --git a/docs/source/en/model_doc/phobert.md b/docs/source/en/model_doc/phobert.md
new file mode 100644
index 000000000000..5543a9b3541a
--- /dev/null
+++ b/docs/source/en/model_doc/phobert.md
@@ -0,0 +1,57 @@
+
+# PhoBERT
+
+## Overview
+
+The PhoBERT model was proposed in [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf) by Dat Quoc Nguyen, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and Natural language inference.*
+
+Example of use:
+
+```python
+>>> import torch
+>>> from transformers import AutoModel, AutoTokenizer
+
+>>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+
+>>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+>>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+
+>>> input_ids = torch.tensor([tokenizer.encode(line)])
+
+>>> with torch.no_grad():
+...
features = phobert(input_ids) # Models outputs are now tuples + +>>> # With TensorFlow 2.0+: +>>> # from transformers import TFAutoModel +>>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") +``` + +This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT). + +## PhobertTokenizer + +[[autodoc]] PhobertTokenizer diff --git a/docs/source/en/model_doc/phobert.mdx b/docs/source/en/model_doc/phobert.mdx deleted file mode 100644 index 4ae9b0aa6251..000000000000 --- a/docs/source/en/model_doc/phobert.mdx +++ /dev/null @@ -1,53 +0,0 @@ - - -# PhoBERT - -## Overview - -The PhoBERT model was proposed in [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf) by Dat Quoc Nguyen, Anh Tuan Nguyen. - -The abstract from the paper is the following: - -*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual -language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent -best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple -Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and -Natural language inference.* - -Example of use: - -```python ->>> import torch ->>> from transformers import AutoModel, AutoTokenizer - ->>> phobert = AutoModel.from_pretrained("vinai/phobert-base") ->>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base") - ->>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED! ->>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ." - ->>> input_ids = torch.tensor([tokenizer.encode(line)]) - ->>> with torch.no_grad(): -... features = phobert(input_ids) # Models outputs are now tuples - ->>> # With TensorFlow 2.0+: ->>> # from transformers import TFAutoModel ->>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") -``` - -This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/PhoBERT). - -## PhobertTokenizer - -[[autodoc]] PhobertTokenizer diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md new file mode 100644 index 000000000000..b722a59b82e6 --- /dev/null +++ b/docs/source/en/model_doc/pix2struct.md @@ -0,0 +1,78 @@ + + +# Pix2Struct + +## Overview + +The Pix2Struct model was proposed in [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. + +The abstract from the paper is the following: + +> Visually-situated language is ubiquitous -- sources range from textbooks with diagrams to web pages with images and tables, to mobile apps with buttons and forms. Perhaps due to this diversity, previous work has typically relied on domain-specific recipes with limited sharing of the underlying data, model architectures, and objectives. We present Pix2Struct, a pretrained image-to-text model for purely visual language understanding, which can be finetuned on tasks containing visually-situated language. Pix2Struct is pretrained by learning to parse masked screenshots of web pages into simplified HTML. 
The web, with its richness of visual elements cleanly reflected in the HTML structure, provides a large source of pretraining data well suited to the diversity of downstream tasks. Intuitively, this objective subsumes common pretraining signals such as OCR, language modeling, image captioning. In addition to the novel pretraining strategy, we introduce a variable-resolution input representation and a more flexible integration of language and vision inputs, where language prompts such as questions are rendered directly on top of the input image. For the first time, we show that a single pretrained model can achieve state-of-the-art results in six out of nine tasks across four domains: documents, illustrations, user interfaces, and natural images. + +Tips: + +Pix2Struct has been fine tuned on a variety of tasks and datasets, ranging from image captioning, visual question answering (VQA) over different inputs (books, charts, science diagrams), captioning UI components etc. The full list can be found in Table 1 of the paper. +We therefore advise you to use these models for the tasks they have been fine tuned on. For instance, if you want to use Pix2Struct for UI captioning, you should use the model fine tuned on the UI dataset. If you want to use Pix2Struct for image captioning, you should use the model fine tuned on the natural images captioning dataset and so on. + +If you want to use the model to perform conditional text captioning, make sure to use the processor with `add_special_tokens=False`. + +This model was contributed by [ybelkada](https://huggingface.co/ybelkada). +The original code can be found [here](https://github.com/google-research/pix2struct). + +## Resources + +- [Fine-tuning Notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb) +- [All models](https://huggingface.co/models?search=pix2struct) + + +## Pix2StructConfig + +[[autodoc]] Pix2StructConfig + - from_text_vision_configs + +## Pix2StructTextConfig + +[[autodoc]] Pix2StructTextConfig + +## Pix2StructVisionConfig + +[[autodoc]] Pix2StructVisionConfig + +## Pix2StructProcessor + +[[autodoc]] Pix2StructProcessor + +## Pix2StructImageProcessor + +[[autodoc]] Pix2StructImageProcessor + - preprocess + +## Pix2StructTextModel + +[[autodoc]] Pix2StructTextModel + - forward + +## Pix2StructVisionModel + +[[autodoc]] Pix2StructVisionModel + - forward + +## Pix2StructForConditionalGeneration + +[[autodoc]] Pix2StructForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/plbart.md b/docs/source/en/model_doc/plbart.md new file mode 100644 index 000000000000..c9f502021485 --- /dev/null +++ b/docs/source/en/model_doc/plbart.md @@ -0,0 +1,119 @@ + + +# PLBart + +**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign +[@gchhablani](https://www.github.com/gchhablani). + +## Overview of PLBart + +The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +This is a BART-like model which can be used to perform code-summarization, code-generation, and code-translation tasks. The pre-trained model `plbart-base` has been trained using multilingual denoising task +on Java, Python and English. 
+
+According to the abstract:
+
+*Code summarization and generation empower conversion between programming language (PL) and natural language (NL), while code translation avails the migration of legacy code from one PL to another. This paper introduces PLBART, a sequence-to-sequence model capable of performing a broad spectrum of program and language understanding and generation tasks. PLBART is pre-trained on an extensive collection of Java and Python functions and associated NL text via denoising autoencoding. Experiments on code summarization in the English language, code generation, and code translation in seven programming languages show that PLBART outperforms or rivals state-of-the-art models. Moreover, experiments on discriminative tasks, e.g., program repair, clone detection, and vulnerable code detection, demonstrate PLBART's effectiveness in program understanding. Furthermore, analysis reveals that PLBART learns program syntax, style (e.g., identifier naming convention), logical flow (e.g., if block inside an else block is equivalent to else if block) that are crucial to program semantics and thus excels even with limited annotations.*
+
+This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The Authors' code can be found [here](https://github.com/wasiahmad/PLBART).
+
+### Training of PLBart
+
+PLBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for code-to-text, text-to-code, and code-to-code tasks. As the model is multilingual, it expects the sequences in a different format. A special language id token is added in both the source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
+
+However, in some fine-tuning cases where a single language is used, no language token is provided. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this.
+
+In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode the source text format when you pass texts as the first argument or with the keyword argument `text`, and will encode the target text format if it's passed with the `text_target` keyword argument.
+
+- Supervised training
+
+```python
+>>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer
+
+>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")
+>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
+>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
+>>> expected_translation_english = "Returns the maximum value of a b c."
+>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
+>>> model(**inputs)  # forward pass; with `text_target` set, the labels are included and a loss is returned
+```
+
+- Generation
+
+  While generating the target text, set the `decoder_start_token_id` to the target language id. The following
+  example shows how to translate Python to English using the `uclanlp/plbart-python-en_XX` model.
+ +```python +>>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer + +>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") +>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" +>>> inputs = tokenizer(example_python_phrase, return_tensors="pt") +>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX") +>>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"]) +>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +"Returns the maximum value of a b c." +``` + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## PLBartConfig + +[[autodoc]] PLBartConfig + +## PLBartTokenizer + +[[autodoc]] PLBartTokenizer + - build_inputs_with_special_tokens + +## PLBartModel + +[[autodoc]] PLBartModel + - forward + +## PLBartForConditionalGeneration + +[[autodoc]] PLBartForConditionalGeneration + - forward + +## PLBartForSequenceClassification + +[[autodoc]] PLBartForSequenceClassification + - forward + +## PLBartForCausalLM + +[[autodoc]] PLBartForCausalLM + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/plbart.mdx b/docs/source/en/model_doc/plbart.mdx deleted file mode 100644 index 0755bb9a56e1..000000000000 --- a/docs/source/en/model_doc/plbart.mdx +++ /dev/null @@ -1,108 +0,0 @@ - - -# PLBart - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -[@gchhablani](https://www.github.com/gchhablani). - -## Overview of PLBart - -The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -This is a BART-like model which can be used to perform code-summarization, code-generation, and code-translation tasks. The pre-trained model `plbart-base` has been trained using multilingual denoising task -on Java, Python and English. - -According to the abstract - -*Code summarization and generation empower conversion between programming language (PL) and natural language (NL), -while code translation avails the migration of legacy code from one PL to another. This paper introduces PLBART, -a sequence-to-sequence model capable of performing a broad spectrum of program and language understanding and generation tasks. -PLBART is pre-trained on an extensive collection of Java and Python functions and associated NL text via denoising autoencoding. -Experiments on code summarization in the English language, code generation, and code translation in seven programming languages -show that PLBART outperforms or rivals state-of-the-art models. Moreover, experiments on discriminative tasks, e.g., program -repair, clone detection, and vulnerable code detection, demonstrate PLBART's effectiveness in program understanding. 
-Furthermore, analysis reveals that PLBART learns program syntax, style (e.g., identifier naming convention), logical flow -(e.g., if block inside an else block is equivalent to else if block) that are crucial to program semantics and thus excels -even with limited annotations.* - -This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The Authors' code can be found [here](https://github.com/wasiahmad/PLBART). - -### Training of PLBart - -PLBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for code-to-text, text-to-code, code-to-code tasks. As the -model is multilingual it expects the sequences in a different format. A special language id token is added in both the -source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The -target text format is `[tgt_lang_code] X [eos]`. `bos` is never used. - -However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this. - -In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format -when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if -it's passed with the `text_target` keyword argument. - -- Supervised training - -```python ->>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer - ->>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python") ->>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" ->>> expected_translation_english = "Returns the maximum value of a b c." ->>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt") ->>> model(**inputs) -``` - -- Generation - - While generating the target text set the `decoder_start_token_id` to the target language id. The following - example shows how to translate Python to English using the `uclanlp/plbart-python-en_XX` model. - -```python ->>> from transformers import PLBartForConditionalGeneration, PLBartTokenizer - ->>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") ->>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" ->>> inputs = tokenizer(example_python_phrase, return_tensors="pt") ->>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX") ->>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"]) ->>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] -"Returns the maximum value of a b c." 
-``` - -## PLBartConfig - -[[autodoc]] PLBartConfig - -## PLBartTokenizer - -[[autodoc]] PLBartTokenizer - - build_inputs_with_special_tokens - -## PLBartModel - -[[autodoc]] PLBartModel - - forward - -## PLBartForConditionalGeneration - -[[autodoc]] PLBartForConditionalGeneration - - forward - -## PLBartForSequenceClassification - -[[autodoc]] PLBartForSequenceClassification - - forward - -## PLBartForCausalLM - -[[autodoc]] PLBartForCausalLM - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/poolformer.md b/docs/source/en/model_doc/poolformer.md new file mode 100644 index 000000000000..537c60bdbcf6 --- /dev/null +++ b/docs/source/en/model_doc/poolformer.md @@ -0,0 +1,81 @@ + + +# PoolFormer + +## Overview + +The PoolFormer model was proposed in [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Sea AI Labs. Instead of designing complicated token mixer to achieve SOTA performance, the target of this work is to demonstrate the competence of transformer models largely stem from the general architecture MetaFormer. + +The abstract from the paper is the following: + +*Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only the most basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.* + +The figure below illustrates the architecture of PoolFormer. Taken from the [original paper](https://arxiv.org/abs/2111.11418). + + + + +Tips: + +- PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. All checkpoints of the model can be found on the [hub](https://huggingface.co/models?other=poolformer). +- One can use [`PoolFormerImageProcessor`] to prepare images for the model. +- As most models, PoolFormer comes in different sizes, the details of which can be found in the table below. 
+ +| **Model variant** | **Depths** | **Hidden sizes** | **Params (M)** | **ImageNet-1k Top 1** | +| :---------------: | ------------- | ------------------- | :------------: | :-------------------: | +| s12 | [2, 2, 6, 2] | [64, 128, 320, 512] | 12 | 77.2 | +| s24 | [4, 4, 12, 4] | [64, 128, 320, 512] | 21 | 80.3 | +| s36 | [6, 6, 18, 6] | [64, 128, 320, 512] | 31 | 81.4 | +| m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | +| m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | + +This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with PoolFormer. + + + +- [`PoolFormerForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## PoolFormerConfig + +[[autodoc]] PoolFormerConfig + +## PoolFormerFeatureExtractor + +[[autodoc]] PoolFormerFeatureExtractor + - __call__ + +## PoolFormerImageProcessor + +[[autodoc]] PoolFormerImageProcessor + - preprocess + +## PoolFormerModel + +[[autodoc]] PoolFormerModel + - forward + +## PoolFormerForImageClassification + +[[autodoc]] PoolFormerForImageClassification + - forward diff --git a/docs/source/en/model_doc/poolformer.mdx b/docs/source/en/model_doc/poolformer.mdx deleted file mode 100644 index e04762626163..000000000000 --- a/docs/source/en/model_doc/poolformer.mdx +++ /dev/null @@ -1,66 +0,0 @@ - - -# PoolFormer - -## Overview - -The PoolFormer model was proposed in [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Sea AI Labs. Instead of designing complicated token mixer to achieve SOTA performance, the target of this work is to demonstrate the competence of transformer models largely stem from the general architecture MetaFormer. - -The abstract from the paper is the following: - -*Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only the most basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. 
The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.* - -The figure below illustrates the architecture of PoolFormer. Taken from the [original paper](https://arxiv.org/abs/2111.11418). - - - - -Tips: - -- PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. All checkpoints of the model can be found on the [hub](https://huggingface.co/models?other=poolformer). -- One can use [`PoolFormerImageProcessor`] to prepare images for the model. -- As most models, PoolFormer comes in different sizes, the details of which can be found in the table below. - -| **Model variant** | **Depths** | **Hidden sizes** | **Params (M)** | **ImageNet-1k Top 1** | -| :---------------: | ------------- | ------------------- | :------------: | :-------------------: | -| s12 | [2, 2, 6, 2] | [64, 128, 320, 512] | 12 | 77.2 | -| s24 | [4, 4, 12, 4] | [64, 128, 320, 512] | 21 | 80.3 | -| s36 | [6, 6, 18, 6] | [64, 128, 320, 512] | 31 | 81.4 | -| m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | -| m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | - -This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). - -## PoolFormerConfig - -[[autodoc]] PoolFormerConfig - -## PoolFormerFeatureExtractor - -[[autodoc]] PoolFormerFeatureExtractor - - __call__ - -## PoolFormerImageProcessor - -[[autodoc]] PoolFormerImageProcessor - - preprocess - -## PoolFormerModel - -[[autodoc]] PoolFormerModel - - forward - -## PoolFormerForImageClassification - -[[autodoc]] PoolFormerForImageClassification - - forward diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md new file mode 100644 index 000000000000..95fd83f19237 --- /dev/null +++ b/docs/source/en/model_doc/pop2piano.md @@ -0,0 +1,196 @@ + + +# Pop2Piano + +
+ +## Overview + +The Pop2Piano model was proposed in [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. + +Piano covers of pop music are widely enjoyed, but generating them from music is not a trivial task. It requires great +expertise with playing piano as well as knowing different characteristics and melodies of a song. With Pop2Piano you +can directly generate a cover from a song's audio waveform. It is the first model to directly generate a piano cover +from pop audio without melody and chord extraction modules. + +Pop2Piano is an encoder-decoder Transformer model based on [T5](https://arxiv.org/pdf/1910.10683.pdf). The input audio +is transformed to its waveform and passed to the encoder, which transforms it to a latent representation. The decoder +uses these latent representations to generate token ids in an autoregressive way. Each token id corresponds to one of four +different token types: time, velocity, note and 'special'. The token ids are then decoded to their equivalent MIDI file. + + +The abstract from the paper is the following: + +*Piano covers of pop music are enjoyed by many people. However, the +task of automatically generating piano covers of pop music is still +understudied. This is partly due to the lack of synchronized +{Pop, Piano Cover} data pairs, which made it challenging to apply +the latest data-intensive deep learning-based methods. To leverage +the power of the data-driven approach, we make a large amount of +paired and synchronized {Pop, Piano Cover} data using an automated +pipeline. In this paper, we present Pop2Piano, a Transformer network +that generates piano covers given waveforms of pop music. To the best +of our knowledge, this is the first model to generate a piano cover +directly from pop audio without using melody and chord extraction +modules. We show that Pop2Piano, trained with our dataset, is capable +of producing plausible piano covers.* + + +Tips: + +1. To use Pop2Piano, you will need to install the 🤗 Transformers library, as well as the following third party modules: +``` +pip install pretty-midi==0.2.9 essentia==2.1b6.dev1034 librosa scipy +``` +Please note that you may need to restart your runtime after installation. +2. Pop2Piano is an Encoder-Decoder based model like T5. +3. Pop2Piano can be used to generate midi-audio files for a given audio sequence. +4. Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to variety of different results. +5. Setting the sampling rate to 44.1 kHz when loading the audio file can give good performance. +6. Though Pop2Piano was mainly trained on Korean Pop music, it also does pretty well on other Western Pop or Hip Hop songs. + +This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). +The original code can be found [here](https://github.com/sweetcocoa/pop2piano). + +## Examples + +- Example using HuggingFace Dataset: + +```python +>>> from datasets import load_dataset +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") +>>> ds = load_dataset("sweetcocoa/pop2piano_ci", split="test") + +>>> inputs = processor( +... audio=ds["audio"][0]["array"], sampling_rate=ds["audio"][0]["sampling_rate"], return_tensors="pt" +... 
) +>>> model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"][0] +>>> tokenizer_output.write("./Outputs/midi_output.mid") +``` + +- Example using your own audio file: + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> audio, sr = librosa.load("", sr=44100) # feel free to change the sr to a suitable value. +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt") +>>> model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"][0] +>>> tokenizer_output.write("./Outputs/midi_output.mid") +``` + +- Example of processing multiple audio files in batch: + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> # feel free to change the sr to a suitable value. +>>> audio1, sr1 = librosa.load("", sr=44100) +>>> audio2, sr2 = librosa.load("", sr=44100) +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = processor(audio=[audio1, audio2], sampling_rate=[sr1, sr2], return_attention_mask=True, return_tensors="pt") +>>> # Since we now generating in batch(2 audios) we must pass the attention_mask +>>> model_output = model.generate( +... input_features=inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... composer="composer1", +... ) +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"] + +>>> # Since we now have 2 generated MIDI files +>>> tokenizer_output[0].write("./Outputs/midi_output1.mid") +>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") +``` + + +- Example of processing multiple audio files in batch (Using `Pop2PianoFeatureExtractor` and `Pop2PianoTokenizer`): + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoFeatureExtractor, Pop2PianoTokenizer + +>>> # feel free to change the sr to a suitable value. +>>> audio1, sr1 = librosa.load("", sr=44100) +>>> audio2, sr2 = librosa.load("", sr=44100) +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano") +>>> tokenizer = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = feature_extractor( +... audio=[audio1, audio2], +... sampling_rate=[sr1, sr2], +... return_attention_mask=True, +... return_tensors="pt", +... ) +>>> # Since we now generating in batch(2 audios) we must pass the attention_mask +>>> model_output = model.generate( +... input_features=inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... composer="composer1", +... ) +>>> tokenizer_output = tokenizer.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... 
)["pretty_midi_objects"] + +>>> # Since we now have 2 generated MIDI files +>>> tokenizer_output[0].write("./Outputs/midi_output1.mid") +>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") +``` + + +## Pop2PianoConfig + +[[autodoc]] Pop2PianoConfig + +## Pop2PianoFeatureExtractor + +[[autodoc]] Pop2PianoFeatureExtractor + - __call__ + +## Pop2PianoForConditionalGeneration + +[[autodoc]] Pop2PianoForConditionalGeneration + - forward + - generate + +## Pop2PianoTokenizer + +[[autodoc]] Pop2PianoTokenizer + - __call__ + +## Pop2PianoProcessor + +[[autodoc]] Pop2PianoProcessor + - __call__ diff --git a/docs/source/en/model_doc/prophetnet.md b/docs/source/en/model_doc/prophetnet.md new file mode 100644 index 000000000000..6ab0937da77e --- /dev/null +++ b/docs/source/en/model_doc/prophetnet.md @@ -0,0 +1,107 @@ + + +# ProphetNet + +
+
+
+**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
+@patrickvonplaten
+
+## Overview
+
+The ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou on 13 Jan, 2020.
+
+ProphetNet is an encoder-decoder model and can predict n future tokens for "ngram" language modeling instead of just the next token.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
+
+Tips:
+
+- ProphetNet is a model with absolute position embeddings, so it's usually advised to pad the inputs on the right rather than
+  the left.
+- The model architecture is based on the original Transformer, but replaces the “standard” self-attention mechanism in the decoder with a main self-attention mechanism and a self and n-stream (predict) self-attention mechanism.
+
+The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
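As a rough sketch of how the classes documented below fit together for abstractive summarization (the `microsoft/prophetnet-large-uncased-cnndm` checkpoint name is an assumption here; any ProphetNet sequence-to-sequence checkpoint can be substituted):

```python
>>> from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer

>>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
>>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")

>>> article = "The US Food and Drug Administration approved the first treatment for ..."
>>> inputs = tokenizer(article, return_tensors="pt")

>>> # Generation runs standard beam search; the future n-gram prediction heads are a (pre-)training objective only.
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=100, early_stopping=True)
>>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```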
+ +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## ProphetNetConfig + +[[autodoc]] ProphetNetConfig + +## ProphetNetTokenizer + +[[autodoc]] ProphetNetTokenizer + +## ProphetNet specific outputs + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput + +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput + +## ProphetNetModel + +[[autodoc]] ProphetNetModel + - forward + +## ProphetNetEncoder + +[[autodoc]] ProphetNetEncoder + - forward + +## ProphetNetDecoder + +[[autodoc]] ProphetNetDecoder + - forward + +## ProphetNetForConditionalGeneration + +[[autodoc]] ProphetNetForConditionalGeneration + - forward + +## ProphetNetForCausalLM + +[[autodoc]] ProphetNetForCausalLM + - forward diff --git a/docs/source/en/model_doc/prophetnet.mdx b/docs/source/en/model_doc/prophetnet.mdx deleted file mode 100644 index 14d0b3a92415..000000000000 --- a/docs/source/en/model_doc/prophetnet.mdx +++ /dev/null @@ -1,87 +0,0 @@ - - -# ProphetNet - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten - -## Overview - -The ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei -Zhang, Ming Zhou on 13 Jan, 2020. - -ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just -the next token. - -The abstract from the paper is the following: - -*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel -self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of -the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by -n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time -step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent -overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale -dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for -abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new -state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* - -Tips: - -- ProphetNet is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than - the left. - -The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). 
- - -## ProphetNetConfig - -[[autodoc]] ProphetNetConfig - -## ProphetNetTokenizer - -[[autodoc]] ProphetNetTokenizer - -## ProphetNet specific outputs - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput - -[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput - -## ProphetNetModel - -[[autodoc]] ProphetNetModel - - forward - -## ProphetNetEncoder - -[[autodoc]] ProphetNetEncoder - - forward - -## ProphetNetDecoder - -[[autodoc]] ProphetNetDecoder - - forward - -## ProphetNetForConditionalGeneration - -[[autodoc]] ProphetNetForConditionalGeneration - - forward - -## ProphetNetForCausalLM - -[[autodoc]] ProphetNetForCausalLM - - forward diff --git a/docs/source/en/model_doc/pvt.md b/docs/source/en/model_doc/pvt.md new file mode 100644 index 000000000000..4b297a33f8fc --- /dev/null +++ b/docs/source/en/model_doc/pvt.md @@ -0,0 +1,71 @@ + + +# Pyramid Vision Transformer (PVT) + +## Overview + +The PVT model was proposed in +[Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/abs/2102.12122) +by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. The PVT is a type of +vision transformer that utilizes a pyramid structure to make it an effective backbone for dense prediction tasks. Specifically +it allows for more fine-grained inputs (4 x 4 pixels per patch) to be used, while simultaneously shrinking the sequence length +of the Transformer as it deepens - reducing the computational cost. Additionally, a spatial-reduction attention (SRA) layer +is used to further reduce the resource consumption when learning high-resolution features. + +The abstract from the paper is the following: + +*Although convolutional neural networks (CNNs) have achieved great success in computer vision, this work investigates a +simpler, convolution-free backbone network useful for many dense prediction tasks. Unlike the recently proposed Vision +Transformer (ViT) that was designed for image classification specifically, we introduce the Pyramid Vision Transformer +(PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several +merits compared to current state of the arts. Different from ViT that typically yields low resolution outputs and +incurs high computational and memory costs, PVT not only can be trained on dense partitions of an image to achieve high +output resolution, which is important for dense prediction, but also uses a progressive shrinking pyramid to reduce the +computations of large feature maps. PVT inherits the advantages of both CNN and Transformer, making it a unified +backbone for various vision tasks without convolutions, where it can be used as a direct replacement for CNN backbones. +We validate PVT through extensive experiments, showing that it boosts the performance of many downstream tasks, including +object detection, instance and semantic segmentation. For example, with a comparable number of parameters, PVT+RetinaNet +achieves 40.4 AP on the COCO dataset, surpassing ResNet50+RetinNet (36.3 AP) by 4.1 absolute AP (see Figure 2). 
We hope that PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future research.*
+
+This model was contributed by [Xrenya](
+
+# QDQBERT
+
+## Overview
+
+The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
+
+The abstract from the paper is the following:
+
+*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of quantization parameters and evaluate their choices on a wide range of neural network models for different application domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are more difficult to quantize, such as MobileNets and BERT-large.*
+
+Tips:
+
+- The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
+  inputs and weights, (ii) matmul inputs, and (iii) residual add inputs in the BERT model.
+
+- QDQBERT requires the [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install it: `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
+
+- The QDQBERT model can be loaded from any checkpoint of a HuggingFace BERT model (for example *bert-base-uncased*) to
+  perform Quantization Aware Training/Post Training Quantization.
+
+- A complete example of using the QDQBERT model to perform Quantization Aware Training and Post Training Quantization for
+  the SQuAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/).
+
+This model was contributed by [shangz](https://huggingface.co/shangz).
+
+
+### Set default quantizers
+
+The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to BERT via `TensorQuantizer` from the [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to the [Pytorch Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details.
+
+Before creating the QDQBERT model, one has to set the default `QuantDescriptor` defining the default tensor quantizers.
+ +Example: + +```python +>>> import pytorch_quantization.nn as quant_nn +>>> from pytorch_quantization.tensor_quant import QuantDescriptor + +>>> # The default tensor quantizer is set to use Max calibration method +>>> input_desc = QuantDescriptor(num_bits=8, calib_method="max") +>>> # The default tensor quantizer is set to be per-channel quantization for weights +>>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) +>>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) +>>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) +``` + +### Calibration + +Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for +tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model: + +```python +>>> # Find the TensorQuantizer and enable calibration +>>> for name, module in model.named_modules(): +... if name.endswith("_input_quantizer"): +... module.enable_calib() +... module.disable_quant() # Use full precision data to calibrate + +>>> # Feeding data samples +>>> model(x) +>>> # ... + +>>> # Finalize calibration +>>> for name, module in model.named_modules(): +... if name.endswith("_input_quantizer"): +... module.load_calib_amax() +... module.enable_quant() + +>>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process +>>> model.cuda() + +>>> # Keep running the quantized model +>>> # ... +``` + +### Export to ONNX + +The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake +quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of +TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow +the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example: + +```python +>>> from pytorch_quantization.nn import TensorQuantizer + +>>> TensorQuantizer.use_fb_fake_quant = True + +>>> # Load the calibrated model +>>> ... +>>> # ONNX export +>>> torch.onnx.export(...) 
+``` + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## QDQBertConfig + +[[autodoc]] QDQBertConfig + +## QDQBertModel + +[[autodoc]] QDQBertModel + - forward + +## QDQBertLMHeadModel + +[[autodoc]] QDQBertLMHeadModel + - forward + +## QDQBertForMaskedLM + +[[autodoc]] QDQBertForMaskedLM + - forward + +## QDQBertForSequenceClassification + +[[autodoc]] QDQBertForSequenceClassification + - forward + +## QDQBertForNextSentencePrediction + +[[autodoc]] QDQBertForNextSentencePrediction + - forward + +## QDQBertForMultipleChoice + +[[autodoc]] QDQBertForMultipleChoice + - forward + +## QDQBertForTokenClassification + +[[autodoc]] QDQBertForTokenClassification + - forward + +## QDQBertForQuestionAnswering + +[[autodoc]] QDQBertForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/qdqbert.mdx b/docs/source/en/model_doc/qdqbert.mdx deleted file mode 100644 index df7b7bcee625..000000000000 --- a/docs/source/en/model_doc/qdqbert.mdx +++ /dev/null @@ -1,159 +0,0 @@ - - -# QDQBERT - -## Overview - -The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical -Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius -Micikevicius. - -The abstract from the paper is the following: - -*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by -taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of -quantization parameters and evaluate their choices on a wide range of neural network models for different application -domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration -by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is -able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are -more difficult to quantize, such as MobileNets and BERT-large.* - -Tips: - -- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer - inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model. - -- QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com` - -- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and - perform Quantization Aware Training/Post Training Quantization. - -- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for - SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/). - -This model was contributed by [shangz](https://huggingface.co/shangz). 
- - -### Set default quantizers - -QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by -`TensorQuantizer` in [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module -for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to [Pytorch -Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details. - -Before creating QDQBERT model, one has to set the default `QuantDescriptor` defining default tensor quantizers. - -Example: - -```python ->>> import pytorch_quantization.nn as quant_nn ->>> from pytorch_quantization.tensor_quant import QuantDescriptor - ->>> # The default tensor quantizer is set to use Max calibration method ->>> input_desc = QuantDescriptor(num_bits=8, calib_method="max") ->>> # The default tensor quantizer is set to be per-channel quantization for weights ->>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) ->>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) ->>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) -``` - -### Calibration - -Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for -tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model: - -```python ->>> # Find the TensorQuantizer and enable calibration ->>> for name, module in model.named_modules(): -... if name.endswith("_input_quantizer"): -... module.enable_calib() -... module.disable_quant() # Use full precision data to calibrate - ->>> # Feeding data samples ->>> model(x) ->>> # ... - ->>> # Finalize calibration ->>> for name, module in model.named_modules(): -... if name.endswith("_input_quantizer"): -... module.load_calib_amax() -... module.enable_quant() - ->>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process ->>> model.cuda() - ->>> # Keep running the quantized model ->>> # ... -``` - -### Export to ONNX - -The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake -quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of -TensorQuantizer to use Pytorch’s own fake quantization functions, fake quantized model can be exported to ONNX, follow -the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example: - -```python ->>> from pytorch_quantization.nn import TensorQuantizer - ->>> TensorQuantizer.use_fb_fake_quant = True - ->>> # Load the calibrated model ->>> ... ->>> # ONNX export ->>> torch.onnx.export(...) 
-``` - -## QDQBertConfig - -[[autodoc]] QDQBertConfig - -## QDQBertModel - -[[autodoc]] QDQBertModel - - forward - -## QDQBertLMHeadModel - -[[autodoc]] QDQBertLMHeadModel - - forward - -## QDQBertForMaskedLM - -[[autodoc]] QDQBertForMaskedLM - - forward - -## QDQBertForSequenceClassification - -[[autodoc]] QDQBertForSequenceClassification - - forward - -## QDQBertForNextSentencePrediction - -[[autodoc]] QDQBertForNextSentencePrediction - - forward - -## QDQBertForMultipleChoice - -[[autodoc]] QDQBertForMultipleChoice - - forward - -## QDQBertForTokenClassification - -[[autodoc]] QDQBertForTokenClassification - - forward - -## QDQBertForQuestionAnswering - -[[autodoc]] QDQBertForQuestionAnswering - - forward diff --git a/docs/source/en/model_doc/rag.md b/docs/source/en/model_doc/rag.md new file mode 100644 index 000000000000..b467c6169f66 --- /dev/null +++ b/docs/source/en/model_doc/rag.md @@ -0,0 +1,108 @@ + + +# RAG + +
+<!-- Hub badges: Models -->
+ +## Overview + +Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and +sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate +outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing +both retrieval and generation to adapt to downstream tasks. + +It is based on the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir +Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. + +The abstract from the paper is the following: + +*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve +state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely +manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind +task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge +remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric +memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a +general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained +parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a +pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a +pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages +across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our +models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks, +outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation +tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art +parametric-only seq2seq baseline.* + +This model was contributed by [ola13](https://huggingface.co/ola13). + +Tips: +- Retrieval-augmented generation (“RAG”) models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation to adapt to downstream tasks. 
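
As a quick end-to-end illustration, the snippet below wires the classes documented on this page together for generation. It is only a sketch: it assumes the `facebook/rag-sequence-nq` checkpoint and uses the small dummy retrieval index (`use_dummy_dataset=True`) instead of the full Wikipedia index, so the generations will not be representative.

```python
from transformers import RagRetriever, RagSequenceForGeneration, RagTokenizer

# Load the tokenizer, a (dummy) retrieval index, and the retrieval-augmented generator.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

# Encode a question, retrieve supporting documents, and generate an answer.
inputs = tokenizer("who holds the record in 100m freestyle", return_tensors="pt")
generated_ids = model.generate(input_ids=inputs["input_ids"])
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
```
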
+ +## RagConfig + +[[autodoc]] RagConfig + +## RagTokenizer + +[[autodoc]] RagTokenizer + +## Rag specific outputs + +[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput + +[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput + +## RagRetriever + +[[autodoc]] RagRetriever + +## RagModel + +[[autodoc]] RagModel + - forward + +## RagSequenceForGeneration + +[[autodoc]] RagSequenceForGeneration + - forward + - generate + +## RagTokenForGeneration + +[[autodoc]] RagTokenForGeneration + - forward + - generate + +## TFRagModel + +[[autodoc]] TFRagModel + - call + +## TFRagSequenceForGeneration + +[[autodoc]] TFRagSequenceForGeneration + - call + - generate + +## TFRagTokenForGeneration + +[[autodoc]] TFRagTokenForGeneration + - call + - generate diff --git a/docs/source/en/model_doc/rag.mdx b/docs/source/en/model_doc/rag.mdx deleted file mode 100644 index 2f5d3498d8cf..000000000000 --- a/docs/source/en/model_doc/rag.mdx +++ /dev/null @@ -1,96 +0,0 @@ - - -# RAG - -## Overview - -Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and -sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate -outputs. The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing -both retrieval and generation to adapt to downstream tasks. - -It is based on the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir -Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. - -The abstract from the paper is the following: - -*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve -state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely -manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind -task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge -remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric -memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a -general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained -parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a -pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a -pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages -across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our -models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks, -outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation -tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art -parametric-only seq2seq baseline.* - -This model was contributed by [ola13](https://huggingface.co/ola13). 
- - -## RagConfig - -[[autodoc]] RagConfig - -## RagTokenizer - -[[autodoc]] RagTokenizer - -## Rag specific outputs - -[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput - -[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput - -## RagRetriever - -[[autodoc]] RagRetriever - -## RagModel - -[[autodoc]] RagModel - - forward - -## RagSequenceForGeneration - -[[autodoc]] RagSequenceForGeneration - - forward - - generate - -## RagTokenForGeneration - -[[autodoc]] RagTokenForGeneration - - forward - - generate - -## TFRagModel - -[[autodoc]] TFRagModel - - call - -## TFRagSequenceForGeneration - -[[autodoc]] TFRagSequenceForGeneration - - call - - generate - -## TFRagTokenForGeneration - -[[autodoc]] TFRagTokenForGeneration - - call - - generate diff --git a/docs/source/en/model_doc/realm.md b/docs/source/en/model_doc/realm.md new file mode 100644 index 000000000000..a8227bc83c73 --- /dev/null +++ b/docs/source/en/model_doc/realm.md @@ -0,0 +1,89 @@ + + +# REALM + +## Overview + +The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a +retrieval-augmented language model that firstly retrieves documents from a textual knowledge corpus and then +utilizes retrieved documents to process question answering tasks. + +The abstract from the paper is the following: + +*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks +such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, +requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we +augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend +over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the +first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language +modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We +demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the +challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both +explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous +methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as +interpretability and modularity.* + +This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found +[here](https://github.com/google-research/language/tree/master/language/realm). 
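
The sketch below shows how the pieces documented on this page are intended to fit together for open-domain QA. It is illustrative only and assumes the `google/realm-orqa-nq-openqa` checkpoint; check the [`RealmForOpenQA`] and [`RealmRetriever`] references below for the exact call signatures.

```python
from transformers import RealmForOpenQA, RealmRetriever, RealmTokenizer

# The retriever holds the pre-computed block records; the open-QA model wraps
# the scorer and reader components around it.
retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa")
tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa", retriever=retriever)

question = "Who is the pioneer in modern computer science?"
question_ids = tokenizer([question], return_tensors="pt")
# Reference answers are only needed to compute the reader loss.
answer_ids = tokenizer(
    ["alan mathison turing"],
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
).input_ids

reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False)
print(tokenizer.decode(predicted_answer_ids))
```
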
+ +## RealmConfig + +[[autodoc]] RealmConfig + +## RealmTokenizer + +[[autodoc]] RealmTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + - batch_encode_candidates + +## RealmTokenizerFast + +[[autodoc]] RealmTokenizerFast + - batch_encode_candidates + +## RealmRetriever + +[[autodoc]] RealmRetriever + +## RealmEmbedder + +[[autodoc]] RealmEmbedder + - forward + +## RealmScorer + +[[autodoc]] RealmScorer + - forward + +## RealmKnowledgeAugEncoder + +[[autodoc]] RealmKnowledgeAugEncoder + - forward + +## RealmReader + +[[autodoc]] RealmReader + - forward + +## RealmForOpenQA + +[[autodoc]] RealmForOpenQA + - block_embedding_to + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/realm.mdx b/docs/source/en/model_doc/realm.mdx deleted file mode 100644 index 545b1e0a3bf8..000000000000 --- a/docs/source/en/model_doc/realm.mdx +++ /dev/null @@ -1,85 +0,0 @@ - - -# REALM - -## Overview - -The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a -retrieval-augmented language model that firstly retrieves documents from a textual knowledge corpus and then -utilizes retrieved documents to process question answering tasks. - -The abstract from the paper is the following: - -*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks -such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, -requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we -augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend -over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the -first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language -modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We -demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the -challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both -explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous -methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as -interpretability and modularity.* - -This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found -[here](https://github.com/google-research/language/tree/master/language/realm). 
- -## RealmConfig - -[[autodoc]] RealmConfig - -## RealmTokenizer - -[[autodoc]] RealmTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - - batch_encode_candidates - -## RealmTokenizerFast - -[[autodoc]] RealmTokenizerFast - - batch_encode_candidates - -## RealmRetriever - -[[autodoc]] RealmRetriever - -## RealmEmbedder - -[[autodoc]] RealmEmbedder - - forward - -## RealmScorer - -[[autodoc]] RealmScorer - - forward - -## RealmKnowledgeAugEncoder - -[[autodoc]] RealmKnowledgeAugEncoder - - forward - -## RealmReader - -[[autodoc]] RealmReader - - forward - -## RealmForOpenQA - -[[autodoc]] RealmForOpenQA - - block_embedding_to - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/reformer.md b/docs/source/en/model_doc/reformer.md new file mode 100644 index 000000000000..05274c7667b7 --- /dev/null +++ b/docs/source/en/model_doc/reformer.md @@ -0,0 +1,201 @@ + + +# Reformer + +
+<!-- Hub badges: Models, Spaces -->

**DISCLAIMER:** This model is still a work in progress; if you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title).

## Overview

The Reformer model was proposed in the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451.pdf) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.

The abstract from the paper is the following:

*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can
be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of
Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its
complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual
layers instead of the standard residuals, which allows storing activations only once in the training process instead of
N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models
while being much more memory-efficient and much faster on long sequences.*

This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be
found [here](https://github.com/google/trax/tree/master/trax/models/reformer).

Tips:

- Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035).
- Use Axial position encoding (see below for more details). It’s a mechanism to avoid having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller matrices.
- Replace traditional attention by LSH (locality-sensitive hashing) attention (see below for more details). It’s a technique to avoid computing the full query-key product in the attention layers.
- Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during the backward pass (subtracting the residuals from the input of the next layer gives them back) or by recomputing them for results inside a given layer (less efficient than storing them but saves memory).
- Compute the feedforward operations by chunks and not on the whole batch.

## Axial Positional Encodings

Axial Positional Encodings were first implemented in Google's [trax library](https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29)
and developed by the authors of this model's paper. In models that process very long input sequences, the
conventional position id encodings store an embeddings vector of size \\(d\\), being the `config.hidden_size`, for
every position \\(i, \ldots, n_s\\), with \\(n_s\\) being `config.max_embedding_size`. This means that having
a sequence length of \\(n_s = 2^{19} \approx 0.5M\\) and a `config.hidden_size` of \\(d = 2^{10} \approx 1000\\)
would result in a position encoding matrix:

$$X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right]$$

which alone has over 500M parameters to store.
Axial positional encodings factorize \\(X_{i,j}\\) into two matrices:

$$X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right]$$

and

$$X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right]$$

with:

$$d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .$$

Therefore the following holds:

$$X_{i,j} = \begin{cases}
X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
\end{cases}$$

Intuitively, this means that a position embedding vector \\(x_j \in \mathbb{R}^{d}\\) is now the composition of two
factorized embedding vectors: \\(x^1_{k, l} + x^2_{l, k}\\), where the `config.max_embedding_size` dimension
\\(j\\) is factorized into \\(k \text{ and } l\\). This design ensures that each position embedding vector
\\(x_j\\) is unique.

Using the above example again, axial position encoding with \\(d^1 = 2^9, d^2 = 2^9, n_s^1 = 2^9, n_s^2 = 2^{10}\\)
can drastically reduce the number of parameters from 500 000 000 to \\(2^{18} + 2^{19} \approx 780 000\\) parameters,
i.e. roughly 99.85% less memory for the position encodings.

In practice, the parameter `config.axial_pos_embds_dim` is set to a tuple \\((d^1, d^2)\\) whose sum has to be
equal to `config.hidden_size` and `config.axial_pos_shape` is set to a tuple \\((n_s^1, n_s^2)\\) whose
product has to be equal to `config.max_embedding_size`, which during training has to be equal to the *sequence
length* of the `input_ids`.


## LSH Self Attention

In locality-sensitive hashing (LSH) self attention, the key and query projection weights are tied. Therefore, the key
query embedding vectors are also tied. LSH self attention uses the locality-sensitive hashing mechanism proposed in
[Practical and Optimal LSH for Angular Distance](https://arxiv.org/abs/1509.02897) to assign each of the tied key
query embedding vectors to one of `config.num_buckets` possible buckets. The premise is that the more "similar"
key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to
the same bucket.

The accuracy of the LSH mechanism can be improved by increasing `config.num_hashes` or directly the argument
`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output
of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks
each of length `config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to their key vectors
(which are tied to themselves) and to the key embedding vectors of `config.lsh_num_chunks_before` previous
neighboring chunks and `config.lsh_num_chunks_after` following neighboring chunks.

For more information, see the [original Paper](https://arxiv.org/abs/2001.04451) or this great [blog post](https://www.pragmatic.ml/reformer-deep-dive/).

Note that `config.num_buckets` can also be factorized into a list \\((n_{\text{buckets}}^1,
n_{\text{buckets}}^2)\\). This way, instead of assigning the query key embedding vectors to one of \\((1,\ldots,
n_{\text{buckets}})\\), they are assigned to one of \\((1-1,\ldots, n_{\text{buckets}}^1-1, \ldots,
1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)\\). This is crucial for very long sequences to
save memory.
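
As a rough illustration of how these settings interact, here is a configuration sketch. It is only an example: the argument names should be double-checked against the [`ReformerConfig`] reference below, and the values are chosen purely to satisfy the constraints described above.

```python
from transformers import ReformerConfig, ReformerModel

config = ReformerConfig(
    hidden_size=256,
    axial_pos_embds=True,
    # The two axial dims must sum to hidden_size: 64 + 192 = 256.
    axial_pos_embds_dim=(64, 192),
    # The two axial shape factors must multiply to the training sequence length: 64 * 64 = 4096.
    axial_pos_shape=(64, 64),
    max_position_embeddings=4096,
    # Alternate local and LSH attention layers.
    attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
    num_hashes=2,      # more hashes -> LSH attention closer to full self attention
    num_buckets=None,  # let a suitable value be computed from the sequence length
)
model = ReformerModel(config)
```
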
+ +When training a model from scratch, it is recommended to leave `config.num_buckets=None`, so that depending on the +sequence length a good value for `num_buckets` is calculated on the fly. This value will then automatically be +saved in the config and should be reused for inference. + +Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from +\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory +and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. + + +## Local Self Attention + +Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is +chunked so that in each chunk of length `config.local_chunk_length` the query embedding vectors only attends to +the key embedding vectors in its chunk and to the key embedding vectors of `config.local_num_chunks_before` +previous neighboring chunks and `config.local_num_chunks_after` following neighboring chunks. + +Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from +\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory +and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. + + +## Training + +During training, we must ensure that the sequence length is set to a value that can be divided by the least common +multiple of `config.lsh_chunk_length` and `config.local_chunk_length` and that the parameters of the Axial +Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can +easily be trained on sequences as long as 64000 tokens. + +For training, the [`ReformerModelWithLMHead`] should be used as follows: + +```python +input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt") +loss = model(input_ids, labels=input_ids)[0] +``` + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) + +## ReformerConfig + +[[autodoc]] ReformerConfig + +## ReformerTokenizer + +[[autodoc]] ReformerTokenizer + - save_vocabulary + +## ReformerTokenizerFast + +[[autodoc]] ReformerTokenizerFast + +## ReformerModel + +[[autodoc]] ReformerModel + - forward + +## ReformerModelWithLMHead + +[[autodoc]] ReformerModelWithLMHead + - forward + +## ReformerForMaskedLM + +[[autodoc]] ReformerForMaskedLM + - forward + +## ReformerForSequenceClassification + +[[autodoc]] ReformerForSequenceClassification + - forward + +## ReformerForQuestionAnswering + +[[autodoc]] ReformerForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/reformer.mdx b/docs/source/en/model_doc/reformer.mdx deleted file mode 100644 index 777a333e7b1f..000000000000 --- a/docs/source/en/model_doc/reformer.mdx +++ /dev/null @@ -1,177 +0,0 @@ - - -# Reformer - -**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). 
- -## Overview - -The Reformer model was proposed in the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451.pdf) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. - -The abstract from the paper is the following: - -*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can -be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of -Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its -complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual -layers instead of the standard residuals, which allows storing activations only once in the training process instead of -N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models -while being much more memory-efficient and much faster on long sequences.* - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/google/trax/tree/master/trax/models/reformer). - -**Note**: - -- Reformer does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - -## Axial Positional Encodings - -Axial Positional Encodings were first implemented in Google's [trax library](https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29) -and developed by the authors of this model's paper. In models that are treating very long input sequences, the -conventional position id encodings store an embedings vector of size \\(d\\) being the `config.hidden_size` for -every position \\(i, \ldots, n_s\\), with \\(n_s\\) being `config.max_embedding_size`. This means that having -a sequence length of \\(n_s = 2^{19} \approx 0.5M\\) and a `config.hidden_size` of \\(d = 2^{10} \approx 1000\\) -would result in a position encoding matrix: - -$$X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right]$$ - -which alone has over 500M parameters to store. Axial positional encodings factorize \\(X_{i,j}\\) into two matrices: - -$$X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right]$$ - -and - -$$X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right]$$ - -with: - -$$d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .$$ - -Therefore the following holds: - -$$X_{i,j} = \begin{cases} -X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\ -X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor -\end{cases}$$ - -Intuitively, this means that a position embedding vector \\(x_j \in \mathbb{R}^{d}\\) is now the composition of two -factorized embedding vectors: \\(x^1_{k, l} + x^2_{l, k}\\), where as the `config.max_embedding_size` dimension -\\(j\\) is factorized into \\(k \text{ and } l\\). This design ensures that each position embedding vector -\\(x_j\\) is unique. - -Using the above example again, axial position encoding with \\(d^1 = 2^5, d^2 = 2^5, n_s^1 = 2^9, n_s^2 = 2^{10}\\) -can drastically reduced the number of parameters to \\(2^{14} + 2^{15} \approx 49000\\) parameters. 
- -In practice, the parameter `config.axial_pos_embds_dim` is set to a tuple \\((d^1, d^2)\\) which sum has to be -equal to `config.hidden_size` and `config.axial_pos_shape` is set to a tuple \\((n_s^1, n_s^2)\\) which -product has to be equal to `config.max_embedding_size`, which during training has to be equal to the *sequence -length* of the `input_ids`. - - -## LSH Self Attention - -In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key -query embedding vectors are also tied. LSH self attention uses the locality sensitive hashing mechanism proposed in -[Practical and Optimal LSH for Angular Distance](https://arxiv.org/abs/1509.02897) to assign each of the tied key -query embedding vectors to one of `config.num_buckets` possible buckets. The premise is that the more "similar" -key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to -the same bucket. - -The accuracy of the LSH mechanism can be improved by increasing `config.num_hashes` or directly the argument -`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output -of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks -each of length `config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to its key vectors -(which are tied to themselves) and to the key embedding vectors of `config.lsh_num_chunks_before` previous -neighboring chunks and `config.lsh_num_chunks_after` following neighboring chunks. - -For more information, see the [original Paper](https://arxiv.org/abs/2001.04451) or this great [blog post](https://www.pragmatic.ml/reformer-deep-dive/). - -Note that `config.num_buckets` can also be factorized into a list \\((n_{\text{buckets}}^1, -n_{\text{buckets}}^2)\\). This way instead of assigning the query key embedding vectors to one of \\((1,\ldots, -n_{\text{buckets}})\\) they are assigned to one of \\((1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, -1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)\\). This is crucial for very long sequences to -save memory. - -When training a model from scratch, it is recommended to leave `config.num_buckets=None`, so that depending on the -sequence length a good value for `num_buckets` is calculated on the fly. This value will then automatically be -saved in the config and should be reused for inference. - -Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from -\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory -and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. - - -## Local Self Attention - -Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is -chunked so that in each chunk of length `config.local_chunk_length` the query embedding vectors only attends to -the key embedding vectors in its chunk and to the key embedding vectors of `config.local_num_chunks_before` -previous neighboring chunks and `config.local_num_chunks_after` following neighboring chunks. 
- -Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from -\\(\mathcal{O}(n_s \times n_s)\\) to \\(\mathcal{O}(n_s \times \log(n_s))\\), which usually represents the memory -and time bottleneck in a transformer model, with \\(n_s\\) being the sequence length. - - -## Training - -During training, we must ensure that the sequence length is set to a value that can be divided by the least common -multiple of `config.lsh_chunk_length` and `config.local_chunk_length` and that the parameters of the Axial -Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can -easily be trained on sequences as long as 64000 tokens. - -For training, the [`ReformerModelWithLMHead`] should be used as follows: - -```python -input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt") -loss = model(input_ids, labels=input_ids)[0] -``` - -## ReformerConfig - -[[autodoc]] ReformerConfig - -## ReformerTokenizer - -[[autodoc]] ReformerTokenizer - - save_vocabulary - -## ReformerTokenizerFast - -[[autodoc]] ReformerTokenizerFast - -## ReformerModel - -[[autodoc]] ReformerModel - - forward - -## ReformerModelWithLMHead - -[[autodoc]] ReformerModelWithLMHead - - forward - -## ReformerForMaskedLM - -[[autodoc]] ReformerForMaskedLM - - forward - -## ReformerForSequenceClassification - -[[autodoc]] ReformerForSequenceClassification - - forward - -## ReformerForQuestionAnswering - -[[autodoc]] ReformerForQuestionAnswering - - forward diff --git a/docs/source/en/model_doc/regnet.md b/docs/source/en/model_doc/regnet.md new file mode 100644 index 000000000000..89e89459bd7f --- /dev/null +++ b/docs/source/en/model_doc/regnet.md @@ -0,0 +1,86 @@ + + +# RegNet + +## Overview + +The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. + +The authors design search spaces to perform Neural Architecture Search (NAS). They first start from a high dimensional search space and iteratively reduce the search space by empirically applying constraints based on the best-performing models sampled by the current search space. + +The abstract from the paper is the following: + +*In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. 
Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.* + +Tips: + +- One can use [`AutoImageProcessor`] to prepare images for the model. +- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) + +This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of the model +was contributed by [sayakpaul](https://huggingface.com/sayakpaul) and [ariG23498](https://huggingface.com/ariG23498). +The original code can be found [here](https://github.com/facebookresearch/pycls). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RegNet. + + + +- [`RegNetForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## RegNetConfig + +[[autodoc]] RegNetConfig + + +## RegNetModel + +[[autodoc]] RegNetModel + - forward + + +## RegNetForImageClassification + +[[autodoc]] RegNetForImageClassification + - forward + +## TFRegNetModel + +[[autodoc]] TFRegNetModel + - call + + +## TFRegNetForImageClassification + +[[autodoc]] TFRegNetForImageClassification + - call + + +## FlaxRegNetModel + +[[autodoc]] FlaxRegNetModel + - __call__ + + +## FlaxRegNetForImageClassification + +[[autodoc]] FlaxRegNetForImageClassification + - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/regnet.mdx b/docs/source/en/model_doc/regnet.mdx deleted file mode 100644 index a426ad8fa146..000000000000 --- a/docs/source/en/model_doc/regnet.mdx +++ /dev/null @@ -1,60 +0,0 @@ - - -# RegNet - -## Overview - -The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. - -The authors design search spaces to perform Neural Architecture Search (NAS). They first start from a high dimensional search space and iteratively reduce the search space by empirically applying constraints based on the best-performing models sampled by the current search space. - -The abstract from the paper is the following: - -*In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. 
The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.* - -Tips: - -- One can use [`AutoImageProcessor`] to prepare images for the model. -- The huge 10B model from [Self-supervised Pretraining of Visual Features in the Wild](https://arxiv.org/abs/2103.01988), trained on one billion Instagram images, is available on the [hub](https://huggingface.co/facebook/regnet-y-10b-seer) - -This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of the model -was contributed by [sayakpaul](https://huggingface.com/sayakpaul) and [ariG23498](https://huggingface.com/ariG23498). -The original code can be found [here](https://github.com/facebookresearch/pycls). - - -## RegNetConfig - -[[autodoc]] RegNetConfig - - -## RegNetModel - -[[autodoc]] RegNetModel - - forward - - -## RegNetForImageClassification - -[[autodoc]] RegNetForImageClassification - - forward - -## TFRegNetModel - -[[autodoc]] TFRegNetModel - - call - - -## TFRegNetForImageClassification - -[[autodoc]] TFRegNetForImageClassification - - call \ No newline at end of file diff --git a/docs/source/en/model_doc/rembert.md b/docs/source/en/model_doc/rembert.md new file mode 100644 index 000000000000..b2e4d0f5adae --- /dev/null +++ b/docs/source/en/model_doc/rembert.md @@ -0,0 +1,141 @@ + + +# RemBERT + +## Overview + +The RemBERT model was proposed in [Rethinking Embedding Coupling in Pre-trained Language Models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder. + +The abstract from the paper is the following: + +*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art +pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to +significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By +reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on +standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that +allocating additional capacity to the output embedding provides benefits to the model that persist through the +fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger +output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage +Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these +findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the +number of parameters at the fine-tuning stage.* + +Tips: + +For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the +embedding layer. 
The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input +embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is +also similar to the Albert one rather than the BERT one. + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RemBertConfig + +[[autodoc]] RemBertConfig + +## RemBertTokenizer + +[[autodoc]] RemBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RemBertTokenizerFast + +[[autodoc]] RemBertTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RemBertModel + +[[autodoc]] RemBertModel + - forward + +## RemBertForCausalLM + +[[autodoc]] RemBertForCausalLM + - forward + +## RemBertForMaskedLM + +[[autodoc]] RemBertForMaskedLM + - forward + +## RemBertForSequenceClassification + +[[autodoc]] RemBertForSequenceClassification + - forward + +## RemBertForMultipleChoice + +[[autodoc]] RemBertForMultipleChoice + - forward + +## RemBertForTokenClassification + +[[autodoc]] RemBertForTokenClassification + - forward + +## RemBertForQuestionAnswering + +[[autodoc]] RemBertForQuestionAnswering + - forward + +## TFRemBertModel + +[[autodoc]] TFRemBertModel + - call + +## TFRemBertForMaskedLM + +[[autodoc]] TFRemBertForMaskedLM + - call + +## TFRemBertForCausalLM + +[[autodoc]] TFRemBertForCausalLM + - call + +## TFRemBertForSequenceClassification + +[[autodoc]] TFRemBertForSequenceClassification + - call + +## TFRemBertForMultipleChoice + +[[autodoc]] TFRemBertForMultipleChoice + - call + +## TFRemBertForTokenClassification + +[[autodoc]] TFRemBertForTokenClassification + - call + +## TFRemBertForQuestionAnswering + +[[autodoc]] TFRemBertForQuestionAnswering + - call diff --git a/docs/source/en/model_doc/rembert.mdx b/docs/source/en/model_doc/rembert.mdx deleted file mode 100644 index 0edb8e5202d9..000000000000 --- a/docs/source/en/model_doc/rembert.mdx +++ /dev/null @@ -1,128 +0,0 @@ - - -# RemBERT - -## Overview - -The RemBERT model was proposed in [Rethinking Embedding Coupling in Pre-trained Language Models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder. - -The abstract from the paper is the following: - -*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art -pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to -significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By -reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on -standard natural language understanding tasks with the same number of parameters during fine-tuning. 
We also show that -allocating additional capacity to the output embedding provides benefits to the model that persist through the -fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger -output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage -Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these -findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the -number of parameters at the fine-tuning stage.* - -Tips: - -For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the -embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input -embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is -also similar to the Albert one rather than the BERT one. - -## RemBertConfig - -[[autodoc]] RemBertConfig - -## RemBertTokenizer - -[[autodoc]] RemBertTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RemBertTokenizerFast - -[[autodoc]] RemBertTokenizerFast - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RemBertModel - -[[autodoc]] RemBertModel - - forward - -## RemBertForCausalLM - -[[autodoc]] RemBertForCausalLM - - forward - -## RemBertForMaskedLM - -[[autodoc]] RemBertForMaskedLM - - forward - -## RemBertForSequenceClassification - -[[autodoc]] RemBertForSequenceClassification - - forward - -## RemBertForMultipleChoice - -[[autodoc]] RemBertForMultipleChoice - - forward - -## RemBertForTokenClassification - -[[autodoc]] RemBertForTokenClassification - - forward - -## RemBertForQuestionAnswering - -[[autodoc]] RemBertForQuestionAnswering - - forward - -## TFRemBertModel - -[[autodoc]] TFRemBertModel - - call - -## TFRemBertForMaskedLM - -[[autodoc]] TFRemBertForMaskedLM - - call - -## TFRemBertForCausalLM - -[[autodoc]] TFRemBertForCausalLM - - call - -## TFRemBertForSequenceClassification - -[[autodoc]] TFRemBertForSequenceClassification - - call - -## TFRemBertForMultipleChoice - -[[autodoc]] TFRemBertForMultipleChoice - - call - -## TFRemBertForTokenClassification - -[[autodoc]] TFRemBertForTokenClassification - - call - -## TFRemBertForQuestionAnswering - -[[autodoc]] TFRemBertForQuestionAnswering - - call diff --git a/docs/source/en/model_doc/resnet.md b/docs/source/en/model_doc/resnet.md new file mode 100644 index 000000000000..9bb36a776f16 --- /dev/null +++ b/docs/source/en/model_doc/resnet.md @@ -0,0 +1,87 @@ + + +# ResNet + +## Overview + +The ResNet model was proposed in [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. Our implementation follows the small changes made by [Nvidia](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/resnet_50_v1_5_for_pytorch), we apply the `stride=2` for downsampling in bottleneck's `3x3` conv and not in the first `1x1`. This is generally known as "ResNet v1.5". + +ResNet introduced residual connections, they allow to train networks with an unseen number of layers (up to 1000). ResNet won the 2015 ILSVRC & COCO competition, one important milestone in deep computer vision. 
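
As a quick illustration, the sketch below runs image classification with a pretrained checkpoint. It is only an example and assumes the `microsoft/resnet-50` checkpoint and the small `huggingface/cats-image` dataset are available; see the tips and resources further down for more on preprocessing with [`AutoImageProcessor`].

```python
import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, ResNetForImageClassification

# A single test image of a cat, used here purely for demonstration.
dataset = load_dataset("huggingface/cats-image", split="test")
image = dataset[0]["image"]

processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

inputs = processor(image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Map the highest-scoring logit back to its ImageNet label.
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```
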
+ +The abstract from the paper is the following: + +*Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. +The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.* + +Tips: + +- One can use [`AutoImageProcessor`] to prepare images for the model. + +The figure below illustrates the architecture of ResNet. Taken from the [original paper](https://arxiv.org/abs/1512.03385). + + + +This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/KaimingHe/deep-residual-networks). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ResNet. + + + +- [`ResNetForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## ResNetConfig + +[[autodoc]] ResNetConfig + + +## ResNetModel + +[[autodoc]] ResNetModel + - forward + + +## ResNetForImageClassification + +[[autodoc]] ResNetForImageClassification + - forward + + +## TFResNetModel + +[[autodoc]] TFResNetModel + - call + + +## TFResNetForImageClassification + +[[autodoc]] TFResNetForImageClassification + - call + +## FlaxResNetModel + +[[autodoc]] FlaxResNetModel + - __call__ + +## FlaxResNetForImageClassification + +[[autodoc]] FlaxResNetForImageClassification + - __call__ diff --git a/docs/source/en/model_doc/resnet.mdx b/docs/source/en/model_doc/resnet.mdx deleted file mode 100644 index ce1799e8d48a..000000000000 --- a/docs/source/en/model_doc/resnet.mdx +++ /dev/null @@ -1,62 +0,0 @@ - - -# ResNet - -## Overview - -The ResNet model was proposed in [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. 
Our implementation follows the small changes made by [Nvidia](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/resnet_50_v1_5_for_pytorch), we apply the `stride=2` for downsampling in bottleneck's `3x3` conv and not in the first `1x1`. This is generally known as "ResNet v1.5". - -ResNet introduced residual connections, they allow to train networks with an unseen number of layers (up to 1000). ResNet won the 2015 ILSVRC & COCO competition, one important milestone in deep computer vision. - -The abstract from the paper is the following: - -*Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. -The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.* - -Tips: - -- One can use [`AutoImageProcessor`] to prepare images for the model. - -The figure below illustrates the architecture of ResNet. Taken from the [original paper](https://arxiv.org/abs/1512.03385). - - - -This model was contributed by [Francesco](https://huggingface.co/Francesco). The TensorFlow version of this model was added by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/KaimingHe/deep-residual-networks). - -## ResNetConfig - -[[autodoc]] ResNetConfig - - -## ResNetModel - -[[autodoc]] ResNetModel - - forward - - -## ResNetForImageClassification - -[[autodoc]] ResNetForImageClassification - - forward - - -## TFResNetModel - -[[autodoc]] TFResNetModel - - call - - -## TFResNetForImageClassification - -[[autodoc]] TFResNetForImageClassification - - call diff --git a/docs/source/en/model_doc/retribert.md b/docs/source/en/model_doc/retribert.md new file mode 100644 index 000000000000..ab29ac966fe1 --- /dev/null +++ b/docs/source/en/model_doc/retribert.md @@ -0,0 +1,53 @@ + + +# RetriBERT + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. + + + +## Overview + +The RetriBERT model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form +Question Answering](https://yjernite.github.io/lfqa.html). 
RetriBERT is a small model that uses either a single or +pair of BERT encoders with lower-dimension projection for dense semantic indexing of text. + +This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be +found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation). + + +## RetriBertConfig + +[[autodoc]] RetriBertConfig + +## RetriBertTokenizer + +[[autodoc]] RetriBertTokenizer + +## RetriBertTokenizerFast + +[[autodoc]] RetriBertTokenizerFast + +## RetriBertModel + +[[autodoc]] RetriBertModel + - forward diff --git a/docs/source/en/model_doc/retribert.mdx b/docs/source/en/model_doc/retribert.mdx deleted file mode 100644 index e83fae32300d..000000000000 --- a/docs/source/en/model_doc/retribert.mdx +++ /dev/null @@ -1,40 +0,0 @@ - - -# RetriBERT - -## Overview - -The RetriBERT model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form -Question Answering](https://yjernite.github.io/lfqa.html). RetriBERT is a small model that uses either a single or -pair of BERT encoders with lower-dimension projection for dense semantic indexing of text. - -This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be -found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation). - - -## RetriBertConfig - -[[autodoc]] RetriBertConfig - -## RetriBertTokenizer - -[[autodoc]] RetriBertTokenizer - -## RetriBertTokenizerFast - -[[autodoc]] RetriBertTokenizerFast - -## RetriBertModel - -[[autodoc]] RetriBertModel - - forward diff --git a/docs/source/en/model_doc/roberta-prelayernorm.md b/docs/source/en/model_doc/roberta-prelayernorm.md new file mode 100644 index 000000000000..9822fd7af961 --- /dev/null +++ b/docs/source/en/model_doc/roberta-prelayernorm.md @@ -0,0 +1,152 @@ + + +# RoBERTa-PreLayerNorm + +## Overview + +The RoBERTa-PreLayerNorm model was proposed in [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +It is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). + +The abstract from the paper is the following: + +*fairseq is an open-source sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling, and other text generation tasks. The toolkit is based on PyTorch and supports distributed training across multiple GPUs and machines. We also support fast mixed-precision training and inference on modern GPUs.* + +Tips: + +- The implementation is the same as [Roberta](roberta) except instead of using _Add and Norm_ it does _Norm and Add_. _Add_ and _Norm_ refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). +- This is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). + +This model was contributed by [andreasmaden](https://huggingface.co/andreasmaden). +The original code can be found [here](https://github.com/princeton-nlp/DinkyTrain). 
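
Below is a minimal masked-language-modeling sketch. It is illustrative only; `andreasmadsen/efficient_mlm_m0.40` is used here as an example of a RoBERTa-PreLayerNorm checkpoint on the Hub, and any compatible checkpoint can be loaded the same way.

```python
from transformers import AutoTokenizer, RobertaPreLayerNormForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
model = RobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
logits = model(**inputs).logits

# Pick the most likely token for the masked position.
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_token_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_token_id))
```
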
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RobertaPreLayerNormConfig + +[[autodoc]] RobertaPreLayerNormConfig + +## RobertaPreLayerNormModel + +[[autodoc]] RobertaPreLayerNormModel + - forward + +## RobertaPreLayerNormForCausalLM + +[[autodoc]] RobertaPreLayerNormForCausalLM + - forward + +## RobertaPreLayerNormForMaskedLM + +[[autodoc]] RobertaPreLayerNormForMaskedLM + - forward + +## RobertaPreLayerNormForSequenceClassification + +[[autodoc]] RobertaPreLayerNormForSequenceClassification + - forward + +## RobertaPreLayerNormForMultipleChoice + +[[autodoc]] RobertaPreLayerNormForMultipleChoice + - forward + +## RobertaPreLayerNormForTokenClassification + +[[autodoc]] RobertaPreLayerNormForTokenClassification + - forward + +## RobertaPreLayerNormForQuestionAnswering + +[[autodoc]] RobertaPreLayerNormForQuestionAnswering + - forward + +## TFRobertaPreLayerNormModel + +[[autodoc]] TFRobertaPreLayerNormModel + - call + +## TFRobertaPreLayerNormForCausalLM + +[[autodoc]] TFRobertaPreLayerNormForCausalLM + - call + +## TFRobertaPreLayerNormForMaskedLM + +[[autodoc]] TFRobertaPreLayerNormForMaskedLM + - call + +## TFRobertaPreLayerNormForSequenceClassification + +[[autodoc]] TFRobertaPreLayerNormForSequenceClassification + - call + +## TFRobertaPreLayerNormForMultipleChoice + +[[autodoc]] TFRobertaPreLayerNormForMultipleChoice + - call + +## TFRobertaPreLayerNormForTokenClassification + +[[autodoc]] TFRobertaPreLayerNormForTokenClassification + - call + +## TFRobertaPreLayerNormForQuestionAnswering + +[[autodoc]] TFRobertaPreLayerNormForQuestionAnswering + - call + +## FlaxRobertaPreLayerNormModel + +[[autodoc]] FlaxRobertaPreLayerNormModel + - __call__ + +## FlaxRobertaPreLayerNormForCausalLM + +[[autodoc]] FlaxRobertaPreLayerNormForCausalLM + - __call__ + +## FlaxRobertaPreLayerNormForMaskedLM + +[[autodoc]] FlaxRobertaPreLayerNormForMaskedLM + - __call__ + +## FlaxRobertaPreLayerNormForSequenceClassification + +[[autodoc]] FlaxRobertaPreLayerNormForSequenceClassification + - __call__ + +## FlaxRobertaPreLayerNormForMultipleChoice + +[[autodoc]] FlaxRobertaPreLayerNormForMultipleChoice + - __call__ + +## FlaxRobertaPreLayerNormForTokenClassification + +[[autodoc]] FlaxRobertaPreLayerNormForTokenClassification + - __call__ + +## FlaxRobertaPreLayerNormForQuestionAnswering + +[[autodoc]] FlaxRobertaPreLayerNormForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/roberta-prelayernorm.mdx b/docs/source/en/model_doc/roberta-prelayernorm.mdx deleted file mode 100644 index a8fb2bb2b9aa..000000000000 --- a/docs/source/en/model_doc/roberta-prelayernorm.mdx +++ /dev/null @@ -1,140 +0,0 @@ - - -# RoBERTa-PreLayerNorm - -## Overview - -The RoBERTa-PreLayerNorm model was proposed in [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. -It is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). 
- -The abstract from the paper is the following: - -*fairseq is an open-source sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling, and other text generation tasks. The toolkit is based on PyTorch and supports distributed training across multiple GPUs and machines. We also support fast mixed-precision training and inference on modern GPUs.* - -Tips: - -- The implementation is the same as [Roberta](roberta) except instead of using _Add and Norm_ it does _Norm and Add_. _Add_ and _Norm_ refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). -- This is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/). - -This model was contributed by [andreasmaden](https://huggingface.co/andreasmaden). -The original code can be found [here](https://github.com/princeton-nlp/DinkyTrain). - - -## RobertaPreLayerNormConfig - -[[autodoc]] RobertaPreLayerNormConfig - -## RobertaPreLayerNormModel - -[[autodoc]] RobertaPreLayerNormModel - - forward - -## RobertaPreLayerNormForCausalLM - -[[autodoc]] RobertaPreLayerNormForCausalLM - - forward - -## RobertaPreLayerNormForMaskedLM - -[[autodoc]] RobertaPreLayerNormForMaskedLM - - forward - -## RobertaPreLayerNormForSequenceClassification - -[[autodoc]] RobertaPreLayerNormForSequenceClassification - - forward - -## RobertaPreLayerNormForMultipleChoice - -[[autodoc]] RobertaPreLayerNormForMultipleChoice - - forward - -## RobertaPreLayerNormForTokenClassification - -[[autodoc]] RobertaPreLayerNormForTokenClassification - - forward - -## RobertaPreLayerNormForQuestionAnswering - -[[autodoc]] RobertaPreLayerNormForQuestionAnswering - - forward - -## TFRobertaPreLayerNormModel - -[[autodoc]] TFRobertaPreLayerNormModel - - call - -## TFRobertaPreLayerNormForCausalLM - -[[autodoc]] TFRobertaPreLayerNormForCausalLM - - call - -## TFRobertaPreLayerNormForMaskedLM - -[[autodoc]] TFRobertaPreLayerNormForMaskedLM - - call - -## TFRobertaPreLayerNormForSequenceClassification - -[[autodoc]] TFRobertaPreLayerNormForSequenceClassification - - call - -## TFRobertaPreLayerNormForMultipleChoice - -[[autodoc]] TFRobertaPreLayerNormForMultipleChoice - - call - -## TFRobertaPreLayerNormForTokenClassification - -[[autodoc]] TFRobertaPreLayerNormForTokenClassification - - call - -## TFRobertaPreLayerNormForQuestionAnswering - -[[autodoc]] TFRobertaPreLayerNormForQuestionAnswering - - call - -## FlaxRobertaPreLayerNormModel - -[[autodoc]] FlaxRobertaPreLayerNormModel - - __call__ - -## FlaxRobertaPreLayerNormForCausalLM - -[[autodoc]] FlaxRobertaPreLayerNormForCausalLM - - __call__ - -## FlaxRobertaPreLayerNormForMaskedLM - -[[autodoc]] FlaxRobertaPreLayerNormForMaskedLM - - __call__ - -## FlaxRobertaPreLayerNormForSequenceClassification - -[[autodoc]] FlaxRobertaPreLayerNormForSequenceClassification - - __call__ - -## FlaxRobertaPreLayerNormForMultipleChoice - -[[autodoc]] FlaxRobertaPreLayerNormForMultipleChoice - - __call__ - -## FlaxRobertaPreLayerNormForTokenClassification - -[[autodoc]] FlaxRobertaPreLayerNormForTokenClassification - - __call__ - -## FlaxRobertaPreLayerNormForQuestionAnswering - -[[autodoc]] FlaxRobertaPreLayerNormForQuestionAnswering - - __call__ diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md new file mode 100644 index 000000000000..5a2ba6b5cf66 --- /dev/null +++ b/docs/source/en/model_doc/roberta.md @@ -0,0 +1,233 @@ + + +# 
RoBERTa + +
+ +Models + + +Spaces + + +Paper page + +
+
+## Overview
+
+The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
+Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.
+
+It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
+much larger mini-batches and learning rates.
+
+The abstract from the paper is the following:
+
+*Language model pretraining has led to significant performance gains but careful comparison between different
+approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
+and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
+study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
+training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every
+model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results
+highlight the importance of previously overlooked design choices, and raise questions about the source of recently
+reported improvements. We release our models and code.*
+
+Tips:
+
+- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup
+  for Roberta pretrained models.
+- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
+  different pretraining scheme.
+- RoBERTa doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
+  separate your segments with the separation token `tokenizer.sep_token` (or `</s>`).
+- Same as BERT with better pretraining tricks:
+
+    * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
+    * sentences are packed together to reach 512 tokens (so a single sample may span several documents)
+    * train with larger batches
+    * use BPE with bytes as a subunit and not characters (because of unicode characters)
+- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples.
+
+This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A blog on [Getting Started with Sentiment Analysis on Twitter](https://huggingface.co/blog/sentiment-analysis-twitter) using RoBERTa and the [Inference API](https://huggingface.co/inference-api).
+- A blog on [Opinion Classification with Kili and Hugging Face AutoTrain](https://huggingface.co/blog/opinion-classification-with-kili) using RoBERTa.
+- A notebook on how to [finetune RoBERTa for sentiment analysis](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb).
🌎 +- [`RobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb). +- [`TFRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). +- [`FlaxRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). +- [Text classification task guide](../tasks/sequence_classification) + + + +- [`RobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). +- [`TFRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). +- [`FlaxRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). +- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Token classification task guide](../tasks/token_classification) + + + +- A blog on [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train) with RoBERTa. +- [`RobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). +- [`TFRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). +- [`FlaxRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). +- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Masked language modeling task guide](../tasks/masked_language_modeling) + + + +- A blog on [Accelerated Inference with Optimum and Transformers Pipelines](https://huggingface.co/blog/optimum-inference) with RoBERTa for question answering. 
+- [`RobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). +- [`TFRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). +- [`FlaxRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). +- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Question answering task guide](../tasks/question_answering) + +**Multiple choice** +- [`RobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). +- [`TFRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). +- [Multiple choice task guide](../tasks/multiple_choice) + +## RobertaConfig + +[[autodoc]] RobertaConfig + +## RobertaTokenizer + +[[autodoc]] RobertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RobertaTokenizerFast + +[[autodoc]] RobertaTokenizerFast + - build_inputs_with_special_tokens + +## RobertaModel + +[[autodoc]] RobertaModel + - forward + +## RobertaForCausalLM + +[[autodoc]] RobertaForCausalLM + - forward + +## RobertaForMaskedLM + +[[autodoc]] RobertaForMaskedLM + - forward + +## RobertaForSequenceClassification + +[[autodoc]] RobertaForSequenceClassification + - forward + +## RobertaForMultipleChoice + +[[autodoc]] RobertaForMultipleChoice + - forward + +## RobertaForTokenClassification + +[[autodoc]] RobertaForTokenClassification + - forward + +## RobertaForQuestionAnswering + +[[autodoc]] RobertaForQuestionAnswering + - forward + +## TFRobertaModel + +[[autodoc]] TFRobertaModel + - call + +## TFRobertaForCausalLM + +[[autodoc]] TFRobertaForCausalLM + - call + +## TFRobertaForMaskedLM + +[[autodoc]] TFRobertaForMaskedLM + - call + +## TFRobertaForSequenceClassification + +[[autodoc]] TFRobertaForSequenceClassification + - call + +## TFRobertaForMultipleChoice + +[[autodoc]] TFRobertaForMultipleChoice + - call + +## TFRobertaForTokenClassification + +[[autodoc]] TFRobertaForTokenClassification + - call + +## TFRobertaForQuestionAnswering + +[[autodoc]] TFRobertaForQuestionAnswering + - call + +## FlaxRobertaModel + +[[autodoc]] FlaxRobertaModel + - __call__ + +## FlaxRobertaForCausalLM + +[[autodoc]] FlaxRobertaForCausalLM + - __call__ + +## FlaxRobertaForMaskedLM + +[[autodoc]] FlaxRobertaForMaskedLM + - __call__ + +## FlaxRobertaForSequenceClassification + +[[autodoc]] FlaxRobertaForSequenceClassification + - __call__ + +## FlaxRobertaForMultipleChoice + +[[autodoc]] FlaxRobertaForMultipleChoice + - __call__ + +## FlaxRobertaForTokenClassification + +[[autodoc]] FlaxRobertaForTokenClassification 
+ - __call__ + +## FlaxRobertaForQuestionAnswering + +[[autodoc]] FlaxRobertaForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/roberta.mdx b/docs/source/en/model_doc/roberta.mdx deleted file mode 100644 index 61f44381b0c4..000000000000 --- a/docs/source/en/model_doc/roberta.mdx +++ /dev/null @@ -1,206 +0,0 @@ - - -# RoBERTa - -## Overview - -The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer -Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. - -It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with -much larger mini-batches and learning rates. - -The abstract from the paper is the following: - -*Language model pretraining has led to significant performance gains but careful comparison between different -approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, -and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication -study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and -training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every -model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results -highlight the importance of previously overlooked design choices, and raise questions about the source of recently -reported improvements. We release our models and code.* - -Tips: - -- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup - for Roberta pretrained models. -- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a - different pretraining scheme. -- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just - separate your segments with the separation token `tokenizer.sep_token` (or ``) -- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples. - -This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A blog on [Getting Started with Sentiment Analysis on Twitter](https://huggingface.co/blog/sentiment-analysis-twitter) using RoBERTa and the [Inference API](https://huggingface.co/inference-api). -- A blog on [Opinion Classification with Kili and Hugging Face AutoTrain](https://huggingface.co/blog/opinion-classification-with-kili) using RoBERTa. -- A notebook on how to [finetune RoBERTa for sentiment analysis](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb). 
🌎 -- [`RobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb). -- [`TFRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). -- [`FlaxRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). - - - -- [`RobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). -- [`TFRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). -- [`FlaxRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). -- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- A blog on [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train) with RoBERTa. -- [`RobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). -- [`TFRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). -- [`FlaxRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). -- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- A blog on [Accelerated Inference with Optimum and Transformers Pipelines](https://huggingface.co/blog/optimum-inference) with RoBERTa for question answering. -- [`RobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). 
-- [`TFRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). -- [`FlaxRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). -- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. - -**Multiple choice** -- [`RobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). -- [`TFRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). - -## RobertaConfig - -[[autodoc]] RobertaConfig - -## RobertaTokenizer - -[[autodoc]] RobertaTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RobertaTokenizerFast - -[[autodoc]] RobertaTokenizerFast - - build_inputs_with_special_tokens - -## RobertaModel - -[[autodoc]] RobertaModel - - forward - -## RobertaForCausalLM - -[[autodoc]] RobertaForCausalLM - - forward - -## RobertaForMaskedLM - -[[autodoc]] RobertaForMaskedLM - - forward - -## RobertaForSequenceClassification - -[[autodoc]] RobertaForSequenceClassification - - forward - -## RobertaForMultipleChoice - -[[autodoc]] RobertaForMultipleChoice - - forward - -## RobertaForTokenClassification - -[[autodoc]] RobertaForTokenClassification - - forward - -## RobertaForQuestionAnswering - -[[autodoc]] RobertaForQuestionAnswering - - forward - -## TFRobertaModel - -[[autodoc]] TFRobertaModel - - call - -## TFRobertaForCausalLM - -[[autodoc]] TFRobertaForCausalLM - - call - -## TFRobertaForMaskedLM - -[[autodoc]] TFRobertaForMaskedLM - - call - -## TFRobertaForSequenceClassification - -[[autodoc]] TFRobertaForSequenceClassification - - call - -## TFRobertaForMultipleChoice - -[[autodoc]] TFRobertaForMultipleChoice - - call - -## TFRobertaForTokenClassification - -[[autodoc]] TFRobertaForTokenClassification - - call - -## TFRobertaForQuestionAnswering - -[[autodoc]] TFRobertaForQuestionAnswering - - call - -## FlaxRobertaModel - -[[autodoc]] FlaxRobertaModel - - __call__ - -## FlaxRobertaForCausalLM - -[[autodoc]] FlaxRobertaForCausalLM - - __call__ - -## FlaxRobertaForMaskedLM - -[[autodoc]] FlaxRobertaForMaskedLM - - __call__ - -## FlaxRobertaForSequenceClassification - -[[autodoc]] FlaxRobertaForSequenceClassification - - __call__ - -## FlaxRobertaForMultipleChoice - -[[autodoc]] FlaxRobertaForMultipleChoice - - __call__ - -## FlaxRobertaForTokenClassification - -[[autodoc]] FlaxRobertaForTokenClassification - - __call__ - -## FlaxRobertaForQuestionAnswering - -[[autodoc]] FlaxRobertaForQuestionAnswering - - __call__ diff --git a/docs/source/en/model_doc/roc_bert.md b/docs/source/en/model_doc/roc_bert.md new file mode 100644 index 000000000000..831c656fb817 --- /dev/null +++ b/docs/source/en/model_doc/roc_bert.md @@ -0,0 +1,106 @@ + + +# RoCBert + +## Overview + +The RoCBert model was proposed in [RoCBert: 
Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +It's a pretrained Chinese language model that is robust under various forms of adversarial attacks. + +The abstract from the paper is the following: + +*Large-scale pretrained language models have achieved SOTA results on NLP tasks. However, they have been shown +vulnerable to adversarial attacks especially for logographic languages like Chinese. In this work, we propose +ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation, +synonyms, typos, etc. It is pretrained with the contrastive learning objective which maximizes the label consistency +under different synthesized adversarial examples. The model takes as input multimodal information including the +semantic, phonetic and visual features. We show all these features are important to the model robustness since the +attack can be performed in all the three forms. Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under +three blackbox adversarial algorithms without sacrificing the performance on clean testset. It also performs the best +in the toxic content detection task under human-made attacks.* + +This model was contributed by [weiweishi](https://huggingface.co/weiweishi). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RoCBertConfig + +[[autodoc]] RoCBertConfig + - all + + +## RoCBertTokenizer + +[[autodoc]] RoCBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + + +## RoCBertModel + +[[autodoc]] RoCBertModel + - forward + + +## RoCBertForPreTraining + +[[autodoc]] RoCBertForPreTraining + - forward + + +## RoCBertForCausalLM + +[[autodoc]] RoCBertForCausalLM + - forward + + +## RoCBertForMaskedLM + +[[autodoc]] RoCBertForMaskedLM + - forward + + +## RoCBertForSequenceClassification + +[[autodoc]] transformers.RoCBertForSequenceClassification + - forward + +## RoCBertForMultipleChoice + +[[autodoc]] transformers.RoCBertForMultipleChoice + - forward + + +## RoCBertForTokenClassification + +[[autodoc]] transformers.RoCBertForTokenClassification + - forward + + +## RoCBertForQuestionAnswering + +[[autodoc]] RoCBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/roc_bert.mdx b/docs/source/en/model_doc/roc_bert.mdx deleted file mode 100644 index c30ccfd1c523..000000000000 --- a/docs/source/en/model_doc/roc_bert.mdx +++ /dev/null @@ -1,93 +0,0 @@ - - -# RoCBert - -## Overview - -The RoCBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. -It's a pretrained Chinese language model that is robust under various forms of adversarial attacks. - -The abstract from the paper is the following: - -*Large-scale pretrained language models have achieved SOTA results on NLP tasks. 
However, they have been shown -vulnerable to adversarial attacks especially for logographic languages like Chinese. In this work, we propose -ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation, -synonyms, typos, etc. It is pretrained with the contrastive learning objective which maximizes the label consistency -under different synthesized adversarial examples. The model takes as input multimodal information including the -semantic, phonetic and visual features. We show all these features are important to the model robustness since the -attack can be performed in all the three forms. Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under -three blackbox adversarial algorithms without sacrificing the performance on clean testset. It also performs the best -in the toxic content detection task under human-made attacks.* - -This model was contributed by [weiweishi](https://huggingface.co/weiweishi). - -## RoCBertConfig - -[[autodoc]] RoCBertConfig - - all - - -## RoCBertTokenizer - -[[autodoc]] RoCBertTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - - -## RoCBertModel - -[[autodoc]] RoCBertModel - - forward - - -## RoCBertForPreTraining - -[[autodoc]] RoCBertForPreTraining - - forward - - -## RoCBertForCausalLM - -[[autodoc]] RoCBertForCausalLM - - forward - - -## RoCBertForMaskedLM - -[[autodoc]] RoCBertForMaskedLM - - forward - - -## RoCBertForSequenceClassification - -[[autodoc]] transformers.RoCBertForSequenceClassification - - forward - -## RoCBertForMultipleChoice - -[[autodoc]] transformers.RoCBertForMultipleChoice - - forward - - -## RoCBertForTokenClassification - -[[autodoc]] transformers.RoCBertForTokenClassification - - forward - - -## RoCBertForQuestionAnswering - -[[autodoc]] RoCBertForQuestionAnswering - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/roformer.md b/docs/source/en/model_doc/roformer.md new file mode 100644 index 000000000000..f15a1062965f --- /dev/null +++ b/docs/source/en/model_doc/roformer.md @@ -0,0 +1,168 @@ + + +# RoFormer + +## Overview + +The RoFormer model was proposed in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. + +The abstract from the paper is the following: + +*Position encoding in transformer architecture provides supervision for dependency modeling between elements at +different positions in the sequence. We investigate various methods to encode positional information in +transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The +proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative +position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of +being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and +capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced +transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We +release the theoretical analysis along with some preliminary experiment results on Chinese data. 
The undergoing +experiment for English benchmark will soon be updated.* + +Tips: + +- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown + improved performance on classification tasks with long texts. + + +This model was contributed by [junnyu](https://huggingface.co/junnyu). The original code can be found [here](https://github.com/ZhuiyiTechnology/roformer). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## RoFormerConfig + +[[autodoc]] RoFormerConfig + +## RoFormerTokenizer + +[[autodoc]] RoFormerTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## RoFormerTokenizerFast + +[[autodoc]] RoFormerTokenizerFast + - build_inputs_with_special_tokens + +## RoFormerModel + +[[autodoc]] RoFormerModel + - forward + +## RoFormerForCausalLM + +[[autodoc]] RoFormerForCausalLM + - forward + +## RoFormerForMaskedLM + +[[autodoc]] RoFormerForMaskedLM + - forward + +## RoFormerForSequenceClassification + +[[autodoc]] RoFormerForSequenceClassification + - forward + +## RoFormerForMultipleChoice + +[[autodoc]] RoFormerForMultipleChoice + - forward + +## RoFormerForTokenClassification + +[[autodoc]] RoFormerForTokenClassification + - forward + +## RoFormerForQuestionAnswering + +[[autodoc]] RoFormerForQuestionAnswering + - forward + +## TFRoFormerModel + +[[autodoc]] TFRoFormerModel + - call + +## TFRoFormerForMaskedLM + +[[autodoc]] TFRoFormerForMaskedLM + - call + +## TFRoFormerForCausalLM + +[[autodoc]] TFRoFormerForCausalLM + - call + +## TFRoFormerForSequenceClassification + +[[autodoc]] TFRoFormerForSequenceClassification + - call + +## TFRoFormerForMultipleChoice + +[[autodoc]] TFRoFormerForMultipleChoice + - call + +## TFRoFormerForTokenClassification + +[[autodoc]] TFRoFormerForTokenClassification + - call + +## TFRoFormerForQuestionAnswering + +[[autodoc]] TFRoFormerForQuestionAnswering + - call + +## FlaxRoFormerModel + +[[autodoc]] FlaxRoFormerModel + - __call__ + +## FlaxRoFormerForMaskedLM + +[[autodoc]] FlaxRoFormerForMaskedLM + - __call__ + +## FlaxRoFormerForSequenceClassification + +[[autodoc]] FlaxRoFormerForSequenceClassification + - __call__ + +## FlaxRoFormerForMultipleChoice + +[[autodoc]] FlaxRoFormerForMultipleChoice + - __call__ + +## FlaxRoFormerForTokenClassification + +[[autodoc]] FlaxRoFormerForTokenClassification + - __call__ + +## FlaxRoFormerForQuestionAnswering + +[[autodoc]] FlaxRoFormerForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/roformer.mdx b/docs/source/en/model_doc/roformer.mdx deleted file mode 100644 index 435941d9f29a..000000000000 --- a/docs/source/en/model_doc/roformer.mdx +++ /dev/null @@ -1,155 +0,0 @@ - - -# RoFormer - -## Overview - -The RoFormer model was proposed in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. 
- -The abstract from the paper is the following: - -*Position encoding in transformer architecture provides supervision for dependency modeling between elements at -different positions in the sequence. We investigate various methods to encode positional information in -transformer-based language models and propose a novel implementation named Rotary Position Embedding(RoPE). The -proposed RoPE encodes absolute positional information with rotation matrix and naturally incorporates explicit relative -position dependency in self-attention formulation. Notably, RoPE comes with valuable properties such as flexibility of -being expand to any sequence lengths, decaying inter-token dependency with increasing relative distances, and -capability of equipping the linear self-attention with relative position encoding. As a result, the enhanced -transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. We -release the theoretical analysis along with some preliminary experiment results on Chinese data. The undergoing -experiment for English benchmark will soon be updated.* - -Tips: - -- RoFormer is a BERT-like autoencoding model with rotary position embeddings. Rotary position embeddings have shown - improved performance on classification tasks with long texts. - - -This model was contributed by [junnyu](https://huggingface.co/junnyu). The original code can be found [here](https://github.com/ZhuiyiTechnology/roformer). - -## RoFormerConfig - -[[autodoc]] RoFormerConfig - -## RoFormerTokenizer - -[[autodoc]] RoFormerTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## RoFormerTokenizerFast - -[[autodoc]] RoFormerTokenizerFast - - build_inputs_with_special_tokens - -## RoFormerModel - -[[autodoc]] RoFormerModel - - forward - -## RoFormerForCausalLM - -[[autodoc]] RoFormerForCausalLM - - forward - -## RoFormerForMaskedLM - -[[autodoc]] RoFormerForMaskedLM - - forward - -## RoFormerForSequenceClassification - -[[autodoc]] RoFormerForSequenceClassification - - forward - -## RoFormerForMultipleChoice - -[[autodoc]] RoFormerForMultipleChoice - - forward - -## RoFormerForTokenClassification - -[[autodoc]] RoFormerForTokenClassification - - forward - -## RoFormerForQuestionAnswering - -[[autodoc]] RoFormerForQuestionAnswering - - forward - -## TFRoFormerModel - -[[autodoc]] TFRoFormerModel - - call - -## TFRoFormerForMaskedLM - -[[autodoc]] TFRoFormerForMaskedLM - - call - -## TFRoFormerForCausalLM - -[[autodoc]] TFRoFormerForCausalLM - - call - -## TFRoFormerForSequenceClassification - -[[autodoc]] TFRoFormerForSequenceClassification - - call - -## TFRoFormerForMultipleChoice - -[[autodoc]] TFRoFormerForMultipleChoice - - call - -## TFRoFormerForTokenClassification - -[[autodoc]] TFRoFormerForTokenClassification - - call - -## TFRoFormerForQuestionAnswering - -[[autodoc]] TFRoFormerForQuestionAnswering - - call - -## FlaxRoFormerModel - -[[autodoc]] FlaxRoFormerModel - - __call__ - -## FlaxRoFormerForMaskedLM - -[[autodoc]] FlaxRoFormerForMaskedLM - - __call__ - -## FlaxRoFormerForSequenceClassification - -[[autodoc]] FlaxRoFormerForSequenceClassification - - __call__ - -## FlaxRoFormerForMultipleChoice - -[[autodoc]] FlaxRoFormerForMultipleChoice - - __call__ - -## FlaxRoFormerForTokenClassification - -[[autodoc]] FlaxRoFormerForTokenClassification - - __call__ - -## FlaxRoFormerForQuestionAnswering - -[[autodoc]] FlaxRoFormerForQuestionAnswering - - __call__ diff 
--git a/docs/source/en/model_doc/rwkv.md b/docs/source/en/model_doc/rwkv.md new file mode 100644 index 000000000000..9293db14cc63 --- /dev/null +++ b/docs/source/en/model_doc/rwkv.md @@ -0,0 +1,151 @@
+
+
+# RWKV
+
+## Overview
+
+The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM).
+
+It suggests a tweak in the traditional Transformer attention to make it linear. This way, the model can be used as a recurrent network: passing inputs for timestamp 0 and timestamp 1 together is the same as passing inputs at timestamp 0, then inputs at timestamp 1 along with the state of timestamp 0 (see example below).
+
+This can be more efficient than a regular Transformer and can deal with sentences of any length (even if the model uses a fixed context length for training).
+
+This model was contributed by [sgugger](https://huggingface.co/sgugger).
+The original code can be found [here](https://github.com/BlinkDL/RWKV-LM).
+
+Example of use as an RNN:
+
+```py
+import torch
+from transformers import AutoTokenizer, RwkvConfig, RwkvModel
+
+model = RwkvModel.from_pretrained("sgugger/rwkv-430M-pile")
+tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
+
+inputs = tokenizer("This is an example.", return_tensors="pt")
+# Feed everything to the model
+outputs = model(inputs["input_ids"])
+output_whole = outputs.last_hidden_state
+
+outputs = model(inputs["input_ids"][:, :2])
+output_one = outputs.last_hidden_state
+
+# Using the state computed on the first inputs, we will get the same output
+outputs = model(inputs["input_ids"][:, 2:], state=outputs.state)
+output_two = outputs.last_hidden_state
+
+torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)
+```
+
+If you want to make sure the model stops generating when `'\n\n'` is detected, we recommend using the following stopping criteria:
+
+```python
+import torch
+from transformers import StoppingCriteria
+
+class RwkvStoppingCriteria(StoppingCriteria):
+    def __init__(self, eos_sequence = [187,187], eos_token_id = 537):
+        self.eos_sequence = eos_sequence
+        self.eos_token_id = eos_token_id
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        # Stop once the last two generated ids form the eos sequence (the token ids for `'\n\n'`).
+        last_2_ids = input_ids[:,-2:].tolist()
+        return self.eos_sequence in last_2_ids
+
+
+output = model.generate(inputs["input_ids"], max_new_tokens=64, stopping_criteria = [RwkvStoppingCriteria()])
+```
+
+## RwkvConfig
+
+[[autodoc]] RwkvConfig
+
+
+## RwkvModel
+
+[[autodoc]] RwkvModel
+    - forward
+
+## RwkvLMHeadModel
+
+[[autodoc]] RwkvForCausalLM
+    - forward
+
+## Rwkv attention and the recurrent formulas
+
+In a traditional auto-regressive Transformer, attention is written as
+
+$$O = \hbox{softmax}(QK^{T} / \sqrt{d}) V$$
+
+where \\(Q\\), \\(K\\) and \\(V\\) are matrices of shape `seq_len x hidden_size` named query, key and value (they are actually bigger matrices with a batch dimension and an attention head dimension but we're only interested in the last two, which is where the matrix product is taken, so for the sake of simplicity we only consider those two). The product \\(QK^{T}\\) then has shape `seq_len x seq_len` and we can take the matrix product with \\(V\\) to get the output \\(O\\) of the same shape as the others.
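+
+For reference, this masked attention can be written directly in a few lines of PyTorch. This is only an illustrative sketch of the standard formulation above (not the fused kernel RWKV actually uses); the derivation below then rewrites the softmax explicitly.
+
+```python
+import math
+import torch
+
+def causal_attention(q, k, v):
+    """Plain masked attention O = softmax(QK^T / sqrt(d)) V for (seq_len, hidden_size) tensors."""
+    d = q.size(-1)
+    scores = q @ k.transpose(-2, -1) / math.sqrt(d)  # (seq_len, seq_len)
+    # Mask out positions j > i so a token cannot attend to future tokens.
+    future = torch.triu(torch.ones_like(scores, dtype=torch.bool), diagonal=1)
+    scores = scores.masked_fill(future, float("-inf"))
+    return torch.softmax(scores, dim=-1) @ v  # (seq_len, hidden_size)
+```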
+
+Replacing the softmax by its value gives:
+
+$$O_{i} = \frac{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}} V_{j}}{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}}}$$
+
+Note that the entries in \\(QK^{T}\\) corresponding to \\(j > i\\) are masked (the sum stops at \\(j = i\\)) because the attention is not allowed to look at future tokens (only past ones).
+
+In comparison, the RWKV attention is given by
+
+$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$
+
+where \\(R\\) is a new matrix called receptance by the author, and \\(K\\) and \\(V\\) are still the key and value (\\(\sigma\\) here is the sigmoid function). \\(W\\) is a new vector that represents the position of the token and is given by
+
+$$W_{0} = u \hbox{ and } W_{k} = (k-1)w \hbox{ for } k \geq 1$$
+
+with \\(u\\) and \\(w\\) learnable parameters called in the code `time_first` and `time_decay` respectively. The numerator and denominator can both be expressed recursively. Naming them \\(N_{i}\\) and \\(D_{i}\\), we have:
+
+$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{ where } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} + \cdots + e^{(i-2)w + K_{1}} V_{1}$$
+
+so \\(\hat{N}_{i}\\) (called `numerator_state` in the code) satisfies
+
+$$\hat{N}_{0} = 0 \hbox{ and } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$
+
+and
+
+$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{ where } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} + \cdots + e^{(i-2)w + K_{1}}$$
+
+so \\(\hat{D}_{i}\\) (called `denominator_state` in the code) satisfies
+
+$$\hat{D}_{0} = 0 \hbox{ and } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$
+
+The actual recurrent formulas used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers. Usually the softmax is not computed as is; instead, the numerator and denominator are both divided by the exponential of the maximum term:
+
+$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$
+
+with \\(M\\) the maximum of all \\(x_{j}\\). So here, on top of saving the numerator state (\\(\hat{N}\\)) and the denominator state (\\(\hat{D}\\)), we also keep track of the maximum of all terms encountered in the exponentials. So we actually use
+
+$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{ and } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$
+
+defined by the following recurrent formulas:
+
+$$\tilde{N}_{0} = 0 \hbox{ and } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$
+
+and
+
+$$\tilde{D}_{0} = 0 \hbox{ and } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$
+
+and \\(M_{j+1} = q\\).
With those, we can then compute
+
+$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i} - q} \tilde{N}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$
+
+and
+
+$$D_{i} = e^{u + K_{i} - q} + e^{M_{i} - q} \tilde{D}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$
+
+which finally gives us
+
+$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$
\ No newline at end of file
diff --git a/docs/source/en/model_doc/sam.md b/docs/source/en/model_doc/sam.md new file mode 100644 index 000000000000..fe0d24623fd7 --- /dev/null +++ b/docs/source/en/model_doc/sam.md @@ -0,0 +1,111 @@
+
+
+# SAM
+
+## Overview
+
+SAM (Segment Anything Model) was proposed in [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
+
+The model can be used to predict segmentation masks of any object of interest given an input image.
+
+![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
+
+The abstract from the paper is the following:
+
+*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.*
+
+Tips:
+
+- The model predicts binary masks that state the presence or absence of the object of interest given an image.
+- The model predicts much better results if input 2D points and/or input bounding boxes are provided.
+- You can prompt multiple points for the same image, and predict a single mask.
+- Fine-tuning the model is not supported yet.
+- According to the paper, textual input should also be supported. However, at the time of writing this does not appear to be supported, according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844).
+
+
+This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/facebookresearch/segment-anything).
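+
+In addition to the point-prompted usage shown below, whole-image automatic mask generation can be run through the `mask-generation` pipeline. The sketch below assumes a recent version of Transformers that ships this pipeline task and reuses the `facebook/sam-vit-huge` checkpoint from the example that follows.
+
+```python
+from transformers import pipeline
+
+# Automatic mask generation: SAM is prompted with a grid of points over the whole image.
+generator = pipeline("mask-generation", model="facebook/sam-vit-huge")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+outputs = generator(img_url, points_per_batch=64)
+
+print(len(outputs["masks"]), "masks generated")
+```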
+ +Below is an example on how to run mask generation given an image and a 2D point: + +```python +import torch +from PIL import Image +import requests +from transformers import SamModel, SamProcessor + +device = "cuda" if torch.cuda.is_available() else "cpu" +model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device) +processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") + +img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") +input_points = [[[450, 600]]] # 2D location of a window in the image + +inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device) +outputs = model(**inputs) + +masks = processor.image_processor.post_process_masks( + outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu() +) +scores = outputs.iou_scores +``` + +Resources: + +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model. +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using the automatic mask generation pipeline. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Run_inference_with_MedSAM_using_HuggingFace_Transformers.ipynb) for inference with MedSAM, a fine-tuned version of SAM on the medical domain. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb) for fine-tuning the model on custom data. + +## SamConfig + +[[autodoc]] SamConfig + +## SamVisionConfig + +[[autodoc]] SamVisionConfig + +## SamMaskDecoderConfig + +[[autodoc]] SamMaskDecoderConfig + +## SamPromptEncoderConfig + +[[autodoc]] SamPromptEncoderConfig + + +## SamProcessor + +[[autodoc]] SamProcessor + + +## SamImageProcessor + +[[autodoc]] SamImageProcessor + + +## SamModel + +[[autodoc]] SamModel + - forward + + +## TFSamModel + +[[autodoc]] TFSamModel + - call \ No newline at end of file diff --git a/docs/source/en/model_doc/segformer.md b/docs/source/en/model_doc/segformer.md new file mode 100644 index 000000000000..0f535351af5c --- /dev/null +++ b/docs/source/en/model_doc/segformer.md @@ -0,0 +1,164 @@ + + +# SegFormer + +## Overview + +The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping +Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great +results on image segmentation benchmarks such as ADE20K and Cityscapes. + +The abstract from the paper is the following: + +*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with +lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel +hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding, +thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution +differs from training. 2) SegFormer avoids complex decoders. 
The proposed MLP decoder aggregates information from +different layers, and thus combining both local attention and global attention to render powerful representations. We +show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our +approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance +and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters, +being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on +Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.* + +The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://arxiv.org/abs/2105.15203). + + + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version +of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). + +Tips: + +- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. + [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to + as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on + top to perform semantic segmentation of images. In addition, there's + [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The + authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw + away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on + ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be + found on the [hub](https://huggingface.co/models?other=segformer). +- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and + fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. +- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning. +- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers) + to try out a SegFormer model on custom images. +- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. +- One can use [`SegformerImageProcessor`] to prepare images and corresponding segmentation maps + for the model. Note that this image processor is fairly basic and does not include all data augmentations used in + the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most + important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size, + such as 512x512 or 640x640, after which they are normalized. 
+
+- One additional thing to keep in mind is that one can initialize [`SegformerImageProcessor`] with
+  `reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated
+  segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels.
+  Therefore, `reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
+  background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function
+  used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as
+  background class and include this class as part of all labels. In that case, `reduce_labels` should be set to
+  `False`, as loss should also be computed for the background class.
+- Like most models, SegFormer comes in different sizes, the details of which can be found in the table below
+  (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)).
+
+| **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
+| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: |
+| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 |
+| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 |
+| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 |
+| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 |
+| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 |
+| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 |
+
+Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For
+SegFormer's results on the segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SegFormer.
+
+
+- [`SegformerForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
+- [Image classification task guide](../tasks/image_classification)
+
+Semantic segmentation:
+
+- [`SegformerForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation).
+- A blog on fine-tuning SegFormer on a custom dataset can be found [here](https://huggingface.co/blog/fine-tune-segformer).
+- More demo notebooks on SegFormer (both inference + fine-tuning on a custom dataset) can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer).
+- [`TFSegformerForSemanticSegmentation`] is supported by this [example notebook](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb).
+- [Semantic segmentation task guide](../tasks/semantic_segmentation)
+
+If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
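+
+To tie the tips above together, below is a minimal semantic segmentation inference sketch. The checkpoint name and the test image URL are illustrative assumptions; any fine-tuned SegFormer segmentation checkpoint from the hub should work the same way:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
+
+# assumed checkpoint (fine-tuned on ADE20k); substitute your own SegFormer checkpoint
+checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"
+image_processor = SegformerImageProcessor.from_pretrained(checkpoint)
+model = SegformerForSemanticSegmentation.from_pretrained(checkpoint)
+
+# any RGB image works; this URL is just an example
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = image_processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# upsample the low-resolution logits back to the image size and take the per-pixel argmax
+segmentation = image_processor.post_process_semantic_segmentation(
+    outputs, target_sizes=[image.size[::-1]]
+)[0]
+print(segmentation.shape)  # (height, width) map of predicted class indices
+```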
+ +## SegformerConfig + +[[autodoc]] SegformerConfig + +## SegformerFeatureExtractor + +[[autodoc]] SegformerFeatureExtractor + - __call__ + - post_process_semantic_segmentation + +## SegformerImageProcessor + +[[autodoc]] SegformerImageProcessor + - preprocess + - post_process_semantic_segmentation + +## SegformerModel + +[[autodoc]] SegformerModel + - forward + +## SegformerDecodeHead + +[[autodoc]] SegformerDecodeHead + - forward + +## SegformerForImageClassification + +[[autodoc]] SegformerForImageClassification + - forward + +## SegformerForSemanticSegmentation + +[[autodoc]] SegformerForSemanticSegmentation + - forward + +## TFSegformerDecodeHead + +[[autodoc]] TFSegformerDecodeHead + - call + +## TFSegformerModel + +[[autodoc]] TFSegformerModel + - call + +## TFSegformerForImageClassification + +[[autodoc]] TFSegformerForImageClassification + - call + +## TFSegformerForSemanticSegmentation + +[[autodoc]] TFSegformerForSemanticSegmentation + - call diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx deleted file mode 100644 index 76a02c27f423..000000000000 --- a/docs/source/en/model_doc/segformer.mdx +++ /dev/null @@ -1,142 +0,0 @@ - - -# SegFormer - -## Overview - -The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping -Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great -results on image segmentation benchmarks such as ADE20K and Cityscapes. - -The abstract from the paper is the following: - -*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with -lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel -hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding, -thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution -differs from training. 2) SegFormer avoids complex decoders. The proposed MLP decoder aggregates information from -different layers, and thus combining both local attention and global attention to render powerful representations. We -show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our -approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance -and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters, -being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on -Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.* - -The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://arxiv.org/abs/2105.15203). - - - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version -of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). - -Tips: - -- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. 
- [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to - as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on - top to perform semantic segmentation of images. In addition, there's - [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The - authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw - away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on - ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be - found on the [hub](https://huggingface.co/models?other=segformer). -- The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and - fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. -- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning. -- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers) - to try out a SegFormer model on custom images. -- SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. -- One can use [`SegformerImageProcessor`] to prepare images and corresponding segmentation maps - for the model. Note that this image processor is fairly basic and does not include all data augmentations used in - the original paper. The original preprocessing pipelines (for the ADE20k dataset for instance) can be found [here](https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py). The most - important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size, - such as 512x512 or 640x640, after which they are normalized. -- One additional thing to keep in mind is that one can initialize [`SegformerImageProcessor`] with - `reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated - segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels. - Therefore, `reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the - background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function - used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as - background class and include this class as part of all labels. In that case, `reduce_labels` should be set to - `False`, as loss should also be computed for the background class. -- As most models, SegFormer comes in different sizes, the details of which can be found in the table below - (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)). 
- -| **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** | -| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: | -| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 | -| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 | -| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 | -| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 | -| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 | -| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 | - -Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For -SegFormer's results on the segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203). - - -## SegformerConfig - -[[autodoc]] SegformerConfig - -## SegformerFeatureExtractor - -[[autodoc]] SegformerFeatureExtractor - - __call__ - - post_process_semantic_segmentation - -## SegformerImageProcessor - -[[autodoc]] SegformerImageProcessor - - preprocess - - post_process_semantic_segmentation - -## SegformerModel - -[[autodoc]] SegformerModel - - forward - -## SegformerDecodeHead - -[[autodoc]] SegformerDecodeHead - - forward - -## SegformerForImageClassification - -[[autodoc]] SegformerForImageClassification - - forward - -## SegformerForSemanticSegmentation - -[[autodoc]] SegformerForSemanticSegmentation - - forward - -## TFSegformerDecodeHead - -[[autodoc]] TFSegformerDecodeHead - - call - -## TFSegformerModel - -[[autodoc]] TFSegformerModel - - call - -## TFSegformerForImageClassification - -[[autodoc]] TFSegformerForImageClassification - - call - -## TFSegformerForSemanticSegmentation - -[[autodoc]] TFSegformerForSemanticSegmentation - - call diff --git a/docs/source/en/model_doc/sew-d.md b/docs/source/en/model_doc/sew-d.md new file mode 100644 index 000000000000..b70c59061b57 --- /dev/null +++ b/docs/source/en/model_doc/sew-d.md @@ -0,0 +1,65 @@ + + +# SEW-D + +## Overview + +SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in [Performance-Efficiency Trade-offs +in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, +Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. + +The abstract from the paper is the following: + +*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition +(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance +and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a +pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a +variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x +inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference +time, SEW reduces word error rate by 25-50% across different model sizes.* + +Tips: + +- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [`Wav2Vec2CTCTokenizer`]. 
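+
+As a minimal sketch of that decoding workflow (the checkpoint name below is an assumption used for illustration; any `SEWDForCTC` checkpoint fine-tuned for ASR should work the same way):
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, SEWDForCTC
+
+# assumed fine-tuned ASR checkpoint; substitute your own SEW-D CTC checkpoint
+checkpoint = "asapp/sew-d-tiny-100k-ft-ls100h"
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = SEWDForCTC.from_pretrained(checkpoint)
+
+# load a 16 kHz speech sample and turn the raw waveform into model inputs
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding: argmax over the vocabulary, then the tokenizer collapses repeats and blanks
+predicted_ids = torch.argmax(logits, dim=-1)
+transcription = processor.batch_decode(predicted_ids)
+print(transcription[0])
+```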
+ +This model was contributed by [anton-l](https://huggingface.co/anton-l). + +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## SEWDConfig + +[[autodoc]] SEWDConfig + +## SEWDModel + +[[autodoc]] SEWDModel + - forward + +## SEWDForCTC + +[[autodoc]] SEWDForCTC + - forward + +## SEWDForSequenceClassification + +[[autodoc]] SEWDForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/sew-d.mdx b/docs/source/en/model_doc/sew-d.mdx deleted file mode 100644 index ceeb4f1ec35f..000000000000 --- a/docs/source/en/model_doc/sew-d.mdx +++ /dev/null @@ -1,57 +0,0 @@ - - -# SEW-D - -## Overview - -SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in [Performance-Efficiency Trade-offs -in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, -Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. - -The abstract from the paper is the following: - -*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition -(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance -and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a -pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a -variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x -inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference -time, SEW reduces word error rate by 25-50% across different model sizes.* - -Tips: - -- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded - using [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [anton-l](https://huggingface.co/anton-l). - - -## SEWDConfig - -[[autodoc]] SEWDConfig - -## SEWDModel - -[[autodoc]] SEWDModel - - forward - -## SEWDForCTC - -[[autodoc]] SEWDForCTC - - forward - -## SEWDForSequenceClassification - -[[autodoc]] SEWDForSequenceClassification - - forward diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md new file mode 100644 index 000000000000..ebf128ea429f --- /dev/null +++ b/docs/source/en/model_doc/sew.md @@ -0,0 +1,65 @@ + + +# SEW + +## Overview + +SEW (Squeezed and Efficient Wav2Vec) was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training +for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. +Weinberger, Yoav Artzi. + +The abstract from the paper is the following: + +*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition +(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance +and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a +pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a +variety of training setups. 
For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x +inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference +time, SEW reduces word error rate by 25-50% across different model sizes.* + +Tips: + +- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using + [`Wav2Vec2CTCTokenizer`]. + +This model was contributed by [anton-l](https://huggingface.co/anton-l). + +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## SEWConfig + +[[autodoc]] SEWConfig + +## SEWModel + +[[autodoc]] SEWModel + - forward + +## SEWForCTC + +[[autodoc]] SEWForCTC + - forward + +## SEWForSequenceClassification + +[[autodoc]] SEWForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/sew.mdx b/docs/source/en/model_doc/sew.mdx deleted file mode 100644 index dce949a856b3..000000000000 --- a/docs/source/en/model_doc/sew.mdx +++ /dev/null @@ -1,57 +0,0 @@ - - -# SEW - -## Overview - -SEW (Squeezed and Efficient Wav2Vec) was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training -for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. -Weinberger, Yoav Artzi. - -The abstract from the paper is the following: - -*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition -(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance -and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a -pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a -variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x -inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference -time, SEW reduces word error rate by 25-50% across different model sizes.* - -Tips: - -- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using - [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [anton-l](https://huggingface.co/anton-l). - - -## SEWConfig - -[[autodoc]] SEWConfig - -## SEWModel - -[[autodoc]] SEWModel - - forward - -## SEWForCTC - -[[autodoc]] SEWForCTC - - forward - -## SEWForSequenceClassification - -[[autodoc]] SEWForSequenceClassification - - forward diff --git a/docs/source/en/model_doc/speech-encoder-decoder.md b/docs/source/en/model_doc/speech-encoder-decoder.md new file mode 100644 index 000000000000..b036f27e1865 --- /dev/null +++ b/docs/source/en/model_doc/speech-encoder-decoder.md @@ -0,0 +1,132 @@ + + +# Speech Encoder Decoder Models + +The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model +with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and any pretrained autoregressive model as the decoder. 
+
+The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
+recognition and speech translation has *e.g.* been shown in [Large-Scale Self- and Semi-Supervised Learning for Speech
+Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
+Alexis Conneau.
+
+An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in [Speech2Text2](speech_to_text_2).
+
+## Randomly initializing `SpeechEncoderDecoderModel` from model configurations.
+
+[`SpeechEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`Wav2Vec2Model`] configuration for the encoder
+and the default [`BertForCausalLM`] configuration for the decoder.
+
+```python
+>>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
+
+>>> config_encoder = Wav2Vec2Config()
+>>> config_decoder = BertConfig()
+
+>>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+>>> model = SpeechEncoderDecoderModel(config=config)
+```
+
+## Initializing `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
+
+[`SpeechEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based speech model, *e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert) can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder.
+Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
+Initializing [`SpeechEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
+To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecoderModel.from_encoder_decoder_pretrained`] method.
+
+```python
+>>> from transformers import SpeechEncoderDecoderModel
+
+>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
+...     "facebook/hubert-large-ll60k", "bert-base-uncased"
+... )
+```
+
+## Loading an existing `SpeechEncoderDecoderModel` checkpoint and performing inference.
+
+To load fine-tuned checkpoints of the `SpeechEncoderDecoderModel` class, [`SpeechEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
+
+To perform inference, one uses the [`generate`] method, which allows you to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.
+ +```python +>>> from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel +>>> from datasets import load_dataset +>>> import torch + +>>> # load a fine-tuned speech translation model and corresponding processor +>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") +>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") + +>>> # let's perform inference on a piece of English speech (which we'll translate to German) +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values + +>>> # autoregressively generate transcription (uses greedy decoding by default) +>>> generated_ids = model.generate(input_values) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +>>> print(generated_text) +Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können. +``` + +## Training + +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (speech, text) pairs. +As you can see, only 2 inputs are required for the model in order to compute a loss: `input_values` (which are the +speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence). + +```python +>>> from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel +>>> from datasets import load_dataset + +>>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder +>>> decoder_id = "bert-base-uncased" # text decoder + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id) +>>> tokenizer = AutoTokenizer.from_pretrained(decoder_id) +>>> # Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model +>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id) + +>>> model.config.decoder_start_token_id = tokenizer.cls_token_id +>>> model.config.pad_token_id = tokenizer.pad_token_id + +>>> # load an audio input and pre-process (normalise mean/std to 0/1) +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values + +>>> # load its corresponding transcription and tokenize to generate labels +>>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids + +>>> # the forward function automatically creates the correct decoder_input_ids +>>> loss = model(input_values=input_values, labels=labels).loss +>>> loss.backward() +``` + +## SpeechEncoderDecoderConfig + +[[autodoc]] SpeechEncoderDecoderConfig + +## SpeechEncoderDecoderModel + +[[autodoc]] SpeechEncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + +## FlaxSpeechEncoderDecoderModel + +[[autodoc]] FlaxSpeechEncoderDecoderModel + - __call__ + - from_encoder_decoder_pretrained diff --git a/docs/source/en/model_doc/speech-encoder-decoder.mdx b/docs/source/en/model_doc/speech-encoder-decoder.mdx deleted file mode 100644 index b0718a27a88c..000000000000 --- a/docs/source/en/model_doc/speech-encoder-decoder.mdx +++ /dev/null @@ -1,128 +0,0 @@ - - -# Speech Encoder Decoder Models - -The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model -with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and 
any pretrained autoregressive model as the decoder. - -The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech -recognition and speech translation has *e.g.* been shown in [Large-Scale Self- and Semi-Supervised Learning for Speech -Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, -Alexis Conneau. - -An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in [Speech2Text2](speech_to_text_2). - -## Randomly initializing `SpeechEncoderDecoderModel` from model configurations. - -[`SpeechEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`Wav2Vec2Model`] configuration for the encoder -and the default [`BertForCausalLM`] configuration for the decoder. - -```python ->>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel - ->>> config_encoder = Wav2Vec2Config() ->>> config_decoder = BertConfig() - ->>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) ->>> model = SpeechEncoderDecoderModel(config=config) -``` - -## Initialising `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder. - -[`SpeechEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based speech model, *e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert) can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder. -Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. -Initializing [`SpeechEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder). -To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecoderModel.from_encoder_decoder_pretrained`] method. - -```python ->>> from transformers import SpeechEncoderDecoderModel - ->>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( -... "facebook/hubert-large-ll60k", "bert-base-uncased" -... ) -``` - -## Loading an existing `SpeechEncoderDecoderModel` checkpoint and perform inference. - -To load fine-tuned checkpoints of the `SpeechEncoderDecoderModel` class, [`SpeechEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers. - -To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. 
- -```python ->>> from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel ->>> from datasets import load_dataset ->>> import torch - ->>> # load a fine-tuned speech translation model and corresponding processor ->>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") ->>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") - ->>> # let's perform inference on a piece of English speech (which we'll translate to German) ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values - ->>> # autoregressively generate transcription (uses greedy decoding by default) ->>> generated_ids = model.generate(input_values) ->>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ->>> print(generated_text) -Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können. -``` - -## Training - -Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (speech, text) pairs. -As you can see, only 2 inputs are required for the model in order to compute a loss: `input_values` (which are the -speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence). - -```python ->>> from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel ->>> from datasets import load_dataset - ->>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder ->>> decoder_id = "bert-base-uncased" # text decoder - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id) ->>> tokenizer = AutoTokenizer.from_pretrained(decoder_id) ->>> # Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model ->>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id) - ->>> model.config.decoder_start_token_id = tokenizer.cls_token_id ->>> model.config.pad_token_id = tokenizer.pad_token_id - ->>> # load an audio input and pre-process (normalise mean/std to 0/1) ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values - ->>> # load its corresponding transcription and tokenize to generate labels ->>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(**input_features).loss ->>> loss.backward() -``` - -## SpeechEncoderDecoderConfig - -[[autodoc]] SpeechEncoderDecoderConfig - -## SpeechEncoderDecoderModel - -[[autodoc]] SpeechEncoderDecoderModel - - forward - - from_encoder_decoder_pretrained - -## FlaxSpeechEncoderDecoderModel - -[[autodoc]] FlaxSpeechEncoderDecoderModel - - __call__ - - from_encoder_decoder_pretrained \ No newline at end of file diff --git a/docs/source/en/model_doc/speech_to_text.md b/docs/source/en/model_doc/speech_to_text.md new file mode 100644 index 000000000000..cb13a1871ae6 --- /dev/null +++ b/docs/source/en/model_doc/speech_to_text.md @@ -0,0 +1,146 @@ + + +# Speech2Text + +## Overview + +The Speech2Text model was proposed in [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. 
It's a
+transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
+Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
+fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
+transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
+[LibriSpeech](http://www.openslr.org/12), [CoVoST 2](https://github.com/facebookresearch/covost), [MuST-C](https://ict.fbk.eu/must-c/).
+
+This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text).
+
+
+## Inference
+
+Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
+signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
+`generate()` method can be used for inference.
+
+The [`Speech2TextFeatureExtractor`] class is responsible for extracting the log-mel filter-bank
+features. The [`Speech2TextProcessor`] wraps [`Speech2TextFeatureExtractor`] and
+[`Speech2TextTokenizer`] into a single instance to both extract the input features and decode the
+predicted token ids.
+
+The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to
+install those packages before running the examples. You could either install those as extra speech dependencies with
+`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
+be installed as follows: `apt install libsndfile1-dev`
+
+
+- ASR and Speech Translation
+
+```python
+>>> import torch
+>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
+>>> from datasets import load_dataset
+
+>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
+>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+
+>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
+>>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
+
+>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> transcription
+['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel']
+```
+
+- Multilingual speech translation
+
+  For multilingual speech translation models, `eos_token_id` is used as the `decoder_start_token_id` and
+  the target language id is forced as the first generated token. To force the target language id as the first
+  generated token, pass the `forced_bos_token_id` parameter to the `generate()` method. The following
+  example shows how to translate English speech to French text using the *facebook/s2t-medium-mustc-multilingual-st*
+  checkpoint.
+ +```python +>>> import torch +>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration +>>> from datasets import load_dataset + +>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") +>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") + +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + +>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") +>>> generated_ids = model.generate( +... inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], +... ) + +>>> translation = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> translation +["(Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] +``` + +See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. + + +## Speech2TextConfig + +[[autodoc]] Speech2TextConfig + +## Speech2TextTokenizer + +[[autodoc]] Speech2TextTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## Speech2TextFeatureExtractor + +[[autodoc]] Speech2TextFeatureExtractor + - __call__ + +## Speech2TextProcessor + +[[autodoc]] Speech2TextProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## Speech2TextModel + +[[autodoc]] Speech2TextModel + - forward + +## Speech2TextForConditionalGeneration + +[[autodoc]] Speech2TextForConditionalGeneration + - forward + +## TFSpeech2TextModel + +[[autodoc]] TFSpeech2TextModel + - call + +## TFSpeech2TextForConditionalGeneration + +[[autodoc]] TFSpeech2TextForConditionalGeneration + - call diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx deleted file mode 100644 index 95efc5504ff8..000000000000 --- a/docs/source/en/model_doc/speech_to_text.mdx +++ /dev/null @@ -1,142 +0,0 @@ - - -# Speech2Text - -## Overview - -The Speech2Text model was proposed in [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a -transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech -Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are -fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the -transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST: -[LibriSpeech](http://www.openslr.org/12), [CoVoST 2](https://github.com/facebookresearch/covost), [MuST-C](https://ict.fbk.eu/must-c/). - -This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text). - - -## Inference - -Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech -signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The -`generate()` method can be used for inference. 
- -The [`Speech2TextFeatureExtractor`] class is responsible for extracting the log-mel filter-bank -features. The [`Speech2TextProcessor`] wraps [`Speech2TextFeatureExtractor`] and -[`Speech2TextTokenizer`] into a single instance to both extract the input features and decode the -predicted token ids. - -The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to -install those packages before running the examples. You could either install those as extra speech dependencies with -`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can -be installed as follows: `apt install libsndfile1-dev` - - -- ASR and Speech Translation - -```python ->>> import torch ->>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration ->>> from datasets import load_dataset - ->>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") ->>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") - - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") - ->>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") ->>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"]) - ->>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True) ->>> transcription -['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'] -``` - -- Multilingual speech translation - - For multilingual speech translation models, `eos_token_id` is used as the `decoder_start_token_id` and - the target language id is forced as the first generated token. To force the target language id as the first - generated token, pass the `forced_bos_token_id` parameter to the `generate()` method. The following - example shows how to transate English speech to French text using the *facebook/s2t-medium-mustc-multilingual-st* - checkpoint. - -```python ->>> import torch ->>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration ->>> from datasets import load_dataset - ->>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") ->>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") - ->>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") ->>> generated_ids = model.generate( -... inputs["input_features"], -... attention_mask=inputs["attention_mask"], -... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], -... ) - ->>> translation = processor.batch_decode(generated_ids, skip_special_tokens=True) ->>> translation -["(Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] -``` - -See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints. 
- - -## Speech2TextConfig - -[[autodoc]] Speech2TextConfig - -## Speech2TextTokenizer - -[[autodoc]] Speech2TextTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## Speech2TextFeatureExtractor - -[[autodoc]] Speech2TextFeatureExtractor - - __call__ - -## Speech2TextProcessor - -[[autodoc]] Speech2TextProcessor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## Speech2TextModel - -[[autodoc]] Speech2TextModel - - forward - -## Speech2TextForConditionalGeneration - -[[autodoc]] Speech2TextForConditionalGeneration - - forward - -## TFSpeech2TextModel - -[[autodoc]] TFSpeech2TextModel - - call - -## TFSpeech2TextForConditionalGeneration - -[[autodoc]] TFSpeech2TextForConditionalGeneration - - call diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md new file mode 100644 index 000000000000..1abdeced580e --- /dev/null +++ b/docs/source/en/model_doc/speech_to_text_2.md @@ -0,0 +1,128 @@ + + +# Speech2Text2 + +## Overview + +The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in +[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by +Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. + +Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as +[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the +[SpeechEncoderDecoder](speech-encoder-decoder) class on how to combine Speech2Text2 with any speech *encoder-only* +model. + +This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten). + +The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266). + + +Tips: + +- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see + the [official models](https://huggingface.co/models?other=speech2text2) . +- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework. +- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE). + +## Inference + +Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and +makes use of [`~generation.GenerationMixin.generate`] to translate the input speech +autoregressively to the target language. + +The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and +[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The +[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and +[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the +predicted token ids. + +- Step-by-step Speech Translation + +```python +>>> import torch +>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel +>>> from datasets import load_dataset +>>> import soundfile as sf + +>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") +>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") + + +>>> def map_to_array(batch): +... speech, _ = sf.read(batch["file"]) +... batch["speech"] = speech +... 
return batch
+
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = ds.map(map_to_array)
+
+>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"])
+
+>>> transcription = processor.batch_decode(generated_ids)
+```
+
+- Speech Translation via Pipelines
+
+  The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
+
+```python
+>>> from datasets import load_dataset
+>>> from transformers import pipeline
+
+>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> asr = pipeline(
+...     "automatic-speech-recognition",
+...     model="facebook/s2t-wav2vec2-large-en-de",
+...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
+... )
+
+>>> translation_de = asr(librispeech_en[0]["file"])
+```
+
+See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.
+
+## Documentation resources
+
+- [Causal language modeling task guide](../tasks/language_modeling)
+
+## Speech2Text2Config
+
+[[autodoc]] Speech2Text2Config
+
+## Speech2Text2Tokenizer
+
+[[autodoc]] Speech2Text2Tokenizer
+    - batch_decode
+    - decode
+    - save_vocabulary
+
+## Speech2Text2Processor
+
+[[autodoc]] Speech2Text2Processor
+    - __call__
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+
+## Speech2Text2ForCausalLM
+
+[[autodoc]] Speech2Text2ForCausalLM
+    - forward
diff --git a/docs/source/en/model_doc/speech_to_text_2.mdx b/docs/source/en/model_doc/speech_to_text_2.mdx
deleted file mode 100644
index 2e3ebc3f390a..000000000000
--- a/docs/source/en/model_doc/speech_to_text_2.mdx
+++ /dev/null
@@ -1,121 +0,0 @@
-
-
-# Speech2Text2
-
-## Overview
-
-The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
-[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by
-Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-
-Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
-[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the
-[SpeechEncoderDecoder](speech-encoder-decoder) class on how to combine Speech2Text2 with any speech *encoder-only*
-model.
-
-This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten).
-
-The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266).
-
-
-Tips:
-
-- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
-  the [official models](https://huggingface.co/models?other=speech2text2) .
-- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework.
-- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE).
-
-## Inference
-
-Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and
-makes use of [`~generation.GenerationMixin.generate`] to translate the input speech
-autoregressively to the target language.
- -The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and -[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The -[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and -[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the -predicted token ids. - -- Step-by-step Speech Translation - -```python ->>> import torch ->>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel ->>> from datasets import load_dataset ->>> import soundfile as sf - ->>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") ->>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") - - ->>> def map_to_array(batch): -... speech, _ = sf.read(batch["file"]) -... batch["speech"] = speech -... return batch - - ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> ds = ds.map(map_to_array) - ->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") ->>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"]) - ->>> transcription = processor.batch_decode(generated_ids) -``` - -- Speech Translation via Pipelines - - The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code - -```python ->>> from datasets import load_dataset ->>> from transformers import pipeline - ->>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> asr = pipeline( -... "automatic-speech-recognition", -... model="facebook/s2t-wav2vec2-large-en-de", -... feature_extractor="facebook/s2t-wav2vec2-large-en-de", -... ) - ->>> translation_de = asr(librispeech_en[0]["file"]) -``` - -See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints. - - -## Speech2Text2Config - -[[autodoc]] Speech2Text2Config - -## Speech2TextTokenizer - -[[autodoc]] Speech2Text2Tokenizer - - batch_decode - - decode - - save_vocabulary - -## Speech2Text2Processor - -[[autodoc]] Speech2Text2Processor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## Speech2Text2ForCausalLM - -[[autodoc]] Speech2Text2ForCausalLM - - forward diff --git a/docs/source/en/model_doc/speecht5.md b/docs/source/en/model_doc/speecht5.md new file mode 100644 index 000000000000..4d5e2098a542 --- /dev/null +++ b/docs/source/en/model_doc/speecht5.md @@ -0,0 +1,85 @@ + + +# SpeechT5 + +## Overview + +The SpeechT5 model was proposed in [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. + +The abstract from the paper is the following: + +*Motivated by the success of T5 (Text-To-Text Transfer Transformer) in pre-trained natural language processing models, we propose a unified-modal SpeechT5 framework that explores the encoder-decoder pre-training for self-supervised speech/text representation learning. The SpeechT5 framework consists of a shared encoder-decoder network and six modal-specific (speech/text) pre/post-nets. 
After preprocessing the input speech/text through the pre-nets, the shared encoder-decoder network models the sequence-to-sequence transformation, and then the post-nets generate the output in the speech/text modality based on the output of the decoder. Leveraging large-scale unlabeled speech and text data, we pre-train SpeechT5 to learn a unified-modal representation, hoping to improve the modeling capability for both speech and text. To align the textual and speech information into this unified semantic space, we propose a cross-modal vector quantization approach that randomly mixes up speech/text states with latent units as the interface between encoder and decoder. Extensive evaluations show the superiority of the proposed SpeechT5 framework on a wide variety of spoken language processing tasks, including automatic speech recognition, speech synthesis, speech translation, voice conversion, speech enhancement, and speaker identification.* + +This model was contributed by [Matthijs](https://huggingface.co/Matthijs). The original code can be found [here](https://github.com/microsoft/SpeechT5). + +## SpeechT5Config + +[[autodoc]] SpeechT5Config + +## SpeechT5HifiGanConfig + +[[autodoc]] SpeechT5HifiGanConfig + +## SpeechT5Tokenizer + +[[autodoc]] SpeechT5Tokenizer + - __call__ + - save_vocabulary + - decode + - batch_decode + +## SpeechT5FeatureExtractor + +[[autodoc]] SpeechT5FeatureExtractor + - __call__ + +## SpeechT5Processor + +[[autodoc]] SpeechT5Processor + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## SpeechT5Model + +[[autodoc]] SpeechT5Model + - forward + +## SpeechT5ForSpeechToText + +[[autodoc]] SpeechT5ForSpeechToText + - forward + +## SpeechT5ForTextToSpeech + +[[autodoc]] SpeechT5ForTextToSpeech + - forward + - generate + +## SpeechT5ForSpeechToSpeech + +[[autodoc]] SpeechT5ForSpeechToSpeech + - forward + - generate_speech + +## SpeechT5HifiGan + +[[autodoc]] SpeechT5HifiGan + - forward diff --git a/docs/source/en/model_doc/splinter.md b/docs/source/en/model_doc/splinter.md new file mode 100644 index 000000000000..f16169d9b218 --- /dev/null +++ b/docs/source/en/model_doc/splinter.md @@ -0,0 +1,87 @@ + + +# Splinter + +## Overview + +The Splinter model was proposed in [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter +is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus +comprising Wikipedia and the Toronto Book Corpus. + +The abstract from the paper is the following: + +In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order +of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred +training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between +current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question +answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all +recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans +are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select +the answer span. 
The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD +with only 128 training examples), while maintaining competitive performance in the high-resource setting. + +Tips: + +- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize + to question representations which are used to predict the answers. This layer is called QASS, and is the default + behaviour in the [`SplinterForQuestionAnswering`] class. Therefore: +- Use [`SplinterTokenizer`] (rather than [`BertTokenizer`]), as it already + contains this special token. Also, its default behavior is to use this token when two sequences are given (for + example, in the *run_qa.py* script). +- If you plan on using Splinter outside *run_qa.py*, please keep in mind the question token - it might be important for + the success of your model, especially in a few-shot setting. +- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that + one also has the pretrained weights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one + doesn't (*tau/splinter-base* and *tau/splinter-large*). This is done to support randomly initializing this layer at + fine-tuning, as it is shown to yield better results for some cases in the paper. + +This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter). + +## Documentation resources + +- [Question answering task guide](../tasks/question-answering) + +## SplinterConfig + +[[autodoc]] SplinterConfig + +## SplinterTokenizer + +[[autodoc]] SplinterTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## SplinterTokenizerFast + +[[autodoc]] SplinterTokenizerFast + +## SplinterModel + +[[autodoc]] SplinterModel + - forward + +## SplinterForQuestionAnswering + +[[autodoc]] SplinterForQuestionAnswering + - forward + +## SplinterForPreTraining + +[[autodoc]] SplinterForPreTraining + - forward diff --git a/docs/source/en/model_doc/splinter.mdx b/docs/source/en/model_doc/splinter.mdx deleted file mode 100644 index 55e5f61b8d0b..000000000000 --- a/docs/source/en/model_doc/splinter.mdx +++ /dev/null @@ -1,79 +0,0 @@ - - -# Splinter - -## Overview - -The Splinter model was proposed in [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter -is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus -comprising Wikipedia and the Toronto Book Corpus. - -The abstract from the paper is the following: - -In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order -of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred -training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between -current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question -answering: recurring span selection. 
Given a passage with multiple sets of recurring spans, we mask in each set all -recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans -are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select -the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD -with only 128 training examples), while maintaining competitive performance in the high-resource setting. - -Tips: - -- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize - to question representations which are used to predict the answers. This layer is called QASS, and is the default - behaviour in the [`SplinterForQuestionAnswering`] class. Therefore: -- Use [`SplinterTokenizer`] (rather than [`BertTokenizer`]), as it already - contains this special token. Also, its default behavior is to use this token when two sequences are given (for - example, in the *run_qa.py* script). -- If you plan on using Splinter outside *run_qa.py*, please keep in mind the question token - it might be important for - the success of your model, especially in a few-shot setting. -- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that - one also has the pretrained weights of the QASS layer (*tau/splinter-base-qass* and *tau/splinter-large-qass*) and one - doesn't (*tau/splinter-base* and *tau/splinter-large*). This is done to support randomly initializing this layer at - fine-tuning, as it is shown to yield better results for some cases in the paper. - -This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirstain) and [oriram](https://huggingface.co/oriram). The original code can be found [here](https://github.com/oriram/splinter). - -## SplinterConfig - -[[autodoc]] SplinterConfig - -## SplinterTokenizer - -[[autodoc]] SplinterTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## SplinterTokenizerFast - -[[autodoc]] SplinterTokenizerFast - -## SplinterModel - -[[autodoc]] SplinterModel - - forward - -## SplinterForQuestionAnswering - -[[autodoc]] SplinterForQuestionAnswering - - forward - -## SplinterForPreTraining - -[[autodoc]] SplinterForPreTraining - - forward diff --git a/docs/source/en/model_doc/squeezebert.md b/docs/source/en/model_doc/squeezebert.md new file mode 100644 index 000000000000..515a2ef31781 --- /dev/null +++ b/docs/source/en/model_doc/squeezebert.md @@ -0,0 +1,99 @@ + + +# SqueezeBERT + +## Overview + +The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a +bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the +SqueezeBERT architecture is that SqueezeBERT uses [grouped convolutions](https://blog.yani.io/filter-group-tutorial) +instead of fully-connected layers for the Q, K, V and FFN layers. + +The abstract from the paper is the following: + +*Humans read and write hundreds of billions of messages every day. 
Further, due to the availability of large datasets, +large computing systems, and better neural network models, natural language processing (NLP) technology has made +significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant +opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we +consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's +highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with +BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods +such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these +techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in +self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called +SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test +set. The SqueezeBERT code will be released.* + +Tips: + +- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. +- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore + efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained + with a causal language modeling (CLM) objective are better in that regard. +- For best results when finetuning on sequence classification tasks, it is recommended to start with the + *squeezebert/squeezebert-mnli-headless* checkpoint. + +This model was contributed by [forresti](https://huggingface.co/forresti). 
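+
+As a minimal sketch of the tip above (the checkpoint name comes from the tip; the example sentence and `num_labels=2` are placeholders for a hypothetical binary task), loading the headless checkpoint for sequence classification finetuning might look like this:
+
+```python
+>>> from transformers import AutoTokenizer, SqueezeBertForSequenceClassification
+
+>>> tokenizer = AutoTokenizer.from_pretrained("squeezebert/squeezebert-mnli-headless")
+>>> # the "headless" checkpoint ships without a task head, so a new classification head is randomly initialized
+>>> model = SqueezeBertForSequenceClassification.from_pretrained(
+...     "squeezebert/squeezebert-mnli-headless", num_labels=2
+... )
+
+>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+>>> logits = model(**inputs).logits
+```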
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## SqueezeBertConfig + +[[autodoc]] SqueezeBertConfig + +## SqueezeBertTokenizer + +[[autodoc]] SqueezeBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## SqueezeBertTokenizerFast + +[[autodoc]] SqueezeBertTokenizerFast + +## SqueezeBertModel + +[[autodoc]] SqueezeBertModel + +## SqueezeBertForMaskedLM + +[[autodoc]] SqueezeBertForMaskedLM + +## SqueezeBertForSequenceClassification + +[[autodoc]] SqueezeBertForSequenceClassification + +## SqueezeBertForMultipleChoice + +[[autodoc]] SqueezeBertForMultipleChoice + +## SqueezeBertForTokenClassification + +[[autodoc]] SqueezeBertForTokenClassification + +## SqueezeBertForQuestionAnswering + +[[autodoc]] SqueezeBertForQuestionAnswering diff --git a/docs/source/en/model_doc/squeezebert.mdx b/docs/source/en/model_doc/squeezebert.mdx deleted file mode 100644 index c6219582c838..000000000000 --- a/docs/source/en/model_doc/squeezebert.mdx +++ /dev/null @@ -1,88 +0,0 @@ - - -# SqueezeBERT - -## Overview - -The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a -bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the -SqueezeBERT architecture is that SqueezeBERT uses [grouped convolutions](https://blog.yani.io/filter-group-tutorial) -instead of fully-connected layers for the Q, K, V and FFN layers. - -The abstract from the paper is the following: - -*Humans read and write hundreds of billions of messages every day. Further, due to the availability of large datasets, -large computing systems, and better neural network models, natural language processing (NLP) technology has made -significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant -opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we -consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's -highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with -BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods -such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these -techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in -self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called -SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test -set. The SqueezeBERT code will be released.* - -Tips: - -- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right - rather than the left. 
-- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore - efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained - with a causal language modeling (CLM) objective are better in that regard. -- For best results when finetuning on sequence classification tasks, it is recommended to start with the - *squeezebert/squeezebert-mnli-headless* checkpoint. - -This model was contributed by [forresti](https://huggingface.co/forresti). - - -## SqueezeBertConfig - -[[autodoc]] SqueezeBertConfig - -## SqueezeBertTokenizer - -[[autodoc]] SqueezeBertTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## SqueezeBertTokenizerFast - -[[autodoc]] SqueezeBertTokenizerFast - -## SqueezeBertModel - -[[autodoc]] SqueezeBertModel - -## SqueezeBertForMaskedLM - -[[autodoc]] SqueezeBertForMaskedLM - -## SqueezeBertForSequenceClassification - -[[autodoc]] SqueezeBertForSequenceClassification - -## SqueezeBertForMultipleChoice - -[[autodoc]] SqueezeBertForMultipleChoice - -## SqueezeBertForTokenClassification - -[[autodoc]] SqueezeBertForTokenClassification - -## SqueezeBertForQuestionAnswering - -[[autodoc]] SqueezeBertForQuestionAnswering diff --git a/docs/source/en/model_doc/swiftformer.md b/docs/source/en/model_doc/swiftformer.md new file mode 100644 index 000000000000..67c9597d2123 --- /dev/null +++ b/docs/source/en/model_doc/swiftformer.md @@ -0,0 +1,49 @@ + + +# SwiftFormer + +## Overview + +The SwiftFormer model was proposed in [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. + +The SwiftFormer paper introduces a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations in the self-attention computation with linear element-wise multiplications. A series of models called 'SwiftFormer' is built based on this, which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Even their small variant achieves 78.5% top-1 ImageNet1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2× faster compared to MobileViT-v2. + +The abstract from the paper is the following: + +*Self-attention has become a defacto choice for capturing global context in various vision applications. However, its quadratic computational complexity with respect to image resolution limits its use in real-time applications, especially for deployment on resource-constrained mobile devices. Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. 
Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2.* + +Tips: + - One can use the [`ViTImageProcessor`] API to prepare images for the model. + + +This model was contributed by [shehan97](https://huggingface.co/shehan97). +The original code can be found [here](https://github.com/Amshaker/SwiftFormer). + + +## SwiftFormerConfig + +[[autodoc]] SwiftFormerConfig + +## SwiftFormerModel + +[[autodoc]] SwiftFormerModel + - forward + +## SwiftFormerForImageClassification + +[[autodoc]] SwiftFormerForImageClassification + - forward diff --git a/docs/source/en/model_doc/swin.md b/docs/source/en/model_doc/swin.md new file mode 100644 index 000000000000..37bb86db951a --- /dev/null +++ b/docs/source/en/model_doc/swin.md @@ -0,0 +1,100 @@ + + +# Swin Transformer + +## Overview + +The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. + +The abstract from the paper is the following: + +*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone +for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, +such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. +To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted +\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping +local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at +various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it +compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense +prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation +(53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and ++2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. +The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.* + +Tips: +- One can use the [`AutoImageProcessor`] API to prepare images for the model. +- Swin pads the inputs supporting any input height and width (if divisible by `32`). +- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. + + + + Swin Transformer architecture. Taken from the original paper. + +This model was contributed by [novice03](https://huggingface.co/novice03). 
The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). + + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Swin Transformer. + + + +- [`SwinForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +Besides that: + +- [`SwinForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## SwinConfig + +[[autodoc]] SwinConfig + + +## SwinModel + +[[autodoc]] SwinModel + - forward + +## SwinForMaskedImageModeling + +[[autodoc]] SwinForMaskedImageModeling + - forward + +## SwinForImageClassification + +[[autodoc]] transformers.SwinForImageClassification + - forward + +## TFSwinModel + +[[autodoc]] TFSwinModel + - call + +## TFSwinForMaskedImageModeling + +[[autodoc]] TFSwinForMaskedImageModeling + - call + +## TFSwinForImageClassification + +[[autodoc]] transformers.TFSwinForImageClassification + - call diff --git a/docs/source/en/model_doc/swin.mdx b/docs/source/en/model_doc/swin.mdx deleted file mode 100644 index 503a141084a0..000000000000 --- a/docs/source/en/model_doc/swin.mdx +++ /dev/null @@ -1,81 +0,0 @@ - - -# Swin Transformer - -## Overview - -The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) -by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. - -The abstract from the paper is the following: - -*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone -for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, -such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. -To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted -\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping -local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at -various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it -compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense -prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation -(53.5 mIoU on ADE20K val). 
Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and -+2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. -The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.* - -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. -- Swin pads the inputs supporting any input height and width (if divisible by `32`). -- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`. - - - - Swin Transformer architecture. Taken from the original paper. - -This model was contributed by [novice03](https://huggingface.co/novice03). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer). - - -## SwinConfig - -[[autodoc]] SwinConfig - - -## SwinModel - -[[autodoc]] SwinModel - - forward - -## SwinForMaskedImageModeling - -[[autodoc]] SwinForMaskedImageModeling - - forward - -## SwinForImageClassification - -[[autodoc]] transformers.SwinForImageClassification - - forward - -## TFSwinModel - -[[autodoc]] TFSwinModel - - call - -## TFSwinForMaskedImageModeling - -[[autodoc]] TFSwinForMaskedImageModeling - - call - -## TFSwinForImageClassification - -[[autodoc]] transformers.TFSwinForImageClassification - - call diff --git a/docs/source/en/model_doc/swin2sr.md b/docs/source/en/model_doc/swin2sr.md new file mode 100644 index 000000000000..dfee144e50c4 --- /dev/null +++ b/docs/source/en/model_doc/swin2sr.md @@ -0,0 +1,61 @@ + + +# Swin2SR + +## Overview + +The Swin2SR model was proposed in [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +Swin2R improves the [SwinIR](https://github.com/JingyunLiang/SwinIR/) model by incorporating [Swin Transformer v2](swinv2) layers which mitigates issues such as training instability, resolution gaps between pre-training +and fine-tuning, and hunger on data. + +The abstract from the paper is the following: + +*Compression plays an important role on the efficient transmission and storage of images and videos through band-limited systems such as streaming services, virtual reality or videogames. However, compression unavoidably leads to artifacts and the loss of the original information, which may severely degrade the visual quality. For these reasons, quality enhancement of compressed images has become a popular research topic. While most state-of-the-art image restoration methods are based on convolutional neural networks, other transformers-based methods such as SwinIR, show impressive performance on these tasks. +In this paper, we explore the novel Swin Transformer V2, to improve SwinIR for image super-resolution, and in particular, the compressed input scenario. Using this method we can tackle the major issues in training transformer vision models, such as training instability, resolution gaps between pre-training and fine-tuning, and hunger on data. 
We conduct experiments on three representative tasks: JPEG compression artifacts removal, image super-resolution (classical and lightweight), and compressed image super-resolution. Experimental results demonstrate that our method, Swin2SR, can improve the training convergence and performance of SwinIR, and is a top-5 solution at the "AIM 2022 Challenge on Super-Resolution of Compressed Image and Video".* + + + + Swin2SR architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/mv-lab/swin2sr). + +## Resources + +Demo notebooks for Swin2SR can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Swin2SR). + +A demo Space for image super-resolution with SwinSR can be found [here](https://huggingface.co/spaces/jjourney1125/swin2sr). + +## Swin2SRImageProcessor + +[[autodoc]] Swin2SRImageProcessor + - preprocess + +## Swin2SRConfig + +[[autodoc]] Swin2SRConfig + +## Swin2SRModel + +[[autodoc]] Swin2SRModel + - forward + +## Swin2SRForImageSuperResolution + +[[autodoc]] Swin2SRForImageSuperResolution + - forward diff --git a/docs/source/en/model_doc/swin2sr.mdx b/docs/source/en/model_doc/swin2sr.mdx deleted file mode 100644 index edb073d1ee38..000000000000 --- a/docs/source/en/model_doc/swin2sr.mdx +++ /dev/null @@ -1,57 +0,0 @@ - - -# Swin2SR - -## Overview - -The Swin2SR model was proposed in [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. -Swin2R improves the [SwinIR](https://github.com/JingyunLiang/SwinIR/) model by incorporating [Swin Transformer v2](swinv2) layers which mitigates issues such as training instability, resolution gaps between pre-training -and fine-tuning, and hunger on data. - -The abstract from the paper is the following: - -*Compression plays an important role on the efficient transmission and storage of images and videos through band-limited systems such as streaming services, virtual reality or videogames. However, compression unavoidably leads to artifacts and the loss of the original information, which may severely degrade the visual quality. For these reasons, quality enhancement of compressed images has become a popular research topic. While most state-of-the-art image restoration methods are based on convolutional neural networks, other transformers-based methods such as SwinIR, show impressive performance on these tasks. -In this paper, we explore the novel Swin Transformer V2, to improve SwinIR for image super-resolution, and in particular, the compressed input scenario. Using this method we can tackle the major issues in training transformer vision models, such as training instability, resolution gaps between pre-training and fine-tuning, and hunger on data. We conduct experiments on three representative tasks: JPEG compression artifacts removal, image super-resolution (classical and lightweight), and compressed image super-resolution. Experimental results demonstrate that our method, Swin2SR, can improve the training convergence and performance of SwinIR, and is a top-5 solution at the "AIM 2022 Challenge on Super-Resolution of Compressed Image and Video".* - - - - Swin2SR architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](https://github.com/mv-lab/swin2sr). 
- -## Resources - -Demo notebooks for Swin2SR can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Swin2SR). - -A demo Space for image super-resolution with SwinSR can be found [here](https://huggingface.co/spaces/jjourney1125/swin2sr). - -## Swin2SRImageProcessor - -[[autodoc]] Swin2SRImageProcessor - - preprocess - -## Swin2SRConfig - -[[autodoc]] Swin2SRConfig - -## Swin2SRModel - -[[autodoc]] Swin2SRModel - - forward - -## Swin2SRForImageSuperResolution - -[[autodoc]] Swin2SRForImageSuperResolution - - forward diff --git a/docs/source/en/model_doc/swinv2.md b/docs/source/en/model_doc/swinv2.md new file mode 100644 index 000000000000..e08389527ece --- /dev/null +++ b/docs/source/en/model_doc/swinv2.md @@ -0,0 +1,65 @@ + + +# Swin Transformer V2 + +## Overview + +The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. + +The abstract from the paper is the following: + +*Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.* + +Tips: +- One can use the [`AutoImageProcessor`] API to prepare images for the model. + +This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik). +The original code can be found [here](https://github.com/microsoft/Swin-Transformer). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Swin Transformer v2. + + + +- [`Swinv2ForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). 
+- See also: [Image classification task guide](../tasks/image_classification) + +Besides that: + +- [`Swinv2ForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## Swinv2Config + +[[autodoc]] Swinv2Config + +## Swinv2Model + +[[autodoc]] Swinv2Model + - forward + +## Swinv2ForMaskedImageModeling + +[[autodoc]] Swinv2ForMaskedImageModeling + - forward + +## Swinv2ForImageClassification + +[[autodoc]] transformers.Swinv2ForImageClassification + - forward diff --git a/docs/source/en/model_doc/swinv2.mdx b/docs/source/en/model_doc/swinv2.mdx deleted file mode 100644 index 576f1a142a63..000000000000 --- a/docs/source/en/model_doc/swinv2.mdx +++ /dev/null @@ -1,47 +0,0 @@ - - -# Swin Transformer V2 - -## Overview - -The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. - -The abstract from the paper is the following: - -*Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.* - -Tips: -- One can use the [`AutoImageProcessor`] API to prepare images for the model. - -This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik). -The original code can be found [here](https://github.com/microsoft/Swin-Transformer). 
- - -## Swinv2Config - -[[autodoc]] Swinv2Config - -## Swinv2Model - -[[autodoc]] Swinv2Model - - forward - -## Swinv2ForMaskedImageModeling - -[[autodoc]] Swinv2ForMaskedImageModeling - - forward - -## Swinv2ForImageClassification - -[[autodoc]] transformers.Swinv2ForImageClassification - - forward diff --git a/docs/source/en/model_doc/switch_transformers.md b/docs/source/en/model_doc/switch_transformers.md new file mode 100644 index 000000000000..8f6a231b7ef7 --- /dev/null +++ b/docs/source/en/model_doc/switch_transformers.md @@ -0,0 +1,72 @@ + + +# SwitchTransformers + +## Overview + +The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. + +The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale. +During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations. + + +The abstract from the paper is the following: + +*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.* + +Tips: + +- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository. +- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned. + +This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . +The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe). 
+ +## Resources + +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## SwitchTransformersConfig + +[[autodoc]] SwitchTransformersConfig + +## SwitchTransformersTop1Router + +[[autodoc]] SwitchTransformersTop1Router + - _compute_router_probabilities + - forward + +## SwitchTransformersSparseMLP + +[[autodoc]] SwitchTransformersSparseMLP + - forward + +## SwitchTransformersModel + +[[autodoc]] SwitchTransformersModel + - forward + +## SwitchTransformersForConditionalGeneration + +[[autodoc]] SwitchTransformersForConditionalGeneration + - forward + +## SwitchTransformersEncoderModel + +[[autodoc]] SwitchTransformersEncoderModel + - forward diff --git a/docs/source/en/model_doc/switch_transformers.mdx b/docs/source/en/model_doc/switch_transformers.mdx deleted file mode 100644 index 348c831a0e98..000000000000 --- a/docs/source/en/model_doc/switch_transformers.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# SwitchTransformers - -## Overview - -The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. - -The Switch Transformer model uses a sparse T5 encoder-decoder architecure, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale. -During a forward pass, only a fraction of the weights are used. The routing mecanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations. - - -The abstract from the paper is the following: - -*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.* - -Tips: - -- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository. -- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned. 
- -This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArtZucker) . -The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe). - - -## SwitchTransformersConfig - -[[autodoc]] SwitchTransformersConfig - -## SwitchTransformersTop1Router - -[[autodoc]] SwitchTransformersTop1Router - - _compute_router_probabilities - - forward - -## SwitchTransformersSparseMLP - -[[autodoc]] SwitchTransformersSparseMLP - - forward - -## SwitchTransformersModel - -[[autodoc]] SwitchTransformersModel - - forward - -## SwitchTransformersForConditionalGeneration - -[[autodoc]] SwitchTransformersForConditionalGeneration - - forward - -## SwitchTransformersEncoderModel - -[[autodoc]] SwitchTransformersEncoderModel - - forward diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md new file mode 100644 index 000000000000..2e833e8e1a67 --- /dev/null +++ b/docs/source/en/model_doc/t5.md @@ -0,0 +1,446 @@ + + +# T5 + +
+
+Models | Spaces | Paper page
+
+ +## Overview + +The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by [Colin Raffel](https://huggingface.co/craffel), Noam Shazeer, [Adam Roberts](https://huggingface.co/adarob), Katherine Lee, Sharan Narang, +Michael Matena, Yanqi Zhou, Wei Li, [Peter J. Liu](https://huggingface.co/peterjliu). + +The abstract from the paper is the following: + +*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream +task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning +has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of +transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a +text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer +approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration +with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering +summarization, question answering, text classification, and more. To facilitate future work on transfer learning for +NLP, we release our dataset, pre-trained models, and code.* + +Tips: + +- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which +each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a +different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*, +for summarization: *summarize: ...*. +- The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above). +- Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original sentence and the target is then the dropped out tokens delimited by their sentinel tokens. + +- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right. + +- See the [training](#training), [inference](#inference) and [scripts](#scripts) sections below for all details regarding usage. + +T5 comes in different sizes: + +- [t5-small](https://huggingface.co/t5-small) + +- [t5-base](https://huggingface.co/t5-base) + +- [t5-large](https://huggingface.co/t5-large) + +- [t5-3b](https://huggingface.co/t5-3b) + +- [t5-11b](https://huggingface.co/t5-11b). + +Based on the original T5 model, Google has released some follow-up works: + +- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without + mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found [here](t5v1.1). + +- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to + the documentation of mT5 which can be found [here](mt5). 
+
+- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
+  to the documentation of byT5 which can be found [here](byt5).
+
+- **UL2**: UL2 is a T5-like model pretrained on various denoising objectives.
+
+- **Flan-T5**: Flan is a pretraining method that is based on prompting. The Flan-T5 models are T5 models trained on the Flan collection of
+  datasets which include: `taskmaster2`, `djaym7/wiki_dialog`, `deepmind/code_contests`, `lambada`, `gsm8k`, `aqua_rat`, `esnli`, `quasc` and `qed`.
+
+- **FLan-UL2**: the UL2 model finetuned using the "Flan" prompt tuning and dataset collection.
+
+- **UMT5**: UmT5 is a multilingual T5 model trained on an improved and refreshed mC4 multilingual corpus, 29 trillion characters across 107 languages, using a new sampling method, UniMax. Refer to
+  the documentation of umT5 which can be found [here](umt5).
+
+All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5).
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer).
+
+
+
+## Training
+
+T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
+forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
+sequence is fed to the model using `input_ids`. The target sequence is shifted to the right, i.e., prepended by a
+start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target
+sequence is then appended by the EOS token and corresponds to the `labels`. The PAD token is hereby used as the
+start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+One can use [`T5ForConditionalGeneration`] (or the Tensorflow/Flax variant), which includes the
+language modeling head on top of the decoder.
+
+- Unsupervised denoising training
+
+In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
+sentinel token represents a unique mask token for this sentence and should start with `<extra_id_0>`,
+`<extra_id_1>`, ... up to `<extra_id_99>`. As a default, 100 sentinel tokens are available in
+[`T5Tokenizer`].
+
+For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
+processed as follows:
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+
+>>> # the forward function automatically creates the correct decoder_input_ids
+>>> loss = model(input_ids=input_ids, labels=labels).loss
+>>> loss.item()
+3.7837
+```
+
+If you're interested in pre-training T5 on a new corpus, check out the [run_t5_mlm_flax.py](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling) script in the Examples
+directory.
+
+- Supervised training
+
+In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
+Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input +sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for +the model as follows: + +```python +>>> from transformers import T5Tokenizer, T5ForConditionalGeneration + +>>> tokenizer = T5Tokenizer.from_pretrained("t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("t5-small") + +>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids +>>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids + +>>> # the forward function automatically creates the correct decoder_input_ids +>>> loss = model(input_ids=input_ids, labels=labels).loss +>>> loss.item() +0.2542 +``` + +As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the +`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded +target sequence). The model will automatically create the `decoder_input_ids` based on the `labels`, by +shifting them one position to the right and prepending the `config.decoder_start_token_id`, which for T5 is +equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate +English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used +during T5's pre-training. + +However, the example above only shows a single training example. In practice, one trains deep learning models in +batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one +typically defines a `max_source_length` and `max_target_length`, which determine the maximum length of the +input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on +the task. + +In addition, we must make sure that padding token id's of the `labels` are not taken into account by the loss +function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the `ignore_index` +of the `CrossEntropyLoss`. In Flax, one can use the `decoder_attention_mask` to ignore padded tokens from +the loss (see the [Flax summarization script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization) for details). We also pass +`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are +ignored. The code example below illustrates all of this. + +```python +>>> from transformers import T5Tokenizer, T5ForConditionalGeneration +>>> import torch + +>>> tokenizer = T5Tokenizer.from_pretrained("t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("t5-small") + +>>> # the following 2 hyperparameters are task-specific +>>> max_source_length = 512 +>>> max_target_length = 128 + +>>> # Suppose we have the following 2 training examples: +>>> input_sequence_1 = "Welcome to NYC" +>>> output_sequence_1 = "Bienvenue à NYC" + +>>> input_sequence_2 = "HuggingFace is a company" +>>> output_sequence_2 = "HuggingFace est une entreprise" + +>>> # encode the inputs +>>> task_prefix = "translate English to French: " +>>> input_sequences = [input_sequence_1, input_sequence_2] + +>>> encoding = tokenizer( +... [task_prefix + sequence for sequence in input_sequences], +... padding="longest", +... max_length=max_source_length, +... truncation=True, +... 
return_tensors="pt",
+... )
+
+>>> input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
+
+>>> # encode the targets
+>>> target_encoding = tokenizer(
+...     [output_sequence_1, output_sequence_2],
+...     padding="longest",
+...     max_length=max_target_length,
+...     truncation=True,
+...     return_tensors="pt",
+... )
+>>> labels = target_encoding.input_ids
+
+>>> # replace padding token id's of the labels by -100 so it's ignored by the loss
+>>> labels[labels == tokenizer.pad_token_id] = -100
+
+>>> # forward pass
+>>> loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
+>>> loss.item()
+0.188
+```
+
+Additional training tips:
+
+- T5 models need a slightly higher learning rate than the default one set in the `Trainer` when using the AdamW
+optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
+answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
+
+According to [this forum post](https://discuss.huggingface.co/t/t5-finetuning-tips/684), task prefixes matter when
+(1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
+pre-training mixture (see Appendix D of the [paper](https://arxiv.org/pdf/1910.10683.pdf) for the task prefixes
+used).
+
+If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
+*pad_to_multiple_of* to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
+batches to the longest example is not recommended on TPU, as it triggers a recompilation for every batch shape
+encountered during training and thus significantly slows down training; in other words, padding only up to the
+longest example in a batch leads to very slow training on TPU.
+
+
+
+## Inference
+
+At inference time, it is recommended to use [`~generation.GenerationMixin.generate`]. This
+method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
+and auto-regressively generates the decoder output. Check out [this blog post](https://huggingface.co/blog/how-to-generate) to know all the details about generating text with Transformers.
+There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encoder-decoder) which explains how
+generation works in general in encoder-decoder models.
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+>>> outputs = model.generate(input_ids)
+>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+Das Haus ist wunderbar.
+```
+
+Note that T5 uses the `pad_token_id` as the `decoder_start_token_id`, so when doing generation without using
+[`~generation.GenerationMixin.generate`], make sure you start it with the `pad_token_id`.
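+
+For illustration, here is a minimal sketch of a single greedy decoding step performed without
+[`~generation.GenerationMixin.generate`], starting the decoder from the pad token (the input sentence is simply reused from above):
+
+```python
+>>> import torch
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+
+>>> # start the decoder with the pad token, which T5 uses as decoder_start_token_id
+>>> decoder_input_ids = torch.full((1, 1), model.config.decoder_start_token_id, dtype=torch.long)
+
+>>> # one greedy step: append the most likely next token to the decoder input
+>>> logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
+>>> next_token_id = logits[:, -1].argmax(dim=-1, keepdim=True)
+>>> decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=-1)
+```
+
+The example above only shows a single example.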
+The example above only shows a single example. You can also do batched inference, like so:
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> task_prefix = "translate English to German: "
+>>> # use different length sentences to test batching
+>>> sentences = ["The house is wonderful.", "I like to work in NYC."]
+
+>>> inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
+
+>>> output_sequences = model.generate(
+...     input_ids=inputs["input_ids"],
+...     attention_mask=inputs["attention_mask"],
+...     do_sample=False,  # disable sampling to test if batching affects output
+... )
+
+>>> print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
+['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
+```
+
+Because T5 has been trained with the span-mask denoising objective,
+it can be used to predict the sentinel (masked-out) tokens during inference.
+The predicted tokens will then be placed between the sentinel tokens.
+
+```python
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+
+>>> sequence_ids = model.generate(input_ids)
+>>> sequences = tokenizer.batch_decode(sequence_ids)
+>>> sequences
+['<pad><extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>']
+```
+
+
+
+
+## Performance
+
+If you'd like faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start); the model will then automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter.
+
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with T5. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A notebook on how to [finetune T5 for classification and multiple choice](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb).
+- A notebook on how to [finetune T5 for sentiment span extraction](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb). 🌎
+
+
+
+- A notebook on how to [finetune T5 for named entity recognition](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing). 🌎
+
+
+
+- A notebook on [Finetuning CodeT5 for generating docstrings from Ruby code](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb).
+
+
+
+- A notebook to [Finetune T5-base-dutch to perform Dutch abstractive summarization on a TPU](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb).
+- A notebook for how to [finetune T5 for summarization in PyTorch and track experiments with WandB](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb#scrollTo=OKRpFvYhBauC). 🌎 +- A blog post on [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq). +- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb). +- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). +- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization). +- [Summarization](https://huggingface.co/course/chapter7/5?fw=pt#summarization) chapter of the 🤗 Hugging Face course. +- [Summarization task guide](../tasks/summarization) + + + +- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#t5-like-span-masked-language-modeling) for training T5 with a span-masked language model objective. The script also shows how to train a T5 tokenizer. [`FlaxT5ForConditionalGeneration`] is also supported by this [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). + + + +- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb). +- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). +- [Translation task guide](../tasks/translation) + + + +- A notebook on how to [finetune T5 for question answering with TensorFlow 2](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb). 🌎 +- A notebook on how to [finetune T5 for question answering on a TPU](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil). + +🚀 **Deploy** +- A blog post on how to deploy [T5 11B for inference for less than $500](https://www.philschmid.de/deploy-t5-11b). 
+ +## T5Config + +[[autodoc]] T5Config + +## T5Tokenizer + +[[autodoc]] T5Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## T5TokenizerFast + +[[autodoc]] T5TokenizerFast + +## T5Model + +[[autodoc]] T5Model + - forward + +## T5ForConditionalGeneration + +[[autodoc]] T5ForConditionalGeneration + - forward + +## T5EncoderModel + +[[autodoc]] T5EncoderModel + - forward + +## T5ForSequenceClassification + +[[autodoc]] T5ForSequenceClassification + - forward + +## T5ForQuestionAnswering + +[[autodoc]] T5ForQuestionAnswering + - forward + +## TFT5Model + +[[autodoc]] TFT5Model + - call + +## TFT5ForConditionalGeneration + +[[autodoc]] TFT5ForConditionalGeneration + - call + +## TFT5EncoderModel + +[[autodoc]] TFT5EncoderModel + - call + +## FlaxT5Model + +[[autodoc]] FlaxT5Model + - __call__ + - encode + - decode + +## FlaxT5ForConditionalGeneration + +[[autodoc]] FlaxT5ForConditionalGeneration + - __call__ + - encode + - decode + +## FlaxT5EncoderModel + +[[autodoc]] FlaxT5EncoderModel + - __call__ diff --git a/docs/source/en/model_doc/t5.mdx b/docs/source/en/model_doc/t5.mdx deleted file mode 100644 index 995816061c76..000000000000 --- a/docs/source/en/model_doc/t5.mdx +++ /dev/null @@ -1,412 +0,0 @@ - - -# T5 - -## Overview - -The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, -Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. - -The abstract from the paper is the following: - -*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream -task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning -has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of -transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a -text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer -approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration -with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering -summarization, question answering, text classification, and more. To facilitate future work on transfer learning for -NLP, we release our dataset, pre-trained models, and code.* - -Tips: - -- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which -each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a -different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*, -for summarization: *summarize: ...*. - -- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right. - -- See the [training](#training), [inference](#inference) and [scripts](#scripts) sections below for all details regarding usage. 
- -T5 comes in different sizes: - -- [t5-small](https://huggingface.co/t5-small) - -- [t5-base](https://huggingface.co/t5-base) - -- [t5-large](https://huggingface.co/t5-large) - -- [t5-3b](https://huggingface.co/t5-3b) - -- [t5-11b](https://huggingface.co/t5-11b). - -Based on the original T5 model, Google has released some follow-up works: - -- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without - mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found [here](t5v1.1). - -- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to - the documentation of mT5 which can be found [here](mt5). - -- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer - to the documentation of byT5 which can be found [here](byt5). - -All checkpoints can be found on the [hub](https://huggingface.co/models?search=t5). - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/google-research/text-to-text-transfer-transformer). - - - -## Training - -T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher -forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input -sequence is fed to the model using `input_ids`. The target sequence is shifted to the right, i.e., prepended by a -start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target -sequence is then appended by the EOS token and corresponds to the `labels`. The PAD token is hereby used as the -start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion. - -One can use [`T5ForConditionalGeneration`] (or the Tensorflow/Flax variant), which includes the -language modeling head on top of the decoder. - -- Unsupervised denoising training - -In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and -the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each -sentinel token represents a unique mask token for this sentence and should start with ``, -``, ... up to ``. As a default, 100 sentinel tokens are available in -[`T5Tokenizer`]. - -For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be -processed as follows: - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids ->>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(input_ids=input_ids, labels=labels).loss ->>> loss.item() -3.7837 -``` - -If you're interested in pre-training T5 on a new corpus, check out the [run_t5_mlm_flax.py](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling) script in the Examples -directory. - -- Supervised training - -In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping. 
-Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input -sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for -the model as follows: - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids ->>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(input_ids=input_ids, labels=labels).loss ->>> loss.item() -0.2542 -``` - -As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the -`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded -target sequence). The model will automatically create the `decoder_input_ids` based on the `labels`, by -shifting them one position to the right and prepending the `config.decoder_start_token_id`, which for T5 is -equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate -English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used -during T5's pre-training. - -However, the example above only shows a single training example. In practice, one trains deep learning models in -batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one -typically defines a `max_source_length` and `max_target_length`, which determine the maximum length of the -input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on -the task. - -In addition, we must make sure that padding token id's of the `labels` are not taken into account by the loss -function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the `ignore_index` -of the `CrossEntropyLoss`. In Flax, one can use the `decoder_attention_mask` to ignore padded tokens from -the loss (see the [Flax summarization script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization) for details). We also pass -`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are -ignored. The code example below illustrates all of this. - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> import torch - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> # the following 2 hyperparameters are task-specific ->>> max_source_length = 512 ->>> max_target_length = 128 - ->>> # Suppose we have the following 2 training examples: ->>> input_sequence_1 = "Welcome to NYC" ->>> output_sequence_1 = "Bienvenue à NYC" - ->>> input_sequence_2 = "HuggingFace is a company" ->>> output_sequence_2 = "HuggingFace est une entreprise" - ->>> # encode the inputs ->>> task_prefix = "translate English to French: " ->>> input_sequences = [input_sequence_1, input_sequence_2] - ->>> encoding = tokenizer( -... [task_prefix + sequence for sequence in input_sequences], -... padding="longest", -... max_length=max_source_length, -... truncation=True, -... 
return_tensors="pt", -... ) - ->>> input_ids, attention_mask = encoding.input_ids, encoding.attention_mask - ->>> # encode the targets ->>> target_encoding = tokenizer( -... [output_sequence_1, output_sequence_2], -... padding="longest", -... max_length=max_target_length, -... truncation=True, -... return_tensors="pt", -... ) ->>> labels = target_encoding.input_ids - ->>> # replace padding token id's of the labels by -100 so it's ignored by the loss ->>> labels[labels == tokenizer.pad_token_id] = -100 - ->>> # forward pass ->>> loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss ->>> loss.item() -0.188 -``` - -Additional training tips: - -- T5 models need a slightly higher learning rate than the default one set in the `Trainer` when using the AdamW -optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question -answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer. - -According to [this forum post](https://discuss.huggingface.co/t/t5-finetuning-tips/684), task prefixes matter when -(1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's -pre-training mixture (see Appendix D of the [paper](https://arxiv.org/pdf/1910.10683.pdf) for the task prefixes -used). - -If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of -*pad_to_multiple_of* to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding -batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is -encountered during training thus significantly slowing down the training. only padding up to the longest example in a -batch) leads to very slow training on TPU. - - - -## Inference - -At inference time, it is recommended to use [`~generation.GenerationMixin.generate`]. This -method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder -and auto-regressively generates the decoder output. Check out [this blog post](https://huggingface.co/blog/how-to-generate) to know all the details about generating text with Transformers. -There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encoder-decoder) which explains how -generation works in general in encoder-decoder models. - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids ->>> outputs = model.generate(input_ids) ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -Das Haus ist wunderbar. -``` - -Note that T5 uses the `pad_token_id` as the `decoder_start_token_id`, so when doing generation without using -[`~generation.GenerationMixin.generate`], make sure you start it with the `pad_token_id`. - -The example above only shows a single example. 
You can also do batched inference, like so: - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> task_prefix = "translate English to German: " ->>> # use different length sentences to test batching ->>> sentences = ["The house is wonderful.", "I like to work in NYC."] - ->>> inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True) - ->>> output_sequences = model.generate( -... input_ids=inputs["input_ids"], -... attention_mask=inputs["attention_mask"], -... do_sample=False, # disable sampling to test if batching affects output -... ) - ->>> print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True)) -['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.'] -``` - -Because T5 has been trained with the span-mask denoising objective, -it can be used to predict the sentinel (masked-out) tokens during inference. -The predicted tokens will then be placed between the sentinel tokens. - -```python ->>> from transformers import T5Tokenizer, T5ForConditionalGeneration - ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - ->>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids - ->>> sequence_ids = model.generate(input_ids) ->>> sequences = tokenizer.batch_decode(sequence_ids) ->>> sequences -[' park offers the park.'] -``` - - - - -## Performance - -If you'd like a faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start) and then the model will automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter. - - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with T5. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A notebook for how to [finetune T5 for classification and multiple choice](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb). -- A notebook for how to [finetune T5 for sentiment span extraction](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb). 🌎 - - - -- A notebook for how to [finetune T5 for named entity recognition](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing). 🌎 - - - -- A notebook for [Finetuning CodeT5 for generating docstrings from Ruby code](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb). - - - -- A notebook to [Finetune T5-base-dutch to perform Dutch abstractive summarization on a TPU](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb). 
-- A notebook for how to [finetune T5 for summarization in PyTorch and track experiments with WandB](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb#scrollTo=OKRpFvYhBauC). 🌎 -- A blog post on [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq). -- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [noteboook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb). -- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). -- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization). -- [Summarization](https://huggingface.co/course/chapter7/5?fw=pt#summarization) chapter of the 🤗 Hugging Face course. - - - -- [`FlaxT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#t5-like-span-masked-language-modeling) for training T5 with a span-masked language model objective. The script also shows how to train a T5 tokenizer. [`FlaxT5ForConditionalGeneration`] is also supported by this [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). - - - -- [`T5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb). -- [`TFT5ForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). - - - -- A notebook on how to [finetune T5 for question answering with TensorFlow 2](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb). 🌎 -- A notebook on how to [finetune T5 for question answering on a TPU](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil). - -🚀 **Deploy** -- A blog post on how to deploy [T5 11B for inference for less than $500](https://www.philschmid.de/deploy-t5-11b). 
- -## T5Config - -[[autodoc]] T5Config - -## T5Tokenizer - -[[autodoc]] T5Tokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## T5TokenizerFast - -[[autodoc]] T5TokenizerFast - -## T5Model - -[[autodoc]] T5Model - - forward - - parallelize - - deparallelize - -## T5ForConditionalGeneration - -[[autodoc]] T5ForConditionalGeneration - - forward - - parallelize - - deparallelize - -## T5EncoderModel - -[[autodoc]] T5EncoderModel - - forward - - parallelize - - deparallelize - -## TFT5Model - -[[autodoc]] TFT5Model - - call - -## TFT5ForConditionalGeneration - -[[autodoc]] TFT5ForConditionalGeneration - - call - -## TFT5EncoderModel - -[[autodoc]] TFT5EncoderModel - - call - -## FlaxT5Model - -[[autodoc]] FlaxT5Model - - __call__ - - encode - - decode - -## FlaxT5ForConditionalGeneration - -[[autodoc]] FlaxT5ForConditionalGeneration - - __call__ - - encode - - decode - -## FlaxT5EncoderModel - -[[autodoc]] FlaxT5EncoderModel - - __call__ diff --git a/docs/source/en/model_doc/t5v1.1.md b/docs/source/en/model_doc/t5v1.1.md new file mode 100644 index 000000000000..900e26f521dd --- /dev/null +++ b/docs/source/en/model_doc/t5v1.1.md @@ -0,0 +1,65 @@ + + +# T5v1.1 + +## Overview + +T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) +repository by Colin Raffel et al. It's an improved version of the original T5 model. + +One can directly plug in the weights of T5v1.1 into a T5 model, like so: + +```python +>>> from transformers import T5ForConditionalGeneration + +>>> model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base") +``` + +T5 Version 1.1 includes the following improvements compared to the original T5 model: + +- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See [this paper](https://arxiv.org/abs/2002.05202). + +- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning. + +- Pre-trained on C4 only without mixing in the downstream tasks. + +- No parameter sharing between the embedding and classifier layer. + +- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger `d_model` and smaller + `num_heads` and `d_ff`. + +Note: T5 Version 1.1 was only pre-trained on [C4](https://huggingface.co/datasets/c4) excluding any supervised +training. Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 +model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task +fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. + +Google has released the following variants: + +- [google/t5-v1_1-small](https://huggingface.co/google/t5-v1_1-small) + +- [google/t5-v1_1-base](https://huggingface.co/google/t5-v1_1-base) + +- [google/t5-v1_1-large](https://huggingface.co/google/t5-v1_1-large) + +- [google/t5-v1_1-xl](https://huggingface.co/google/t5-v1_1-xl) + +- [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl). + +One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be +found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511). 
diff --git a/docs/source/en/model_doc/t5v1.1.mdx b/docs/source/en/model_doc/t5v1.1.mdx deleted file mode 100644 index a5b64f77dc7c..000000000000 --- a/docs/source/en/model_doc/t5v1.1.mdx +++ /dev/null @@ -1,61 +0,0 @@ - - -# T5v1.1 - -## Overview - -T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) -repository by Colin Raffel et al. It's an improved version of the original T5 model. - -One can directly plug in the weights of T5v1.1 into a T5 model, like so: - -```python ->>> from transformers import T5ForConditionalGeneration - ->>> model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base") -``` - -T5 Version 1.1 includes the following improvements compared to the original T5 model: - -- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See [this paper](https://arxiv.org/abs/2002.05202). - -- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning. - -- Pre-trained on C4 only without mixing in the downstream tasks. - -- No parameter sharing between the embedding and classifier layer. - -- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger `d_model` and smaller - `num_heads` and `d_ff`. - -Note: T5 Version 1.1 was only pre-trained on [C4](https://huggingface.co/datasets/c4) excluding any supervised -training. Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 -model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task -fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. - -Google has released the following variants: - -- [google/t5-v1_1-small](https://huggingface.co/google/t5-v1_1-small) - -- [google/t5-v1_1-base](https://huggingface.co/google/t5-v1_1-base) - -- [google/t5-v1_1-large](https://huggingface.co/google/t5-v1_1-large) - -- [google/t5-v1_1-xl](https://huggingface.co/google/t5-v1_1-xl) - -- [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl). - -One can refer to [T5's documentation page](t5) for all tips, code examples and notebooks. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be -found [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511). diff --git a/docs/source/en/model_doc/table-transformer.md b/docs/source/en/model_doc/table-transformer.md new file mode 100644 index 000000000000..7ea7ae8cd352 --- /dev/null +++ b/docs/source/en/model_doc/table-transformer.md @@ -0,0 +1,68 @@ + + +# Table Transformer + +## Overview + +The Table Transformer model was proposed in [PubTables-1M: Towards comprehensive table extraction from unstructured documents](https://arxiv.org/abs/2110.00061) by +Brandon Smock, Rohith Pesala, Robin Abraham. The authors introduce a new dataset, PubTables-1M, to benchmark progress in table extraction from unstructured documents, +as well as table structure recognition and functional analysis. The authors train 2 [DETR](detr) models, one for table detection and one for table structure recognition, dubbed Table Transformers. + +The abstract from the paper is the following: + +*Recently, significant progress has been made applying machine learning to the problem of table structure inference and extraction from unstructured documents. 
+However, one of the greatest challenges remains the creation of datasets with complete, unambiguous ground truth at scale. To address this, we develop a new, more +comprehensive dataset for table extraction, called PubTables-1M. PubTables-1M contains nearly one million tables from scientific articles, supports multiple input +modalities, and contains detailed header and location information for table structures, making it useful for a wide variety of modeling approaches. It also addresses a significant +source of ground truth inconsistency observed in prior datasets called oversegmentation, using a novel canonicalization procedure. We demonstrate that these improvements lead to a +significant increase in training performance and a more reliable estimate of model performance at evaluation for table structure recognition. Further, we show that transformer-based +object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any +special customization for these tasks.* + +Tips: + +- The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). +- One can use the [`AutoImageProcessor`] API to prepare images and optional targets for the model. This will load a [`DetrImageProcessor`] behind the scenes. + + + + Table detection and table structure recognition clarified. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be +found [here](https://github.com/microsoft/table-transformer). + +## Resources + + + +- A demo notebook for the Table Transformer can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Table%20Transformer). +- It turns out padding of images is quite important for detection. An interesting Github thread with replies from the authors can be found [here](https://github.com/microsoft/table-transformer/issues/68). + +## TableTransformerConfig + +[[autodoc]] TableTransformerConfig + +## TableTransformerModel + +[[autodoc]] TableTransformerModel + - forward + +## TableTransformerForObjectDetection + +[[autodoc]] TableTransformerForObjectDetection + - forward diff --git a/docs/source/en/model_doc/table-transformer.mdx b/docs/source/en/model_doc/table-transformer.mdx deleted file mode 100644 index 862f4124c25f..000000000000 --- a/docs/source/en/model_doc/table-transformer.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# Table Transformer - -## Overview - -The Table Transformer model was proposed in [PubTables-1M: Towards comprehensive table extraction from unstructured documents](https://arxiv.org/abs/2110.00061) by -Brandon Smock, Rohith Pesala, Robin Abraham. The authors introduce a new dataset, PubTables-1M, to benchmark progress in table extraction from unstructured documents, -as well as table structure recognition and functional analysis. The authors train 2 [DETR](detr) models, one for table detection and one for table structure recognition, dubbed Table Transformers. - -The abstract from the paper is the following: - -*Recently, significant progress has been made applying machine learning to the problem of table structure inference and extraction from unstructured documents. 
-However, one of the greatest challenges remains the creation of datasets with complete, unambiguous ground truth at scale. To address this, we develop a new, more -comprehensive dataset for table extraction, called PubTables-1M. PubTables-1M contains nearly one million tables from scientific articles, supports multiple input -modalities, and contains detailed header and location information for table structures, making it useful for a wide variety of modeling approaches. It also addresses a significant -source of ground truth inconsistency observed in prior datasets called oversegmentation, using a novel canonicalization procedure. We demonstrate that these improvements lead to a -significant increase in training performance and a more reliable estimate of model performance at evaluation for table structure recognition. Further, we show that transformer-based -object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any -special customization for these tasks.* - -Tips: - -- The authors released 2 models, one for [table detection](https://huggingface.co/microsoft/table-transformer-detection) in documents, one for [table structure recognition](https://huggingface.co/microsoft/table-transformer-structure-recognition) (the task of recognizing the individual rows, columns etc. in a table). -- One can use the [`AutoImageProcessor`] API to prepare images and optional targets for the model. This will load a [`DetrImageProcessor`] behind the scenes. - - - - Table detection and table structure recognition clarified. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be -found [here](https://github.com/microsoft/table-transformer). - -## Resources - - - -- A demo notebook for the Table Transformer can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Table%20Transformer). -- It turns out padding of images is quite important for detection. An interesting Github thread with replies from the authors can be found [here](https://github.com/microsoft/table-transformer/issues/68). - -## TableTransformerConfig - -[[autodoc]] TableTransformerConfig - -## TableTransformerModel - -[[autodoc]] TableTransformerModel - - forward - -## TableTransformerForObjectDetection - -[[autodoc]] TableTransformerForObjectDetection - - forward diff --git a/docs/source/en/model_doc/tapas.md b/docs/source/en/model_doc/tapas.md new file mode 100644 index 000000000000..1c76015f2857 --- /dev/null +++ b/docs/source/en/model_doc/tapas.md @@ -0,0 +1,623 @@ + + +# TAPAS + +## Overview + +The TAPAS model was proposed in [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://www.aclweb.org/anthology/2020.acl-main.398) +by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically +designed (and pre-trained) for answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 +token types that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset comprising +millions of tables from English Wikipedia and corresponding texts. + +For question answering, TAPAS has 2 heads on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or summing) among selected cells. 
TAPAS has been fine-tuned on several datasets: +- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) +- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) +- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce). + +It achieves state-of-the-art on both SQA and WTQ, while having comparable performance to SOTA on WikiSQL, with a much simpler architecture. + +The abstract from the paper is the following: + +*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition, the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.* + +In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced dataset of millions of automatically created training examples which are learned in an intermediate step prior to fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on [TabFact](https://github.com/wenhuchen/Table-Fact-Checking), a large-scale dataset with 16k Wikipedia tables for table entailment (a binary classification task). For more details, see their follow-up paper: [Understanding tables with intermediate pre-training](https://www.aclweb.org/anthology/2020.findings-emnlp.27/) by Julian Martin Eisenschlos, Syrine Krichene and Thomas Müller. + + + + TAPAS architecture. Taken from the original blog post. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tensorflow version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/tapas). + +Tips: + +- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell of the table). Note that this is something that was added after the publication of the original TAPAS paper. 
According to the authors, this usually results in slightly better performance, and allows you to encode longer sequences without running out of embeddings. This is reflected in the `reset_position_index_per_cell` parameter of [`TapasConfig`], which is set to `True` by default. The default versions of the models available on the [hub](https://huggingface.co/models?search=tapas) all use relative position embeddings. You can still use the ones with absolute position embeddings by passing in an additional argument `revision="no_reset"` when calling the `from_pretrained()` method. Note that it's usually advised to pad the inputs on the right rather than the left.
+- TAPAS is based on BERT, so `TAPAS-base` for example corresponds to a `BERT-base` architecture. Of course, `TAPAS-large` will result in the best performance (the results reported in the paper are from `TAPAS-large`). Results of the various sized models are shown on the [original GitHub repository](https://github.com/google-research/tapas).
+- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that case, you have to feed every table-question pair one by one to the model, such that the `prev_labels` token type ids can be overwritten by the predicted `labels` of the model for the previous question. See the "Usage" section for more info.
+- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language modeling (CLM) objective are better in that regard. Note that TAPAS can be used as an encoder in the EncoderDecoderModel framework, to combine it with an autoregressive text decoder such as GPT-2.
+
+## Usage: fine-tuning
+
+Here we explain how you can fine-tune [`TapasForQuestionAnswering`] on your own dataset.
+
+**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment**
+
+Basically, there are 3 different ways in which one can fine-tune [`TapasForQuestionAnswering`], corresponding to the different datasets on which TAPAS was fine-tuned:
+
+1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example, if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is he?". Here, questions do not involve any aggregation (all questions are cell selection questions).
+2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions related to a table, which might involve aggregation, such as counting the number of rows, summing up cell values or averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his career?". This case is also called **weak supervision**, since the model itself must learn the appropriate aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision.
+3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation operator is much easier.
+
+To summarize:
+
+| **Task**                            | **Example dataset** | **Description**                                                                                           |
+|-------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
+| Conversational                      | SQA                 | Conversational, only cell selection questions                                                             |
+| Weak supervision for aggregation    | WTQ                 | Questions might involve aggregation, and the model must learn this given only the answer as supervision   |
+| Strong supervision for aggregation  | WikiSQL-supervised  | Questions might involve aggregation, and the model must learn this given the gold aggregation operator    |
+
+
+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below.
+
+```py
+>>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+>>> # for example, the base sized model with default SQA configuration
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")
+
+>>> # or, the base sized model with WTQ configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> # or, the base sized model with WikiSQL configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+```
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example:
+
+```py
+>>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+>>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
+>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
+>>> # initializing the pre-trained base sized model with our custom classification heads
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+```
+
+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. Be sure to have installed the [tensorflow_probability](https://github.com/tensorflow/probability) dependency:
+
+```py
+>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+>>> # for example, the base sized model with default SQA configuration
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base")
+
+>>> # or, the base sized model with WTQ configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> # or, the base sized model with WikiSQL configuration
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+```
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned.
You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TFTapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example: + +```py +>>> from transformers import TapasConfig, TFTapasForQuestionAnswering + +>>> # you can initialize the classification heads any way you want (see docs of TapasConfig) +>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) +>>> # initializing the pre-trained base sized model with our custom classification heads +>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) +``` + + + +What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info. + +For a list of all pre-trained and fine-tuned TAPAS checkpoints available on HuggingFace's hub, see [here](https://huggingface.co/models?search=tapas). + +**STEP 2: Prepare your data in the SQA format** + +Second, no matter what you picked above, you should prepare your dataset in the [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) format. This format is a TSV/CSV file with the following columns: + +- `id`: optional, id of the table-question pair, for bookkeeping purposes. +- `annotator`: optional, id of the person who annotated the table-question pair, for bookkeeping purposes. +- `position`: integer indicating if the question is the first, second, third,... related to the table. Only required in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised. +- `question`: string +- `table_file`: string, name of a csv file containing the tabular data +- `answer_coordinates`: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is part of the answer) +- `answer_text`: list of one or more strings (each string being a cell value that is part of the answer) +- `aggregation_label`: index of the aggregation operator. Only required in case of strong supervision for aggregation (the WikiSQL-supervised case) +- `float_answer`: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of weak supervision for aggregation (such as WTQ and WikiSQL) + +The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the SQA format. The author explains this [here](https://github.com/google-research/tapas/issues/50#issuecomment-705465960). A conversion of this script that works with HuggingFace's implementation can be found [here](https://github.com/NielsRogge/tapas_utils). Interestingly, these conversion scripts are not perfect (the `answer_coordinates` and `float_answer` fields are populated based on the `answer_text`), meaning that WTQ and WikiSQL results could actually be improved. 
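+
+As a rough illustration of this format, the snippet below writes a single weak-supervision (WTQ-style) training example to a TSV file. The file names (`train.tsv`, `table_csv/001.csv`) and values are made up for the example. Note that list-valued columns such as `answer_coordinates` and `answer_text` end up stored as string representations in the TSV, so you may need to parse them back (e.g. with `ast.literal_eval`) when reading the file.
+
+```py
+>>> import pandas as pd
+
+>>> # one table-question pair in the SQA format (weak supervision for aggregation)
+>>> rows = [
+...     {
+...         "id": "example-1",  # hypothetical id, for bookkeeping
+...         "annotator": 0,
+...         "position": 0,
+...         "question": "What is the total number of movies?",
+...         "table_file": "table_csv/001.csv",  # csv file containing the actual table
+...         "answer_coordinates": [(0, 1), (1, 1), (2, 1)],  # cells that make up the answer
+...         "answer_text": ["209"],
+...         "float_answer": 209.0,  # the question involves an aggregation (a sum)
+...     }
+... ]
+>>> pd.DataFrame(rows).to_csv("train.tsv", sep="\t", index=False)
+```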
+ +**STEP 3: Convert your data into tensors using TapasTokenizer** + + + +Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TapasForQuestionAnswering`] requires different +inputs to be fine-tuned: + +| **Task** | **Required inputs** | +|------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| Conversational | `input_ids`, `attention_mask`, `token_type_ids`, `labels` | +| Weak supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer` | +| Strong supervision for aggregation | `input ids`, `attention mask`, `token type ids`, `labels`, `aggregation_labels` | + +[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example: + +```py +>>> from transformers import TapasTokenizer +>>> import pandas as pd + +>>> model_name = "google/tapas-base" +>>> tokenizer = TapasTokenizer.from_pretrained(model_name) + +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> queries = [ +... "What is the name of the first actor?", +... "How many movies has George Clooney played in?", +... "What is the total number of movies?", +... ] +>>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] +>>> answer_text = [["Brad Pitt"], ["69"], ["209"]] +>>> table = pd.DataFrame.from_dict(data) +>>> inputs = tokenizer( +... table=table, +... queries=queries, +... answer_coordinates=answer_coordinates, +... answer_text=answer_text, +... padding="max_length", +... return_tensors="pt", +... ) +>>> inputs +{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), +'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} +``` + +Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data. +Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches: + +```py +>>> import torch +>>> import pandas as pd + +>>> tsv_path = "your_path_to_the_tsv_file" +>>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" + + +>>> class TableDataset(torch.utils.data.Dataset): +... def __init__(self, data, tokenizer): +... self.data = data +... self.tokenizer = tokenizer + +... def __getitem__(self, idx): +... item = data.iloc[idx] +... table = pd.read_csv(table_csv_path + item.table_file).astype( +... str +... ) # be sure to make your table data text only +... encoding = self.tokenizer( +... table=table, +... queries=item.question, +... answer_coordinates=item.answer_coordinates, +... answer_text=item.answer_text, +... truncation=True, +... padding="max_length", +... return_tensors="pt", +... ) +... # remove the batch dimension which the tokenizer adds by default +... encoding = {key: val.squeeze(0) for key, val in encoding.items()} +... 
+...         # add the float_answer which is also required (weak supervision for aggregation case)
+...         encoding["float_answer"] = torch.tensor(item.float_answer)
+...         return encoding
+
+...     def __len__(self):
+...         return len(self.data)
+
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
+>>> train_dataset = TableDataset(data, tokenizer)
+>>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
+```
+
+
+Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TFTapasForQuestionAnswering`] requires different
+inputs to be fine-tuned:
+
+| **Task**                           | **Required inputs**                                                                                                   |
+|------------------------------------|-----------------------------------------------------------------------------------------------------------------------|
+| Conversational                     | `input_ids`, `attention_mask`, `token_type_ids`, `labels`                                                             |
+| Weak supervision for aggregation   | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer`  |
+| Strong supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `aggregation_labels`                                      |
+
+[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example:
+
+```py
+>>> from transformers import TapasTokenizer
+>>> import pandas as pd
+
+>>> model_name = "google/tapas-base"
+>>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
+>>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
+>>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
+>>> table = pd.DataFrame.from_dict(data)
+>>> inputs = tokenizer(
+...     table=table,
+...     queries=queries,
+...     answer_coordinates=answer_coordinates,
+...     answer_text=answer_text,
+...     padding="max_length",
+...     return_tensors="tf",
+... )
+>>> inputs
+{'input_ids': <tf.Tensor ... >, 'attention_mask': <tf.Tensor ... >, 'token_type_ids': <tf.Tensor ... >,
+'numeric_values': <tf.Tensor ... >, 'numeric_values_scale': <tf.Tensor ... >, 'labels': <tf.Tensor ... >}
+```
+
+Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data.
+Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches:
+
+```py
+>>> import tensorflow as tf
+>>> import pandas as pd
+
+>>> tsv_path = "your_path_to_the_tsv_file"
+>>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
+
+
+>>> class TableDataset:
+...     def __init__(self, data, tokenizer):
+...         self.data = data
+...         self.tokenizer = tokenizer
+
+...     def __iter__(self):
+...         for idx in range(self.__len__()):
+...             item = self.data.iloc[idx]
+...             table = pd.read_csv(table_csv_path + item.table_file).astype(
+...                 str
+...             )  # be sure to make your table data text only
+...             encoding = self.tokenizer(
+...                 table=table,
+...                 queries=item.question,
+...                 answer_coordinates=item.answer_coordinates,
+...                 answer_text=item.answer_text,
+...                 truncation=True,
+...                 padding="max_length",
+...                 return_tensors="tf",
+...             )
+...             # remove the batch dimension which the tokenizer adds by default
+...             encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()}
+...             # add the float_answer which is also required (weak supervision for aggregation case)
+...             encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32)
+...             yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[
+...                 "numeric_values_scale"
+...             ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"]
+
+...     def __len__(self):
+...         return len(self.data)
+
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
+>>> train_dataset = TableDataset(data, tokenizer)
+>>> output_signature = (
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512, 7), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+... )
+>>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32)
+```
+
+
+
+Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group together the `queries`, `answer_coordinates` and `answer_text` per table (in the order of their `position`
+index) and batch encode each table with its questions. This will make sure that the `prev_labels` token types (see docs of [`TapasTokenizer`]) are set correctly. See [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info on the PyTorch model, and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info on the TensorFlow model.
+
+**STEP 4: Train (fine-tune) the model**
+
+
+
+You can then fine-tune [`TapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case):
+
+```py
+>>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW
+
+>>> # this is the default WTQ configuration
+>>> config = TapasConfig(
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
+... )
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> optimizer = AdamW(model.parameters(), lr=5e-5)
+
+>>> model.train()
+>>> for epoch in range(2):  # loop over the dataset multiple times
+...     for batch in train_dataloader:
+...         # get the inputs
+...         input_ids = batch["input_ids"]
+...         attention_mask = batch["attention_mask"]
+...         token_type_ids = batch["token_type_ids"]
+...         labels = batch["labels"]
+...         numeric_values = batch["numeric_values"]
+...         numeric_values_scale = batch["numeric_values_scale"]
+...         float_answer = batch["float_answer"]
+
+...         # zero the parameter gradients
+...         optimizer.zero_grad()
+
+...         # forward + backward + optimize
+...         outputs = model(
+...             input_ids=input_ids,
+...             attention_mask=attention_mask,
+...             token_type_ids=token_type_ids,
+...             labels=labels,
+...             numeric_values=numeric_values,
+...             numeric_values_scale=numeric_values_scale,
+...             float_answer=float_answer,
+...         )
+...         loss = outputs.loss
+...         loss.backward()
+...         optimizer.step()
+```
+
+
+You can then fine-tune [`TFTapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case):
+
+```py
+>>> import tensorflow as tf
+>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+>>> # this is the default WTQ configuration
+>>> config = TapasConfig(
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
+... )
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+>>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
+
+>>> for epoch in range(2):  # loop over the dataset multiple times
+...     for batch in train_dataloader:
+...         # get the inputs (in the order they are yielded by the dataset above)
+...         input_ids = batch[0]
+...         attention_mask = batch[1]
+...         numeric_values = batch[2]
+...         numeric_values_scale = batch[3]
+...         token_type_ids = batch[4]
+...         labels = batch[5]
+...         float_answer = batch[6]
+
+...         # forward + backward + optimize
+...         with tf.GradientTape() as tape:
+...             outputs = model(
+...                 input_ids=input_ids,
+...                 attention_mask=attention_mask,
+...                 token_type_ids=token_type_ids,
+...                 labels=labels,
+...                 numeric_values=numeric_values,
+...                 numeric_values_scale=numeric_values_scale,
+...                 float_answer=float_answer,
+...             )
+...         grads = tape.gradient(outputs.loss, model.trainable_weights)
+...         optimizer.apply_gradients(zip(grads, model.trainable_weights))
+```
+
+
+
+## Usage: inference
+
+
+
+Here we explain how you can use [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices.
+
+However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that:
+
+```py
+>>> from transformers import TapasTokenizer, TapasForQuestionAnswering
+>>> import pandas as pd
+
+>>> model_name = "google/tapas-base-finetuned-wtq"
+>>> model = TapasForQuestionAnswering.from_pretrained(model_name)
+>>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+...
] +>>> table = pd.DataFrame.from_dict(data) +>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt") +>>> outputs = model(**inputs) +>>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( +... inputs, outputs.logits.detach(), outputs.logits_aggregation.detach() +... ) + +>>> # let's print out the results: +>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} +>>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] + +>>> answers = [] +>>> for coordinates in predicted_answer_coordinates: +... if len(coordinates) == 1: +... # only a single cell: +... answers.append(table.iat[coordinates[0]]) +... else: +... # multiple cells +... cell_values = [] +... for coordinate in coordinates: +... cell_values.append(table.iat[coordinate]) +... answers.append(", ".join(cell_values)) + +>>> display(table) +>>> print("") +>>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): +... print(query) +... if predicted_agg == "NONE": +... print("Predicted answer: " + answer) +... else: +... print("Predicted answer: " + predicted_agg + " > " + answer) +What is the name of the first actor? +Predicted answer: Brad Pitt +How many movies has George Clooney played in? +Predicted answer: COUNT > 69 +What is the total number of movies? +Predicted answer: SUM > 87, 53, 69 +``` + + +Here we explain how you can use [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. + +However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: + +```py +>>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering +>>> import pandas as pd + +>>> model_name = "google/tapas-base-finetuned-wtq" +>>> model = TFTapasForQuestionAnswering.from_pretrained(model_name) +>>> tokenizer = TapasTokenizer.from_pretrained(model_name) + +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> queries = [ +... "What is the name of the first actor?", +... "How many movies has George Clooney played in?", +... "What is the total number of movies?", +... ] +>>> table = pd.DataFrame.from_dict(data) +>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") +>>> outputs = model(**inputs) +>>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( +... inputs, outputs.logits, outputs.logits_aggregation +... ) + +>>> # let's print out the results: +>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} +>>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] + +>>> answers = [] +>>> for coordinates in predicted_answer_coordinates: +... if len(coordinates) == 1: +... # only a single cell: +... answers.append(table.iat[coordinates[0]]) +... else: +... # multiple cells +... cell_values = [] +... 
for coordinate in coordinates: +... cell_values.append(table.iat[coordinate]) +... answers.append(", ".join(cell_values)) + +>>> display(table) +>>> print("") +>>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): +... print(query) +... if predicted_agg == "NONE": +... print("Predicted answer: " + answer) +... else: +... print("Predicted answer: " + predicted_agg + " > " + answer) +What is the name of the first actor? +Predicted answer: Brad Pitt +How many movies has George Clooney played in? +Predicted answer: COUNT > 69 +What is the total number of movies? +Predicted answer: SUM > 87, 53, 69 +``` + + + +In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the `prev_labels` token types can be overwritten by the predicted `labels` of the previous table-question pair. Again, more info can be found in [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for PyTorch) and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for TensorFlow). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Masked language modeling task guide](../tasks/masked_language_modeling) + +## TAPAS specific outputs +[[autodoc]] models.tapas.modeling_tapas.TableQuestionAnsweringOutput + +## TapasConfig +[[autodoc]] TapasConfig + +## TapasTokenizer +[[autodoc]] TapasTokenizer + - __call__ + - convert_logits_to_predictions + - save_vocabulary + +## TapasModel +[[autodoc]] TapasModel + - forward + +## TapasForMaskedLM +[[autodoc]] TapasForMaskedLM + - forward + +## TapasForSequenceClassification +[[autodoc]] TapasForSequenceClassification + - forward + +## TapasForQuestionAnswering +[[autodoc]] TapasForQuestionAnswering + - forward + +## TFTapasModel +[[autodoc]] TFTapasModel + - call + +## TFTapasForMaskedLM +[[autodoc]] TFTapasForMaskedLM + - call + +## TFTapasForSequenceClassification +[[autodoc]] TFTapasForSequenceClassification + - call + +## TFTapasForQuestionAnswering +[[autodoc]] TFTapasForQuestionAnswering + - call \ No newline at end of file diff --git a/docs/source/en/model_doc/tapas.mdx b/docs/source/en/model_doc/tapas.mdx deleted file mode 100644 index 5a2b54e8c32c..000000000000 --- a/docs/source/en/model_doc/tapas.mdx +++ /dev/null @@ -1,614 +0,0 @@ - - -# TAPAS - -## Overview - -The TAPAS model was proposed in [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://www.aclweb.org/anthology/2020.acl-main.398) -by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically -designed (and pre-trained) for answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 -token types that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset comprising -millions of tables from English Wikipedia and corresponding texts. - -For question answering, TAPAS has 2 heads on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or summing) among selected cells. 
TAPAS has been fine-tuned on several datasets: -- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) -- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) -- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce). - -It achieves state-of-the-art on both SQA and WTQ, while having comparable performance to SOTA on WikiSQL, with a much simpler architecture. - -The abstract from the paper is the following: - -*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition, the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.* - -In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced dataset of millions of automatically created training examples which are learned in an intermediate step prior to fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on [TabFact](https://github.com/wenhuchen/Table-Fact-Checking), a large-scale dataset with 16k Wikipedia tables for table entailment (a binary classification task). For more details, see their follow-up paper: [Understanding tables with intermediate pre-training](https://www.aclweb.org/anthology/2020.findings-emnlp.27/) by Julian Martin Eisenschlos, Syrine Krichene and Thomas Müller. - - - - TAPAS architecture. Taken from the original blog post. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tensorflow version of this model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/tapas). - -Tips: - -- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell of the table). Note that this is something that was added after the publication of the original TAPAS paper. 
According to the authors, this usually results in a slightly better performance, and allows you to encode longer sequences without running out of embeddings. This is reflected in the `reset_position_index_per_cell` parameter of [`TapasConfig`], which is set to `True` by default. The default versions of the models available on the [hub](https://huggingface.co/models?search=tapas) all use relative position embeddings. You can still use the ones with absolute position embeddings by passing in an additional argument `revision="no_reset"` when calling the `from_pretrained()` method. Note that it's usually advised to pad the inputs on the right rather than the left. -- TAPAS is based on BERT, so `TAPAS-base` for example corresponds to a `BERT-base` architecture. Of course, `TAPAS-large` will result in the best performance (the results reported in the paper are from `TAPAS-large`). Results of the various sized models are shown on the [original Github repository](https://github.com/google-research/tapas>). -- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that case, you have to feed every table-question pair one by one to the model, such that the `prev_labels` token type ids can be overwritten by the predicted `labels` of the model to the previous question. See "Usage" section for more info. -- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language modeling (CLM) objective are better in that regard. Note that TAPAS can be used as an encoder in the EncoderDecoderModel framework, to combine it with an autoregressive text decoder such as GPT-2. - -## Usage: fine-tuning - -Here we explain how you can fine-tune [`TapasForQuestionAnswering`] on your own dataset. - -**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment** - -Basically, there are 3 different ways in which one can fine-tune [`TapasForQuestionAnswering`], corresponding to the different datasets on which Tapas was fine-tuned: - -1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is he?". Here, questions do not involve any aggregation (all questions are cell selection questions). -2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions related to a table, which might involve aggregation, such as counting a number of rows, summing up cell values or averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his career?". This case is also called **weak supervision**, since the model itself must learn the appropriate aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision. -3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation operator is much easier. 
- -To summarize: - -| **Task** | **Example dataset** | **Description** | -|-------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------| -| Conversational | SQA | Conversational, only cell selection questions | -| Weak supervision for aggregation | WTQ | Questions might involve aggregation, and the model must learn this given only the answer as supervision | -| Strong supervision for aggregation | WikiSQL-supervised | Questions might involve aggregation, and the model must learn this given the gold aggregation operator | - - - -Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. - -```py ->>> from transformers import TapasConfig, TapasForQuestionAnswering - ->>> # for example, the base sized model with default SQA configuration ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base") - ->>> # or, the base sized model with WTQ configuration ->>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq") ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> # or, the base sized model with WikiSQL configuration ->>> config = TapasConfig("google-base-finetuned-wikisql-supervised") ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - -Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example: - -```py ->>> from transformers import TapasConfig, TapasForQuestionAnswering - ->>> # you can initialize the classification heads any way you want (see docs of TapasConfig) ->>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) ->>> # initializing the pre-trained base sized model with our custom classification heads ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - - -Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. Be sure to have installed the [tensorflow_probability](https://github.com/tensorflow/probability) dependency: - -```py ->>> from transformers import TapasConfig, TFTapasForQuestionAnswering - ->>> # for example, the base sized model with default SQA configuration ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base") - ->>> # or, the base sized model with WTQ configuration ->>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq") ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> # or, the base sized model with WikiSQL configuration ->>> config = TapasConfig("google-base-finetuned-wikisql-supervised") ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - -Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. 
You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TFTapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example: - -```py ->>> from transformers import TapasConfig, TFTapasForQuestionAnswering - ->>> # you can initialize the classification heads any way you want (see docs of TapasConfig) ->>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) ->>> # initializing the pre-trained base sized model with our custom classification heads ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) -``` - - - -What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info. - -For a list of all pre-trained and fine-tuned TAPAS checkpoints available on HuggingFace's hub, see [here](https://huggingface.co/models?search=tapas). - -**STEP 2: Prepare your data in the SQA format** - -Second, no matter what you picked above, you should prepare your dataset in the [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) format. This format is a TSV/CSV file with the following columns: - -- `id`: optional, id of the table-question pair, for bookkeeping purposes. -- `annotator`: optional, id of the person who annotated the table-question pair, for bookkeeping purposes. -- `position`: integer indicating if the question is the first, second, third,... related to the table. Only required in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised. -- `question`: string -- `table_file`: string, name of a csv file containing the tabular data -- `answer_coordinates`: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is part of the answer) -- `answer_text`: list of one or more strings (each string being a cell value that is part of the answer) -- `aggregation_label`: index of the aggregation operator. Only required in case of strong supervision for aggregation (the WikiSQL-supervised case) -- `float_answer`: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of weak supervision for aggregation (such as WTQ and WikiSQL) - -The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the SQA format. The author explains this [here](https://github.com/google-research/tapas/issues/50#issuecomment-705465960). A conversion of this script that works with HuggingFace's implementation can be found [here](https://github.com/NielsRogge/tapas_utils). Interestingly, these conversion scripts are not perfect (the `answer_coordinates` and `float_answer` fields are populated based on the `answer_text`), meaning that WTQ and WikiSQL results could actually be improved. 
- -**STEP 3: Convert your data into tensors using TapasTokenizer** - - - -Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TapasForQuestionAnswering`] requires different -inputs to be fine-tuned: - -| **Task** | **Required inputs** | -|------------------------------------|---------------------------------------------------------------------------------------------------------------------| -| Conversational | `input_ids`, `attention_mask`, `token_type_ids`, `labels` | -| Weak supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer` | -| Strong supervision for aggregation | `input ids`, `attention mask`, `token type ids`, `labels`, `aggregation_labels` | - -[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example: - -```py ->>> from transformers import TapasTokenizer ->>> import pandas as pd - ->>> model_name = "google/tapas-base" ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... ] ->>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] ->>> answer_text = [["Brad Pitt"], ["69"], ["209"]] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer( -... table=table, -... queries=queries, -... answer_coordinates=answer_coordinates, -... answer_text=answer_text, -... padding="max_length", -... return_tensors="pt", -... ) ->>> inputs -{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), -'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} -``` - -Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data. -Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches: - -```py ->>> import torch ->>> import pandas as pd - ->>> tsv_path = "your_path_to_the_tsv_file" ->>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" - - ->>> class TableDataset(torch.utils.data.Dataset): -... def __init__(self, data, tokenizer): -... self.data = data -... self.tokenizer = tokenizer - -... def __getitem__(self, idx): -... item = data.iloc[idx] -... table = pd.read_csv(table_csv_path + item.table_file).astype( -... str -... ) # be sure to make your table data text only -... encoding = self.tokenizer( -... table=table, -... queries=item.question, -... answer_coordinates=item.answer_coordinates, -... answer_text=item.answer_text, -... truncation=True, -... padding="max_length", -... return_tensors="pt", -... ) -... # remove the batch dimension which the tokenizer adds by default -... encoding = {key: val.squeeze(0) for key, val in encoding.items()} -... 
# add the float_answer which is also required (weak supervision for aggregation case) -... encoding["float_answer"] = torch.tensor(item.float_answer) -... return encoding - -... def __len__(self): -... return len(self.data) - - ->>> data = pd.read_csv(tsv_path, sep="\t") ->>> train_dataset = TableDataset(data, tokenizer) ->>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32) -``` - - -Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [`TapasTokenizer`] to convert table-question pairs into `input_ids`, `attention_mask`, `token_type_ids` and so on. Again, based on which of the three cases you picked above, [`TFTapasForQuestionAnswering`] requires different -inputs to be fine-tuned: - -| **Task** | **Required inputs** | -|------------------------------------|---------------------------------------------------------------------------------------------------------------------| -| Conversational | `input_ids`, `attention_mask`, `token_type_ids`, `labels` | -| Weak supervision for aggregation | `input_ids`, `attention_mask`, `token_type_ids`, `labels`, `numeric_values`, `numeric_values_scale`, `float_answer` | -| Strong supervision for aggregation | `input ids`, `attention mask`, `token type ids`, `labels`, `aggregation_labels` | - -[`TapasTokenizer`] creates the `labels`, `numeric_values` and `numeric_values_scale` based on the `answer_coordinates` and `answer_text` columns of the TSV file. The `float_answer` and `aggregation_labels` are already in the TSV file of step 2. Here's an example: - -```py ->>> from transformers import TapasTokenizer ->>> import pandas as pd - ->>> model_name = "google/tapas-base" ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... ] ->>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] ->>> answer_text = [["Brad Pitt"], ["69"], ["209"]] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer( -... table=table, -... queries=queries, -... answer_coordinates=answer_coordinates, -... answer_text=answer_text, -... padding="max_length", -... return_tensors="tf", -... ) ->>> inputs -{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), -'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} -``` - -Note that [`TapasTokenizer`] expects the data of the table to be **text-only**. You can use `.astype(str)` on a dataframe to turn it into text-only data. -Of course, this only shows how to encode a single training example. It is advised to create a dataloader to iterate over batches: - -```py ->>> import tensorflow as tf ->>> import pandas as pd - ->>> tsv_path = "your_path_to_the_tsv_file" ->>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" - - ->>> class TableDataset: -... def __init__(self, data, tokenizer): -... self.data = data -... self.tokenizer = tokenizer - -... def __iter__(self): -... for idx in range(self.__len__()): -... item = self.data.iloc[idx] -... table = pd.read_csv(table_csv_path + item.table_file).astype( -... str -... ) # be sure to make your table data text only -... encoding = self.tokenizer( -... 
table=table, -... queries=item.question, -... answer_coordinates=item.answer_coordinates, -... answer_text=item.answer_text, -... truncation=True, -... padding="max_length", -... return_tensors="tf", -... ) -... # remove the batch dimension which the tokenizer adds by default -... encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()} -... # add the float_answer which is also required (weak supervision for aggregation case) -... encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32) -... yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[ -... "numeric_values_scale" -... ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"] - -... def __len__(self): -... return len(self.data) - - ->>> data = pd.read_csv(tsv_path, sep="\t") ->>> train_dataset = TableDataset(data, tokenizer) ->>> output_signature = ( -... tf.TensorSpec(shape=(512,), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.float32), -... tf.TensorSpec(shape=(512,), dtype=tf.float32), -... tf.TensorSpec(shape=(512, 7), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.int32), -... tf.TensorSpec(shape=(512,), dtype=tf.float32), -... ) ->>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32) -``` - - - -Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group together the `queries`, `answer_coordinates` and `answer_text` per table (in the order of their `position` -index) and batch encode each table with its questions. This will make sure that the `prev_labels` token types (see docs of [`TapasTokenizer`]) are set correctly. See [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info. See [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) for more info regarding using the TensorFlow model. - -**STEP 4: Train (fine-tune) the model - - - -You can then fine-tune [`TapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case): - -```py ->>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW - ->>> # this is the default WTQ configuration ->>> config = TapasConfig( -... num_aggregation_labels=4, -... use_answer_as_supervision=True, -... answer_loss_cutoff=0.664694, -... cell_selection_preference=0.207951, -... huber_loss_delta=0.121194, -... init_cell_selection_weights_to_zero=True, -... select_one_column=True, -... allow_empty_column_selection=False, -... temperature=0.0352513, -... ) ->>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) - ->>> model.train() ->>> for epoch in range(2): # loop over the dataset multiple times -... for batch in train_dataloader: -... # get the inputs; -... input_ids = batch["input_ids"] -... attention_mask = batch["attention_mask"] -... token_type_ids = batch["token_type_ids"] -... labels = batch["labels"] -... numeric_values = batch["numeric_values"] -... numeric_values_scale = batch["numeric_values_scale"] -... float_answer = batch["float_answer"] - -... # zero the parameter gradients -... 
optimizer.zero_grad() - -... # forward + backward + optimize -... outputs = model( -... input_ids=input_ids, -... attention_mask=attention_mask, -... token_type_ids=token_type_ids, -... labels=labels, -... numeric_values=numeric_values, -... numeric_values_scale=numeric_values_scale, -... float_answer=float_answer, -... ) -... loss = outputs.loss -... loss.backward() -... optimizer.step() -``` - - -You can then fine-tune [`TFTapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case): - -```py ->>> import tensorflow as tf ->>> from transformers import TapasConfig, TFTapasForQuestionAnswering - ->>> # this is the default WTQ configuration ->>> config = TapasConfig( -... num_aggregation_labels=4, -... use_answer_as_supervision=True, -... answer_loss_cutoff=0.664694, -... cell_selection_preference=0.207951, -... huber_loss_delta=0.121194, -... init_cell_selection_weights_to_zero=True, -... select_one_column=True, -... allow_empty_column_selection=False, -... temperature=0.0352513, -... ) ->>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) - ->>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5) - ->>> for epoch in range(2): # loop over the dataset multiple times -... for batch in train_dataloader: -... # get the inputs; -... input_ids = batch[0] -... attention_mask = batch[1] -... token_type_ids = batch[4] -... labels = batch[-1] -... numeric_values = batch[2] -... numeric_values_scale = batch[3] -... float_answer = batch[6] - -... # forward + backward + optimize -... with tf.GradientTape() as tape: -... outputs = model( -... input_ids=input_ids, -... attention_mask=attention_mask, -... token_type_ids=token_type_ids, -... labels=labels, -... numeric_values=numeric_values, -... numeric_values_scale=numeric_values_scale, -... float_answer=float_answer, -... ) -... grads = tape.gradient(outputs.loss, model.trainable_weights) -... optimizer.apply_gradients(zip(grads, model.trainable_weights)) -``` - - - -## Usage: inference - - - -Here we explain how you can use [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. - -However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: - -```py ->>> from transformers import TapasTokenizer, TapasForQuestionAnswering ->>> import pandas as pd - ->>> model_name = "google/tapas-base-finetuned-wtq" ->>> model = TapasForQuestionAnswering.from_pretrained(model_name) ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... 
] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt") ->>> outputs = model(**inputs) ->>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( -... inputs, outputs.logits.detach(), outputs.logits_aggregation.detach() -... ) - ->>> # let's print out the results: ->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} ->>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] - ->>> answers = [] ->>> for coordinates in predicted_answer_coordinates: -... if len(coordinates) == 1: -... # only a single cell: -... answers.append(table.iat[coordinates[0]]) -... else: -... # multiple cells -... cell_values = [] -... for coordinate in coordinates: -... cell_values.append(table.iat[coordinate]) -... answers.append(", ".join(cell_values)) - ->>> display(table) ->>> print("") ->>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): -... print(query) -... if predicted_agg == "NONE": -... print("Predicted answer: " + answer) -... else: -... print("Predicted answer: " + predicted_agg + " > " + answer) -What is the name of the first actor? -Predicted answer: Brad Pitt -How many movies has George Clooney played in? -Predicted answer: COUNT > 69 -What is the total number of movies? -Predicted answer: SUM > 87, 53, 69 -``` - - -Here we explain how you can use [`TFTapasForQuestionAnswering`] for inference (i.e. making predictions on new data). For inference, only `input_ids`, `attention_mask` and `token_type_ids` (which you can obtain using [`TapasTokenizer`]) have to be provided to the model to obtain the logits. Next, you can use the handy [`~models.tapas.tokenization_tapas.convert_logits_to_predictions`] method to convert these into predicted coordinates and optional aggregation indices. - -However, note that inference is **different** depending on whether or not the setup is conversational. In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example of that: - -```py ->>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering ->>> import pandas as pd - ->>> model_name = "google/tapas-base-finetuned-wtq" ->>> model = TFTapasForQuestionAnswering.from_pretrained(model_name) ->>> tokenizer = TapasTokenizer.from_pretrained(model_name) - ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> queries = [ -... "What is the name of the first actor?", -... "How many movies has George Clooney played in?", -... "What is the total number of movies?", -... ] ->>> table = pd.DataFrame.from_dict(data) ->>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") ->>> outputs = model(**inputs) ->>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( -... inputs, outputs.logits, outputs.logits_aggregation -... ) - ->>> # let's print out the results: ->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} ->>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] - ->>> answers = [] ->>> for coordinates in predicted_answer_coordinates: -... if len(coordinates) == 1: -... # only a single cell: -... answers.append(table.iat[coordinates[0]]) -... else: -... # multiple cells -... cell_values = [] -... 
for coordinate in coordinates: -... cell_values.append(table.iat[coordinate]) -... answers.append(", ".join(cell_values)) - ->>> display(table) ->>> print("") ->>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): -... print(query) -... if predicted_agg == "NONE": -... print("Predicted answer: " + answer) -... else: -... print("Predicted answer: " + predicted_agg + " > " + answer) -What is the name of the first actor? -Predicted answer: Brad Pitt -How many movies has George Clooney played in? -Predicted answer: COUNT > 69 -What is the total number of movies? -Predicted answer: SUM > 87, 53, 69 -``` - - - -In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the `prev_labels` token types can be overwritten by the predicted `labels` of the previous table-question pair. Again, more info can be found in [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for PyTorch) and [this notebook](https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) (for TensorFlow). - -## TAPAS specific outputs -[[autodoc]] models.tapas.modeling_tapas.TableQuestionAnsweringOutput - -## TapasConfig -[[autodoc]] TapasConfig - -## TapasTokenizer -[[autodoc]] TapasTokenizer - - __call__ - - convert_logits_to_predictions - - save_vocabulary - -## TapasModel -[[autodoc]] TapasModel - - forward - -## TapasForMaskedLM -[[autodoc]] TapasForMaskedLM - - forward - -## TapasForSequenceClassification -[[autodoc]] TapasForSequenceClassification - - forward - -## TapasForQuestionAnswering -[[autodoc]] TapasForQuestionAnswering - - forward - -## TFTapasModel -[[autodoc]] TFTapasModel - - call - -## TFTapasForMaskedLM -[[autodoc]] TFTapasForMaskedLM - - call - -## TFTapasForSequenceClassification -[[autodoc]] TFTapasForSequenceClassification - - call - -## TFTapasForQuestionAnswering -[[autodoc]] TFTapasForQuestionAnswering - - call \ No newline at end of file diff --git a/docs/source/en/model_doc/tapex.md b/docs/source/en/model_doc/tapex.md new file mode 100644 index 000000000000..52234b5c59bc --- /dev/null +++ b/docs/source/en/model_doc/tapex.md @@ -0,0 +1,143 @@ + + +# TAPEX + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. + + + +## Overview + +The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, +Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after +which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking. + +TAPEX has been fine-tuned on several datasets: +- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) +- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) +- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce) +- [TabFact](https://tabfact.github.io/) (by USCB NLP Lab). 
+
+The abstract from the paper is the following:
+
+*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is
+still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we
+propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically
+synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL
+executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that
+TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements
+on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy
+to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs
+and to achieve new state-of-the-art results on various downstream tasks.*
+
+Tips:
+
+- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model.
+- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact.
+- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format:
+  `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`.
+- TAPEX has its own tokenizer, which allows you to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer,
+  and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below).
+
+## Usage: inference
+
+Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model.
+We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us,
+based on the configuration file of the checkpoint on the hub.
+
+```python
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+>>> import pandas as pd
+
+>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")
+
+>>> # prepare table + question
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> table = pd.DataFrame.from_dict(data)
+>>> question = "how many movies does Leonardo Di Caprio have?"
+
+>>> encoding = tokenizer(table, question, return_tensors="pt")
+
+>>> # let the model generate an answer autoregressively
+>>> outputs = model.generate(**encoding)
+
+>>> # decode back to text
+>>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+>>> print(predicted_answer)
+53
+```
+
+Note that [`TapexTokenizer`] also supports batched inference.
Hence, one can provide a batch of different tables/questions, or a batch of a single table +and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this: + +```python +>>> # prepare table + question +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> questions = [ +... "how many movies does Leonardo Di Caprio have?", +... "which actor has 69 movies?", +... "what's the first name of the actor who has 87 movies?", +... ] +>>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt") + +>>> # let the model generate an answer autoregressively +>>> outputs = model.generate(**encoding) + +>>> # decode back to text +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +[' 53', ' george clooney', ' brad pitt'] +``` + +In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents +of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important +benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto). + +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact") +>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact") + +>>> # prepare table + sentence +>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +>>> table = pd.DataFrame.from_dict(data) +>>> sentence = "George Clooney has 30 movies" + +>>> encoding = tokenizer(table, sentence, return_tensors="pt") + +>>> # forward pass +>>> outputs = model(**encoding) + +>>> # print prediction +>>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item() +>>> print(model.config.id2label[predicted_class_idx]) +Refused +``` + + +## TapexTokenizer + +[[autodoc]] TapexTokenizer + - __call__ + - save_vocabulary \ No newline at end of file diff --git a/docs/source/en/model_doc/tapex.mdx b/docs/source/en/model_doc/tapex.mdx deleted file mode 100644 index f6e65764e50d..000000000000 --- a/docs/source/en/model_doc/tapex.mdx +++ /dev/null @@ -1,130 +0,0 @@ - - -# TAPEX - -## Overview - -The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, -Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after -which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking. - -TAPEX has been fine-tuned on several datasets: -- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft) -- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University) -- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce) -- [TabFact](https://tabfact.github.io/) (by USCB NLP Lab). - -The abstract from the paper is the following: - -*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. 
However, it is -still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we -propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically -synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL -executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that -TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements -on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy -to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs -and to achieve new state-of-the-art results on various downstream tasks.* - -Tips: - -- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model. -- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact. -- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format: - `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`. -- TAPEX has its own tokenizer, that allows to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer, - and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below). - -## Usage: inference - -Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model. -We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us, -based on the configuration file of the checkpoint on the hub. - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM ->>> import pandas as pd - ->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq") - ->>> # prepare table + question ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> table = pd.DataFrame.from_dict(data) ->>> question = "how many movies does Leonardo Di Caprio have?" - ->>> encoding = tokenizer(table, question, return_tensors="pt") - ->>> # let the model generate an answer autoregressively ->>> outputs = model.generate(**encoding) - ->>> # decode back to text ->>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] ->>> print(predicted_answer) -53 -``` - -Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table -and multiple questions, or a batch of a single query and multiple tables. 
Let's illustrate this: - -```python ->>> # prepare table + question ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> table = pd.DataFrame.from_dict(data) ->>> questions = [ -... "how many movies does Leonardo Di Caprio have?", -... "which actor has 69 movies?", -... "what's the first name of the actor who has 87 movies?", -... ] ->>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt") - ->>> # let the model generate an answer autoregressively ->>> outputs = model.generate(**encoding) - ->>> # decode back to text ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -[' 53', ' george clooney', ' brad pitt'] -``` - -In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents -of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important -benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto). - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact") ->>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact") - ->>> # prepare table + sentence ->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} ->>> table = pd.DataFrame.from_dict(data) ->>> sentence = "George Clooney has 30 movies" - ->>> encoding = tokenizer(table, sentence, return_tensors="pt") - ->>> # forward pass ->>> outputs = model(**encoding) - ->>> # print prediction ->>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item() ->>> print(model.config.id2label[predicted_class_idx]) -Refused -``` - - -## TapexTokenizer - -[[autodoc]] TapexTokenizer - - __call__ - - save_vocabulary \ No newline at end of file diff --git a/docs/source/en/model_doc/time_series_transformer.md b/docs/source/en/model_doc/time_series_transformer.md new file mode 100644 index 000000000000..208798aa1c68 --- /dev/null +++ b/docs/source/en/model_doc/time_series_transformer.md @@ -0,0 +1,83 @@ + + +# Time Series Transformer + + + +This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight +breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). + + + +## Overview + +The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. + +Tips: + +- Similar to other models in the library, [`TimeSeriesTransformerModel`] is the raw Transformer without any head on top, and [`TimeSeriesTransformerForPrediction`] +adds a distribution head on top of the former, which can be used for time-series forecasting. Note that this is a so-called probabilistic forecasting model, not a +point forecasting model. This means that the model learns a distribution, from which one can sample. The model doesn't directly output values. 
+- [`TimeSeriesTransformerForPrediction`] consists of 2 blocks: an encoder, which takes a `context_length` of time series values as input (called `past_values`), +and a decoder, which predicts a `prediction_length` of time series values into the future (called `future_values`). During training, one needs to provide +pairs of (`past_values` and `future_values`) to the model. +- In addition to the raw (`past_values` and `future_values`), one typically provides additional features to the model. These can be the following: + - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. + Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). + e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). + - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. + Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). + e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). + - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + An example here is the store ID or region ID that identifies a given time-series. + Note that these features need to be known for ALL data points (also those in the future). + - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). + An example here is the image representation of the product for which you have the time-series values (like the [ResNet](resnet) embedding of a "shoe" picture, + if your time-series is about the sales of shoes). + Note that these features need to be known for ALL data points (also those in the future). +- The model is trained using "teacher-forcing", similar to how a Transformer is trained for machine translation. This means that, during training, one shifts the +`future_values` one position to the right as input to the decoder, prepended by the last value of `past_values`. At each time step, the model needs to predict the +next target. So the set-up of training is similar to a GPT model for language, except that there's no notion of `decoder_start_token_id` (we just use the last value +of the context as initial input for the decoder). +- At inference time, we give the final value of the `past_values` as input to the decoder. Next, we can sample from the model to make a prediction at the next time step, +which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). + + +This model was contributed by [kashif](https://huggingface.co/kashif). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
+ +- Check out the Time Series Transformer blog-post in HuggingFace blog: [Probabilistic Time Series Forecasting with 🤗 Transformers](https://huggingface.co/blog/time-series-transformers) + + +## TimeSeriesTransformerConfig + +[[autodoc]] TimeSeriesTransformerConfig + + +## TimeSeriesTransformerModel + +[[autodoc]] TimeSeriesTransformerModel + - forward + + +## TimeSeriesTransformerForPrediction + +[[autodoc]] TimeSeriesTransformerForPrediction + - forward diff --git a/docs/source/en/model_doc/time_series_transformer.mdx b/docs/source/en/model_doc/time_series_transformer.mdx deleted file mode 100644 index 5dedef02eaa8..000000000000 --- a/docs/source/en/model_doc/time_series_transformer.mdx +++ /dev/null @@ -1,73 +0,0 @@ - - -# Time Series Transformer - - - -This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight -breaking changes to fix it in the future. If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title). - - - -## Overview - -The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. - -Tips: - -- Similar to other models in the library, [`TimeSeriesTransformerModel`] is the raw Transformer without any head on top, and [`TimeSeriesTransformerForPrediction`] -adds a distribution head on top of the former, which can be used for time-series forecasting. Note that this is a so-called probabilistic forecasting model, not a -point forecasting model. This means that the model learns a distribution, from which one can sample. The model doesn't directly output values. -- [`TimeSeriesTransformerForPrediction`] consists of 2 blocks: an encoder, which takes a `context_length` of time series values as input (called `past_values`), -and a decoder, which predicts a `prediction_length` of time series values into the future (called `future_values`). During training, one needs to provide -pairs of (`past_values` and `future_values`) to the model. -- In addition to the raw (`past_values` and `future_values`), one typically provides additional features to the model. These can be the following: - - `past_time_features`: temporal features which the model will add to `past_values`. These serve as "positional encodings" for the Transformer encoder. - Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). - e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). - - `future_time_features`: temporal features which the model will add to `future_values`. These serve as "positional encodings" for the Transformer decoder. - Examples are "day of the month", "month of the year", etc. as scalar values (and then stacked together as a vector). - e.g. if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). - - `static_categorical_features`: categorical features which are static over time (i.e., have the same value for all `past_values` and `future_values`). - An example here is the store ID or region ID that identifies a given time-series. - Note that these features need to be known for ALL data points (also those in the future). 
- - `static_real_features`: real-valued features which are static over time (i.e., have the same value for all `past_values` and `future_values`). - An example here is the image representation of the product for which you have the time-series values (like the [ResNet](resnet) embedding of a "shoe" picture, - if your time-series is about the sales of shoes). - Note that these features need to be known for ALL data points (also those in the future). -- The model is trained using "teacher-forcing", similar to how a Transformer is trained for machine translation. This means that, during training, one shifts the -`future_values` one position to the right as input to the decoder, prepended by the last value of `past_values`. At each time step, the model needs to predict the -next target. So the set-up of training is similar to a GPT model for language, except that there's no notion of `decoder_start_token_id` (we just use the last value -of the context as initial input for the decoder). -- At inference time, we give the final value of the `past_values` as input to the decoder. Next, we can sample from the model to make a prediction at the next time step, -which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). - - -This model was contributed by [kashif]( + +# TimeSformer + +## Overview + +The TimeSformer model was proposed in [TimeSformer: Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Facebook Research. +This work is a milestone in action-recognition field being the first video transformer. It inspired many transformer based video understanding and classification papers. + +The abstract from the paper is the following: + +*We present a convolution-free approach to video classification built exclusively on self-attention over space and time. Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). Code and models are available at: [this https URL](https://github.com/facebookresearch/TimeSformer).* + +Tips: + +There are many pretrained variants. Select your pretrained model based on the dataset it is trained on. Moreover, the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. + +This model was contributed by [fcakyon](https://huggingface.co/fcakyon). +The original code can be found [here](https://github.com/facebookresearch/TimeSformer). 
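Below is a minimal sketch of the video classification workflow, assuming the `facebook/timesformer-base-finetuned-k400` checkpoint (taken here to expect 8 sampled frames per clip); the random frames only stand in for a real decoded video:

```python
import numpy as np
import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

# dummy clip: 8 frames of shape (channels, height, width); replace with frames decoded from a real video
video = list(np.random.randn(8, 3, 224, 224))

# checkpoint name is an assumption for illustration; pick the variant matching your dataset and frame count
processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# this checkpoint was fine-tuned on Kinetics-400, so the labels are Kinetics-400 classes
print(model.config.id2label[logits.argmax(-1).item()])
```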
+ +## Documentation resources + +- [Video classification task guide](../tasks/video_classification) + +## TimesformerConfig + +[[autodoc]] TimesformerConfig + +## TimesformerModel + +[[autodoc]] TimesformerModel + - forward + +## TimesformerForVideoClassification + +[[autodoc]] TimesformerForVideoClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/timesformer.mdx b/docs/source/en/model_doc/timesformer.mdx deleted file mode 100644 index 602ec4f4f2a7..000000000000 --- a/docs/source/en/model_doc/timesformer.mdx +++ /dev/null @@ -1,44 +0,0 @@ - - -# TimeSformer - -## Overview - -The TimeSformer model was proposed in [TimeSformer: Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Facebook Research. -This work is a milestone in action-recognition field being the first video transformer. It inspired many transformer based video understanding and classification papers. - -The abstract from the paper is the following: - -*We present a convolution-free approach to video classification built exclusively on self-attention over space and time. Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). Code and models are available at: [this https URL](https://github.com/facebookresearch/TimeSformer).* - -Tips: - -There are many pretrained variants. Select your pretrained model based on the dataset it is trained on. Moreover, the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. - -This model was contributed by [fcakyon](https://huggingface.co/fcakyon). -The original code can be found [here](https://github.com/facebookresearch/TimeSformer). - - -## TimesformerConfig - -[[autodoc]] TimesformerConfig - -## TimesformerModel - -[[autodoc]] TimesformerModel - - forward - -## TimesformerForVideoClassification - -[[autodoc]] TimesformerForVideoClassification - - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/trajectory_transformer.md b/docs/source/en/model_doc/trajectory_transformer.md new file mode 100644 index 000000000000..548642f7bb9f --- /dev/null +++ b/docs/source/en/model_doc/trajectory_transformer.md @@ -0,0 +1,62 @@ + + +# Trajectory Transformer + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. 
+ + + +## Overview + +The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. + +The abstract from the paper is the following: + +*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models, +leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence +modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards. +Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well +in other domains, such as natural-language processing, can also provide effective solutions to the RL problem. +To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture +to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence +modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common +in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction, +imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with +existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.* + +Tips: + +This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from +actions, states and rewards from all previous timesteps. This model will treat all these elements together +as one big sequence (a trajectory). + +This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). + +## TrajectoryTransformerConfig + +[[autodoc]] TrajectoryTransformerConfig + + +## TrajectoryTransformerModel + +[[autodoc]] TrajectoryTransformerModel + - forward diff --git a/docs/source/en/model_doc/trajectory_transformer.mdx b/docs/source/en/model_doc/trajectory_transformer.mdx deleted file mode 100644 index da7a55a50eca..000000000000 --- a/docs/source/en/model_doc/trajectory_transformer.mdx +++ /dev/null @@ -1,49 +0,0 @@ - - -# Trajectory Transformer - -## Overview - -The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. - -The abstract from the paper is the following: - -*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models, -leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence -modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards. -Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well -in other domains, such as natural-language processing, can also provide effective solutions to the RL problem. -To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture -to model distributions over trajectories and repurposing beam search as a planning algorithm. 
Framing RL as sequence -modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common -in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction, -imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with -existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.* - -Tips: - -This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from -actions, states and rewards from all previous timesteps. This model will treat all these elements together -as one big sequence (a trajectory). - -This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). - -## TrajectoryTransformerConfig - -[[autodoc]] TrajectoryTransformerConfig - - -## TrajectoryTransformerModel - -[[autodoc]] TrajectoryTransformerModel - - forward diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md new file mode 100644 index 000000000000..beb5ba2fea83 --- /dev/null +++ b/docs/source/en/model_doc/transfo-xl.md @@ -0,0 +1,123 @@ + + +# Transformer XL + +
+ +## Overview + +The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan +Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can +reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax +inputs and outputs (tied). + +The abstract from the paper is the following: + +*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the +setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency +beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a +novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the +context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% +longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ +times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of +bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn +Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably +coherent, novel text articles with thousands of tokens.* + +Tips: + +- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The + original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. +- Transformer-XL is one of the few models that has no sequence length limit. +- Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that may span across multiple documents, and segments are fed in order to the model. +- Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. +- This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would give the same results in the current input and the current hidden state at a given position) and needs to make some adjustments in the way attention scores are computed. + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl). 
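As a rough sketch of the recurrence mechanism described in the tips above, the example below (assuming the `transfo-xl-wt103` checkpoint) feeds two segments in order and reuses the `mems` returned for the first segment when processing the second:

```python
import torch
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

# checkpoint name is an assumption for illustration
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")

segment_1 = tokenizer("The recurrence mechanism lets the model", return_tensors="pt")
segment_2 = tokenizer("attend to hidden states of previous segments .", return_tensors="pt")

with torch.no_grad():
    # first segment: no memory yet
    outputs_1 = model(segment_1["input_ids"])
    # second segment: reuse the cached hidden states ("mems") of the first segment
    outputs_2 = model(segment_2["input_ids"], mems=outputs_1.mems)

# one cached hidden-state tensor per layer
print(len(outputs_2.mems))
```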
+ + + +TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) + + + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Causal language modeling task guide](../tasks/language_modeling) + +## TransfoXLConfig + +[[autodoc]] TransfoXLConfig + +## TransfoXLTokenizer + +[[autodoc]] TransfoXLTokenizer + - save_vocabulary + +## TransfoXL specific outputs + +[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput + +[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput + +[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput + +[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput + +## TransfoXLModel + +[[autodoc]] TransfoXLModel + - forward + +## TransfoXLLMHeadModel + +[[autodoc]] TransfoXLLMHeadModel + - forward + +## TransfoXLForSequenceClassification + +[[autodoc]] TransfoXLForSequenceClassification + - forward + +## TFTransfoXLModel + +[[autodoc]] TFTransfoXLModel + - call + +## TFTransfoXLLMHeadModel + +[[autodoc]] TFTransfoXLLMHeadModel + - call + +## TFTransfoXLForSequenceClassification + +[[autodoc]] TFTransfoXLForSequenceClassification + - call + +## Internal Layers + +[[autodoc]] AdaptiveEmbedding + +[[autodoc]] TFAdaptiveEmbedding diff --git a/docs/source/en/model_doc/transfo-xl.mdx b/docs/source/en/model_doc/transfo-xl.mdx deleted file mode 100644 index 7e2a7701426c..000000000000 --- a/docs/source/en/model_doc/transfo-xl.mdx +++ /dev/null @@ -1,103 +0,0 @@ - - -# Transformer XL - -## Overview - -The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan -Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can -reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax -inputs and outputs (tied). - -The abstract from the paper is the following: - -*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the -setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency -beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a -novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the -context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% -longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ -times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of -bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn -Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably -coherent, novel text articles with thousands of tokens.* - -Tips: - -- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. 
The - original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. -- Transformer-XL is one of the few models that has no sequence length limit. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl). - - - -TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035) - - - - -## TransfoXLConfig - -[[autodoc]] TransfoXLConfig - -## TransfoXLTokenizer - -[[autodoc]] TransfoXLTokenizer - - save_vocabulary - -## TransfoXL specific outputs - -[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput - -[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput - -[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput - -[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput - -## TransfoXLModel - -[[autodoc]] TransfoXLModel - - forward - -## TransfoXLLMHeadModel - -[[autodoc]] TransfoXLLMHeadModel - - forward - -## TransfoXLForSequenceClassification - -[[autodoc]] TransfoXLForSequenceClassification - - forward - -## TFTransfoXLModel - -[[autodoc]] TFTransfoXLModel - - call - -## TFTransfoXLLMHeadModel - -[[autodoc]] TFTransfoXLLMHeadModel - - call - -## TFTransfoXLForSequenceClassification - -[[autodoc]] TFTransfoXLForSequenceClassification - - call - -## Internal Layers - -[[autodoc]] AdaptiveEmbedding - -[[autodoc]] TFAdaptiveEmbedding diff --git a/docs/source/en/model_doc/trocr.md b/docs/source/en/model_doc/trocr.md new file mode 100644 index 000000000000..bfab93ad663b --- /dev/null +++ b/docs/source/en/model_doc/trocr.md @@ -0,0 +1,126 @@ + + +# TrOCR + +## Overview + +The TrOCR model was proposed in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained +Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, +Zhoujun Li, Furu Wei. TrOCR consists of an image Transformer encoder and an autoregressive text Transformer decoder to +perform [optical character recognition (OCR)](https://en.wikipedia.org/wiki/Optical_character_recognition). + +The abstract from the paper is the following: + +*Text recognition is a long-standing research problem for document digitalization. Existing approaches for text recognition +are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language +model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end +text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the +Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but +effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments +show that the TrOCR model outperforms the current state-of-the-art models on both printed and handwritten text recognition +tasks.* + + + + TrOCR architecture. Taken from the original paper. + +Please refer to the [`VisionEncoderDecoder`] class on how to use this model. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found +[here](https://github.com/microsoft/unilm/tree/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr). 
+ +Tips: + +- The quickest way to get started with TrOCR is by checking the [tutorial + notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR), which show how to use the model + at inference time as well as fine-tuning on custom data. +- TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. It achieves state-of-the-art results + on both printed (e.g. the [SROIE dataset](https://paperswithcode.com/dataset/sroie) and handwritten (e.g. the [IAM + Handwriting dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>) text recognition tasks. For more + information, see the [official models](https://huggingface.co/models?other=trocr>). +- TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + +- A blog post on [Accelerating Document AI](https://huggingface.co/blog/document-ai) with TrOCR. +- A blog post on how to [Document AI](https://github.com/philschmid/document-ai-transformers) with TrOCR. +- A notebook on how to [finetune TrOCR on IAM Handwriting Database using Seq2SeqTrainer](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb). +- A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and Gradio demo. +- A notebook on [finetune TrOCR on the IAM Handwriting Database](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb) using native PyTorch. +- A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb). + + + +- [Casual language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) task guide. + +⚡️ Inference + +- An interactive-demo on [TrOCR handwritten character recognition](https://huggingface.co/spaces/nielsr/TrOCR-handwritten). + +## Inference + +TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of +[`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image. + +The [`ViTImageProcessor`/`DeiTImageProcessor`] class is responsible for preprocessing the input image and +[`RobertaTokenizer`/`XLMRobertaTokenizer`] decodes the generated target tokens to the target string. The +[`TrOCRProcessor`] wraps [`ViTImageProcessor`/`DeiTImageProcessor`] and [`RobertaTokenizer`/`XLMRobertaTokenizer`] +into a single instance to both extract the input features and decode the predicted token ids. 
+ +- Step-by-step Optical Character Recognition (OCR) + +``` py +>>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel +>>> import requests +>>> from PIL import Image + +>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") +>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + +>>> # load image from the IAM dataset +>>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + +>>> pixel_values = processor(image, return_tensors="pt").pixel_values +>>> generated_ids = model.generate(pixel_values) + +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOCR checkpoints. + +## TrOCRConfig + +[[autodoc]] TrOCRConfig + +## TrOCRProcessor + +[[autodoc]] TrOCRProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## TrOCRForCausalLM + +[[autodoc]] TrOCRForCausalLM + - forward diff --git a/docs/source/en/model_doc/trocr.mdx b/docs/source/en/model_doc/trocr.mdx deleted file mode 100644 index 3e3a6c100753..000000000000 --- a/docs/source/en/model_doc/trocr.mdx +++ /dev/null @@ -1,101 +0,0 @@ - - -# TrOCR - -## Overview - -The TrOCR model was proposed in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained -Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, -Zhoujun Li, Furu Wei. TrOCR consists of an image Transformer encoder and an autoregressive text Transformer decoder to -perform [optical character recognition (OCR)](https://en.wikipedia.org/wiki/Optical_character_recognition). - -The abstract from the paper is the following: - -*Text recognition is a long-standing research problem for document digitalization. Existing approaches for text recognition -are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language -model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end -text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the -Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but -effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments -show that the TrOCR model outperforms the current state-of-the-art models on both printed and handwritten text recognition -tasks.* - - - - TrOCR architecture. Taken from the original paper. - -Please refer to the [`VisionEncoderDecoder`] class on how to use this model. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found -[here](https://github.com/microsoft/unilm/tree/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr). - -Tips: - -- The quickest way to get started with TrOCR is by checking the [tutorial - notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR), which show how to use the model - at inference time as well as fine-tuning on custom data. -- TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. It achieves state-of-the-art results - on both printed (e.g. 
the [SROIE dataset](https://paperswithcode.com/dataset/sroie) and handwritten (e.g. the [IAM - Handwriting dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>) text recognition tasks. For more - information, see the [official models](https://huggingface.co/models?other=trocr>). -- TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. - -## Inference - -TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of -[`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image. - -The [`ViTImageProcessor`/`DeiTImageProcessor`] class is responsible for preprocessing the input image and -[`RobertaTokenizer`/`XLMRobertaTokenizer`] decodes the generated target tokens to the target string. The -[`TrOCRProcessor`] wraps [`ViTImageProcessor`/`DeiTImageProcessor`] and [`RobertaTokenizer`/`XLMRobertaTokenizer`] -into a single instance to both extract the input features and decode the predicted token ids. - -- Step-by-step Optical Character Recognition (OCR) - -``` py ->>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel ->>> import requests ->>> from PIL import Image - ->>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") ->>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") - ->>> # load image from the IAM dataset ->>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - ->>> pixel_values = processor(image, return_tensors="pt").pixel_values ->>> generated_ids = model.generate(pixel_values) - ->>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] -``` - -See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOCR checkpoints. - -## TrOCRConfig - -[[autodoc]] TrOCRConfig - -## TrOCRProcessor - -[[autodoc]] TrOCRProcessor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## TrOCRForCausalLM - -[[autodoc]] TrOCRForCausalLM - - forward diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md new file mode 100644 index 000000000000..5ddb6badb71f --- /dev/null +++ b/docs/source/en/model_doc/tvlt.md @@ -0,0 +1,77 @@ + + +# TVLT + +## Overview + +The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) +by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc. + +The abstract from the paper is the following: + +*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. 
TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.* + +Tips: + +- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model. + This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one. +- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and `audio_mask` that indicates which audio values are real/padding. +- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality. +- The PyTorch version of this model is only available in torch 1.10 and higher. + +
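A minimal sketch of the data preparation described in the first tip, assuming the `ZinengTang/tvlt-base` checkpoint and random dummy inputs in place of a real video and audio clip:

```python
import numpy as np
import torch
from transformers import TvltModel, TvltProcessor

# checkpoint name is an assumption for illustration
processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# dummy inputs: 8 video frames of shape (channels, height, width) and a short mono waveform
video_frames = list(np.random.rand(8, 3, 224, 224))
audio = np.random.rand(10000)

# the processor bundles the image processor and audio feature extractor outputs into one dict
inputs = processor(images=video_frames, audio=audio, sampling_rate=44100, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)
```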

+ +

+ + TVLT architecture. Taken from the original paper. + +The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang). + +## TvltConfig + +[[autodoc]] TvltConfig + +## TvltProcessor + +[[autodoc]] TvltProcessor + - __call__ + +## TvltImageProcessor + +[[autodoc]] TvltImageProcessor + - preprocess + +## TvltFeatureExtractor + +[[autodoc]] TvltFeatureExtractor + - __call__ + +## TvltModel + +[[autodoc]] TvltModel + - forward + +## TvltForPreTraining + +[[autodoc]] TvltForPreTraining + - forward + +## TvltForAudioVisualClassification + +[[autodoc]] TvltForAudioVisualClassification + - forward diff --git a/docs/source/en/model_doc/ul2.md b/docs/source/en/model_doc/ul2.md new file mode 100644 index 000000000000..3863f23a7d73 --- /dev/null +++ b/docs/source/en/model_doc/ul2.md @@ -0,0 +1,35 @@ + + +# UL2 + +## Overview + +The T5 model was presented in [Unifying Language Learning Paradigms](https://arxiv.org/pdf/2205.05131v1.pdf) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler. + +The abstract from the paper is the following: + +*Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized and unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization.* + +Tips: + +- UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks. +- UL2 has the same architecture as [T5v1.1](t5v1.1) but uses the Gated-SiLU activation function instead of Gated-GELU. +- The authors release checkpoints of one architecture which can be seen [here](https://huggingface.co/google/ul2) + +The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2). + +This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn). 
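Because UL2 reuses the T5v1.1 architecture, the released checkpoint can be loaded with the regular T5 classes. The sketch below is illustrative only: it assumes the [google/ul2](https://huggingface.co/google/ul2) checkpoint (about 20B parameters, so a large GPU or offloading is needed in practice), and the `[S2S]` mode prefix is an assumption taken from the checkpoint's usage notes rather than from this page; check the model card for the exact paradigm tokens.

```python
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/ul2")
# bfloat16 + device_map="auto" (requires `accelerate`) to fit the ~20B-parameter checkpoint
model = T5ForConditionalGeneration.from_pretrained("google/ul2", torch_dtype=torch.bfloat16, device_map="auto")

# the "[S2S]" mode prefix is an assumption; see the model card for the paradigm tokens used during pre-training
prompt = "[S2S] A young wizard receives a letter inviting him to a school of magic. <extra_id_0>"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```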
diff --git a/docs/source/en/model_doc/ul2.mdx b/docs/source/en/model_doc/ul2.mdx deleted file mode 100644 index 2481285747fa..000000000000 --- a/docs/source/en/model_doc/ul2.mdx +++ /dev/null @@ -1,31 +0,0 @@ - - -# UL2 - -## Overview - -The T5 model was presented in [Unifying Language Learning Paradigms](https://arxiv.org/pdf/2205.05131v1.pdf) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler. - -The abstract from the paper is the following: - -*Existing pre-trained models are generally geared towards a particular class of problems. To date, there seems to be still no consensus on what the right architecture and pre-training setup should be. This paper presents a unified framework for pre-training models that are universally effective across datasets and setups. We begin by disentangling architectural archetypes with pre-training objectives -- two concepts that are commonly conflated. Next, we present a generalized and unified perspective for self-supervision in NLP and show how different pre-training objectives can be cast as one another and how interpolating between different objectives can be effective. We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization.* - -Tips: - -- UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks. -- UL2 has the same architecture as [T5v1.1](t5v1.1) but uses the Gated-SiLU activation function instead of Gated-GELU. -- The authors release checkpoints of one architecture which can be seen [here](https://huggingface.co/google/ul2) - -The original code can be found [here](https://github.com/google-research/google-research/tree/master/ul2). - -This model was contributed by [DanielHesslow](https://huggingface.co/Seledorn). diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md new file mode 100644 index 000000000000..4e6375bd465a --- /dev/null +++ b/docs/source/en/model_doc/umt5.md @@ -0,0 +1,104 @@ + + +# UMT5 + +
+ +## Overview + +The UMT5 model was proposed in [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. + +The abstract from the paper is the following: + +*Pretrained multilingual large language models have typically used heuristic temperature-based sampling to balance between different languages. However previous work has not systematically evaluated the efficacy of different pretraining language distributions across model scales. In this paper, we propose a new sampling method, UniMax, that delivers more uniform coverage of head languages while mitigating overfitting on tail languages by explicitly capping the number of repeats over each language's corpus. We perform an extensive series of ablations testing a range of sampling strategies on a suite of multilingual benchmarks, while varying model scale. We find that UniMax outperforms standard temperature-based sampling, and the benefits persist as scale increases. As part of our contribution, we release: (i) an improved and refreshed mC4 multilingual corpus consisting of 29 trillion characters across 107 languages, and (ii) a suite of pretrained umT5 model checkpoints trained with UniMax sampling.* + +Tips: + +- UMT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training. +Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. +- Since umT5 was pre-trained in an unsupervise manner, there's no real advantage to using a task prefix during single-task +fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix. + +Google has released the following variants: + +- [google/umt5-small](https://huggingface.co/google/umt5-small) +- [google/umt5-base](https://huggingface.co/google/umt5-base) +- [google/umt5-xl](https://huggingface.co/google/umt5-xl) +- [google/umt5-xxl](https://huggingface.co/google/umt5-xxl). + +This model was contributed by [agemagician](https://huggingface.co/agemagician) and [stefan-it](https://huggingface.co/stefan-it). The original code can be +found [here](https://github.com/google-research/t5x). + +One can refer to [T5's documentation page](t5) for more tips, code examples and notebooks. + +## Differences with mT5? +`UmT5` is based on mT5, with a non-shared relative positional bias that is computed for each layer. This means that the model set `has_relative_bias` for each layer. +The conversion script is also different because the model was saved in t5x's latest checkpointing format. + +# Sample usage + +```python +>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("google/umt5-small") +>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small") + +>>> inputs = tokenizer( +... "A walks into a bar and orders a with pinch of .", +... return_tensors="pt", +... ) +>>> outputs = model.generate(**inputs) +>>> print(tokenizer.batch_decode(outputs)) +['nyone who drink a alcohol A A. 
This I'] +``` + +## UMT5Config + +[[autodoc]] UMT5Config + +## UMT5Model + +[[autodoc]] UMT5Model + - forward + +## UMT5ForConditionalGeneration + +[[autodoc]] UMT5ForConditionalGeneration + - forward + +## UMT5EncoderModel + +[[autodoc]] UMT5EncoderModel + - forward + +## UMT5ForSequenceClassification + +[[autodoc]] UMT5ForSequenceClassification + - forward + +## UMT5ForQuestionAnswering + +[[autodoc]] UMT5ForQuestionAnswering + - forward + diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md new file mode 100644 index 000000000000..25489d9eeffd --- /dev/null +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -0,0 +1,92 @@ + + +# UniSpeech-SAT + +## Overview + +The UniSpeech-SAT model was proposed in [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware +Pre-Training](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, +Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu . + +The abstract from the paper is the following: + +*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled +data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in +speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In +this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are +introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to +the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function. +Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where +additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed +methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves +state-of-the-art performance in universal representation learning, especially for speaker identification oriented +tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training +dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.* + +Tips: + +- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. + Please use [`Wav2Vec2Processor`] for the feature extraction. +- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2CTCTokenizer`]. +- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT). 
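To make the speaker-verification tip concrete, here is a hedged sketch using [`UniSpeechSatForXVector`]; the `microsoft/unispeech-sat-base-plus-sv` checkpoint and the small demo dataset are assumptions for illustration:

```python
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, UniSpeechSatForXVector

# small public demo set used here only as an example source of 16 kHz speech
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sv")
model = UniSpeechSatForXVector.from_pretrained("microsoft/unispeech-sat-base-plus-sv")

# embed two utterances and compare them with cosine similarity
inputs = feature_extractor(
    [d["array"] for d in dataset[:2]["audio"]], sampling_rate=16000, return_tensors="pt", padding=True
)
with torch.no_grad():
    embeddings = model(**inputs).embeddings

embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=-1)
print(f"cosine similarity: {similarity.item():.2f}")  # higher values suggest the same speaker
```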
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## UniSpeechSatConfig + +[[autodoc]] UniSpeechSatConfig + +## UniSpeechSat specific outputs + +[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput + +## UniSpeechSatModel + +[[autodoc]] UniSpeechSatModel + - forward + +## UniSpeechSatForCTC + +[[autodoc]] UniSpeechSatForCTC + - forward + +## UniSpeechSatForSequenceClassification + +[[autodoc]] UniSpeechSatForSequenceClassification + - forward + +## UniSpeechSatForAudioFrameClassification + +[[autodoc]] UniSpeechSatForAudioFrameClassification + - forward + +## UniSpeechSatForXVector + +[[autodoc]] UniSpeechSatForXVector + - forward + +## UniSpeechSatForPreTraining + +[[autodoc]] UniSpeechSatForPreTraining + - forward diff --git a/docs/source/en/model_doc/unispeech-sat.mdx b/docs/source/en/model_doc/unispeech-sat.mdx deleted file mode 100644 index e2ceb783ea94..000000000000 --- a/docs/source/en/model_doc/unispeech-sat.mdx +++ /dev/null @@ -1,84 +0,0 @@ - - -# UniSpeech-SAT - -## Overview - -The UniSpeech-SAT model was proposed in [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware -Pre-Training](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, -Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu . - -The abstract from the paper is the following: - -*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled -data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in -speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In -this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are -introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to -the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function. -Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where -additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed -methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves -state-of-the-art performance in universal representation learning, especially for speaker identification oriented -tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training -dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.* - -Tips: - -- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - Please use [`Wav2Vec2Processor`] for the feature extraction. -- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2CTCTokenizer`]. -- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). 
The Authors' code can be -found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT). - - -## UniSpeechSatConfig - -[[autodoc]] UniSpeechSatConfig - -## UniSpeechSat specific outputs - -[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput - -## UniSpeechSatModel - -[[autodoc]] UniSpeechSatModel - - forward - -## UniSpeechSatForCTC - -[[autodoc]] UniSpeechSatForCTC - - forward - -## UniSpeechSatForSequenceClassification - -[[autodoc]] UniSpeechSatForSequenceClassification - - forward - -## UniSpeechSatForAudioFrameClassification - -[[autodoc]] UniSpeechSatForAudioFrameClassification - - forward - -## UniSpeechSatForXVector - -[[autodoc]] UniSpeechSatForXVector - - forward - -## UniSpeechSatForPreTraining - -[[autodoc]] UniSpeechSatForPreTraining - - forward diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md new file mode 100644 index 000000000000..8338aa1bda2e --- /dev/null +++ b/docs/source/en/model_doc/unispeech.md @@ -0,0 +1,77 @@ + + +# UniSpeech + +## Overview + +The UniSpeech model was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael +Zeng, Xuedong Huang . + +The abstract from the paper is the following: + +*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both +unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive +self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture +information more correlated with phonetic structures and improve the generalization across languages and domains. We +evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The +results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech +recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all +testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task, +i.e., a relative word error rate reduction of 6% against the previous approach.* + +Tips: + +- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please + use [`Wav2Vec2Processor`] for the feature extraction. +- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2CTCTokenizer`]. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech). 
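+
+As a rough sketch of the CTC workflow described in the tips (the checkpoint name below is a placeholder for whichever CTC fine-tuned UniSpeech model you use, and random noise stands in for real 16 kHz audio):
+
+```python
+import torch
+from transformers import Wav2Vec2Processor, UniSpeechForCTC
+
+# placeholder -- point this at a CTC fine-tuned UniSpeech checkpoint
+checkpoint = "path/to/unispeech-ctc-checkpoint"
+processor = Wav2Vec2Processor.from_pretrained(checkpoint)
+model = UniSpeechForCTC.from_pretrained(checkpoint)
+
+# raw 16 kHz waveform (random noise here, only to show the expected input format)
+waveform = torch.randn(16000).numpy()
+inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding back to text via the wrapped Wav2Vec2CTCTokenizer
+predicted_ids = torch.argmax(logits, dim=-1)
+transcription = processor.batch_decode(predicted_ids)[0]
+print(transcription)
+```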
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## UniSpeechConfig + +[[autodoc]] UniSpeechConfig + +## UniSpeech specific outputs + +[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput + +## UniSpeechModel + +[[autodoc]] UniSpeechModel + - forward + +## UniSpeechForCTC + +[[autodoc]] UniSpeechForCTC + - forward + +## UniSpeechForSequenceClassification + +[[autodoc]] UniSpeechForSequenceClassification + - forward + +## UniSpeechForPreTraining + +[[autodoc]] UniSpeechForPreTraining + - forward diff --git a/docs/source/en/model_doc/unispeech.mdx b/docs/source/en/model_doc/unispeech.mdx deleted file mode 100644 index 37d0a0a708e9..000000000000 --- a/docs/source/en/model_doc/unispeech.mdx +++ /dev/null @@ -1,69 +0,0 @@ - - -# UniSpeech - -## Overview - -The UniSpeech model was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael -Zeng, Xuedong Huang . - -The abstract from the paper is the following: - -*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both -unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive -self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture -information more correlated with phonetic structures and improve the generalization across languages and domains. We -evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The -results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech -recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all -testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task, -i.e., a relative word error rate reduction of 6% against the previous approach.* - -Tips: - -- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please - use [`Wav2Vec2Processor`] for the feature extraction. -- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech). 
- - -## UniSpeechConfig - -[[autodoc]] UniSpeechConfig - -## UniSpeech specific outputs - -[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput - -## UniSpeechModel - -[[autodoc]] UniSpeechModel - - forward - -## UniSpeechForCTC - -[[autodoc]] UniSpeechForCTC - - forward - -## UniSpeechForSequenceClassification - -[[autodoc]] UniSpeechForSequenceClassification - - forward - -## UniSpeechForPreTraining - -[[autodoc]] UniSpeechForPreTraining - - forward diff --git a/docs/source/en/model_doc/upernet.md b/docs/source/en/model_doc/upernet.md new file mode 100644 index 000000000000..db651acaa406 --- /dev/null +++ b/docs/source/en/model_doc/upernet.md @@ -0,0 +1,79 @@ + + +# UPerNet + +## Overview + +The UPerNet model was proposed in [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) +by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. UPerNet is a general framework to effectively segment +a wide range of concepts from images, leveraging any vision backbone like [ConvNeXt](convnext) or [Swin](swin). + +The abstract from the paper is the following: + +*Humans recognize the visual world at multiple levels: we effortlessly categorize scenes and detect objects inside, while also identifying the textures and surfaces of the objects along with their different compositional parts. In this paper, we study a new task called Unified Perceptual Parsing, which requires the machine vision systems to recognize as many visual concepts as possible from a given image. A multi-task framework called UPerNet and a training strategy are developed to learn from heterogeneous image annotations. We benchmark our framework on Unified Perceptual Parsing and show that it is able to effectively segment a wide range of concepts from images. The trained networks are further applied to discover visual knowledge in natural scenes.* + + + + UPerNet framework. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code is based on OpenMMLab's mmsegmentation [here](https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/uper_head.py). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UPerNet. + +- Demo notebooks for UPerNet can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UPerNet). +- [`UperNetForSemanticSegmentation`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/semantic-segmentation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb). +- See also: [Semantic segmentation task guide](../tasks/semantic_segmentation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## Usage + +UPerNet is a general framework for semantic segmentation. 
It can be used with any vision backbone, like so: + +```py +from transformers import SwinConfig, UperNetConfig, UperNetForSemanticSegmentation + +backbone_config = SwinConfig(out_features=["stage1", "stage2", "stage3", "stage4"]) + +config = UperNetConfig(backbone_config=backbone_config) +model = UperNetForSemanticSegmentation(config) +``` + +To use another vision backbone, like [ConvNeXt](convnext), simply instantiate the model with the appropriate backbone: + +```py +from transformers import ConvNextConfig, UperNetConfig, UperNetForSemanticSegmentation + +backbone_config = ConvNextConfig(out_features=["stage1", "stage2", "stage3", "stage4"]) + +config = UperNetConfig(backbone_config=backbone_config) +model = UperNetForSemanticSegmentation(config) +``` + +Note that this will randomly initialize all the weights of the model. + +## UperNetConfig + +[[autodoc]] UperNetConfig + +## UperNetForSemanticSegmentation + +[[autodoc]] UperNetForSemanticSegmentation + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md new file mode 100644 index 000000000000..b9539602d3b8 --- /dev/null +++ b/docs/source/en/model_doc/van.md @@ -0,0 +1,74 @@ + + +# VAN + + + +This model is in maintenance mode only, so we won't accept any new PRs changing its code. + +If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. +You can do so by running the following command: `pip install -U transformers==4.30.0`. + + + +## Overview + +The VAN model was proposed in [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. + +This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations. + +The abstract from the paper is the following: + +*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).* + +Tips: + +- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages. + +The figure below illustrates the architecture of a Visual Aattention Layer. Taken from the [original paper](https://arxiv.org/abs/2202.09741). + + + +This model was contributed by [Francesco](https://huggingface.co/Francesco). 
The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VAN. + + + +- [`VanForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## VanConfig + +[[autodoc]] VanConfig + + +## VanModel + +[[autodoc]] VanModel + - forward + + +## VanForImageClassification + +[[autodoc]] VanForImageClassification + - forward + diff --git a/docs/source/en/model_doc/van.mdx b/docs/source/en/model_doc/van.mdx deleted file mode 100644 index 9fc05ab3e752..000000000000 --- a/docs/source/en/model_doc/van.mdx +++ /dev/null @@ -1,51 +0,0 @@ - - -# VAN - -## Overview - -The VAN model was proposed in [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. - -This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations. - -The abstract from the paper is the following: - -*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).* - -Tips: - -- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages. - -The figure below illustrates the architecture of a Visual Aattention Layer. Taken from the [original paper](https://arxiv.org/abs/2202.09741). - - - -This model was contributed by [Francesco](https://huggingface.co/Francesco). The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification). 
- - -## VanConfig - -[[autodoc]] VanConfig - - -## VanModel - -[[autodoc]] VanModel - - forward - - -## VanForImageClassification - -[[autodoc]] VanForImageClassification - - forward - diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md new file mode 100644 index 000000000000..5a3620040ad9 --- /dev/null +++ b/docs/source/en/model_doc/videomae.md @@ -0,0 +1,81 @@ + + +# VideoMAE + +## Overview + +The VideoMAE model was proposed in [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +VideoMAE extends masked auto encoders ([MAE](vit_mae)) to video, claiming state-of-the-art performance on several video classification benchmarks. + +The abstract from the paper is the following: + +*Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking and reconstruction. These simple designs turn out to be effective for overcoming information leakage caused by the temporal correlation during video reconstruction. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets are important issues in SSVP. Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinects-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data.* + +Tips: + +- One can use [`VideoMAEImageProcessor`] to prepare videos for the model. It will resize + normalize all frames of a video for you. +- [`VideoMAEForPreTraining`] includes the decoder on top for self-supervised pre-training. + + + + VideoMAE pre-training. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VideoMAE. If +you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll +review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +**Video classification** +- [A notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) that shows how +to fine-tune a VideoMAE model on a custom dataset. +- [Video classification task guide](../tasks/video_classification) +- [A 🤗 Space](https://huggingface.co/spaces/sayakpaul/video-classification-ucf101-subset) showing how to perform inference with a video classification model. 
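+
+As a minimal sketch of video classification inference, the snippet below runs the Kinetics-400 fine-tuned checkpoint `MCG-NJU/videomae-base-finetuned-kinetics` on a random 16-frame clip (a stand-in for a real video):
+
+```python
+import numpy as np
+import torch
+from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+
+checkpoint = "MCG-NJU/videomae-base-finetuned-kinetics"
+image_processor = VideoMAEImageProcessor.from_pretrained(checkpoint)
+model = VideoMAEForVideoClassification.from_pretrained(checkpoint)
+
+# a video is a list of frames; here 16 random frames in (channels, height, width) format
+video = list(np.random.randint(0, 256, (16, 3, 224, 224)))
+
+inputs = image_processor(video, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+predicted_label = logits.argmax(-1).item()
+print(model.config.id2label[predicted_label])
+```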
+ + +## VideoMAEConfig + +[[autodoc]] VideoMAEConfig + +## VideoMAEFeatureExtractor + +[[autodoc]] VideoMAEFeatureExtractor + - __call__ + +## VideoMAEImageProcessor + +[[autodoc]] VideoMAEImageProcessor + - preprocess + +## VideoMAEModel + +[[autodoc]] VideoMAEModel + - forward + +## VideoMAEForPreTraining + +[[autodoc]] transformers.VideoMAEForPreTraining + - forward + +## VideoMAEForVideoClassification + +[[autodoc]] transformers.VideoMAEForVideoClassification + - forward diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx deleted file mode 100644 index 76e822ef8a5c..000000000000 --- a/docs/source/en/model_doc/videomae.mdx +++ /dev/null @@ -1,77 +0,0 @@ - - -# VideoMAE - -## Overview - -The VideoMAE model was proposed in [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. -VideoMAE extends masked auto encoders ([MAE](vit_mae)) to video, claiming state-of-the-art performance on several video classification benchmarks. - -The abstract from the paper is the following: - -*Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking and reconstruction. These simple designs turn out to be effective for overcoming information leakage caused by the temporal correlation during video reconstruction. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets are important issues in SSVP. Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinects-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data.* - -Tips: - -- One can use [`VideoMAEImageProcessor`] to prepare videos for the model. It will resize + normalize all frames of a video for you. -- [`VideoMAEForPreTraining`] includes the decoder on top for self-supervised pre-training. - - - - VideoMAE pre-training. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VideoMAE. If -you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll -review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - -**Video classification** -- [A notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) that shows how -to fine-tune a VideoMAE model on a custom dataset. 
-- [Video classification task page](https://huggingface.co/tasks/video-classification) -- [A 🤗 Space](https://huggingface.co/spaces/sayakpaul/video-classification-ucf101-subset) showing how to perform inference with a video classification model. - - -## VideoMAEConfig - -[[autodoc]] VideoMAEConfig - -## VideoMAEFeatureExtractor - -[[autodoc]] VideoMAEFeatureExtractor - - __call__ - -## VideoMAEImageProcessor - -[[autodoc]] VideoMAEImageProcessor - - preprocess - -## VideoMAEModel - -[[autodoc]] VideoMAEModel - - forward - -## VideoMAEForPreTraining - -[[autodoc]] transformers.VideoMAEForPreTraining - - forward - -## VideoMAEForVideoClassification - -[[autodoc]] transformers.VideoMAEForVideoClassification - - forward diff --git a/docs/source/en/model_doc/vilt.md b/docs/source/en/model_doc/vilt.md new file mode 100644 index 000000000000..2e2f4a140d20 --- /dev/null +++ b/docs/source/en/model_doc/vilt.md @@ -0,0 +1,108 @@ + + +# ViLT + +## Overview + +The ViLT model was proposed in [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) +by Wonjae Kim, Bokyung Son, Ildoo Kim. ViLT incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design +for Vision-and-Language Pre-training (VLP). + +The abstract from the paper is the following: + +*Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. +Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision +(e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we +find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more +computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive +power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, +Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically +simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of +times faster than previous VLP models, yet with competitive or better downstream task performance.* + +Tips: + +- The quickest way to get started with ViLT is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViLT) + (which showcase both inference and fine-tuning on custom data). +- ViLT is a model that takes both `pixel_values` and `input_ids` as input. One can use [`ViltProcessor`] to prepare data for the model. + This processor wraps a image processor (for the image modality) and a tokenizer (for the language modality) into one. +- ViLT is trained with images of various sizes: the authors resize the shorter edge of input images to 384 and limit the longer edge to + under 640 while preserving the aspect ratio. To make batching of images possible, the authors use a `pixel_mask` that indicates + which pixel values are real and which are padding. [`ViltProcessor`] automatically creates this for you. +- The design of ViLT is very similar to that of a standard Vision Transformer (ViT). The only difference is that the model includes + additional embedding layers for the language modality. + + + + ViLT architecture. Taken from the original paper. 
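+
+As a quick sketch of how the processor and model fit together, the snippet below runs visual question answering with the VQAv2 fine-tuned checkpoint `dandelin/vilt-b32-finetuned-vqa`:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import ViltProcessor, ViltForQuestionAnswering
+
+checkpoint = "dandelin/vilt-b32-finetuned-vqa"
+processor = ViltProcessor.from_pretrained(checkpoint)
+model = ViltForQuestionAnswering.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+question = "How many cats are there?"
+
+# the processor builds input_ids, pixel_values and pixel_mask in a single call
+inputs = processor(image, question, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+predicted_answer = model.config.id2label[logits.argmax(-1).item()]
+print(predicted_answer)
+```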
+ +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/dandelin/ViLT). + + +Tips: + +- The PyTorch version of this model is only available in torch 1.10 and higher. + +## ViltConfig + +[[autodoc]] ViltConfig + +## ViltFeatureExtractor + +[[autodoc]] ViltFeatureExtractor + - __call__ + +## ViltImageProcessor + +[[autodoc]] ViltImageProcessor + - preprocess + +## ViltProcessor + +[[autodoc]] ViltProcessor + - __call__ + +## ViltModel + +[[autodoc]] ViltModel + - forward + +## ViltForMaskedLM + +[[autodoc]] ViltForMaskedLM + - forward + +## ViltForQuestionAnswering + +[[autodoc]] ViltForQuestionAnswering + - forward + +## ViltForImagesAndTextClassification + +[[autodoc]] ViltForImagesAndTextClassification + - forward + +## ViltForImageAndTextRetrieval + +[[autodoc]] ViltForImageAndTextRetrieval + - forward + +## ViltForTokenClassification + +[[autodoc]] ViltForTokenClassification + - forward diff --git a/docs/source/en/model_doc/vilt.mdx b/docs/source/en/model_doc/vilt.mdx deleted file mode 100644 index 7c8653e1a3b9..000000000000 --- a/docs/source/en/model_doc/vilt.mdx +++ /dev/null @@ -1,104 +0,0 @@ - - -# ViLT - -## Overview - -The ViLT model was proposed in [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) -by Wonjae Kim, Bokyung Son, Ildoo Kim. ViLT incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design -for Vision-and-Language Pre-training (VLP). - -The abstract from the paper is the following: - -*Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. -Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision -(e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we -find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more -computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive -power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, -Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically -simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of -times faster than previous VLP models, yet with competitive or better downstream task performance.* - -Tips: - -- The quickest way to get started with ViLT is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViLT) - (which showcase both inference and fine-tuning on custom data). -- ViLT is a model that takes both `pixel_values` and `input_ids` as input. One can use [`ViltProcessor`] to prepare data for the model. - This processor wraps a feature extractor (for the image modality) and a tokenizer (for the language modality) into one. -- ViLT is trained with images of various sizes: the authors resize the shorter edge of input images to 384 and limit the longer edge to - under 640 while preserving the aspect ratio. To make batching of images possible, the authors use a `pixel_mask` that indicates - which pixel values are real and which are padding. [`ViltProcessor`] automatically creates this for you. 
-- The design of ViLT is very similar to that of a standard Vision Transformer (ViT). The only difference is that the model includes - additional embedding layers for the language modality. - - - - ViLT architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/dandelin/ViLT). - - -Tips: - -- The PyTorch version of this model is only available in torch 1.10 and higher. - -## ViltConfig - -[[autodoc]] ViltConfig - -## ViltFeatureExtractor - -[[autodoc]] ViltFeatureExtractor - - __call__ - -## ViltImageProcessor - -[[autodoc]] ViltImageProcessor - - preprocess - -## ViltProcessor - -[[autodoc]] ViltProcessor - - __call__ - -## ViltModel - -[[autodoc]] ViltModel - - forward - -## ViltForMaskedLM - -[[autodoc]] ViltForMaskedLM - - forward - -## ViltForQuestionAnswering - -[[autodoc]] ViltForQuestionAnswering - - forward - -## ViltForImagesAndTextClassification - -[[autodoc]] ViltForImagesAndTextClassification - - forward - -## ViltForImageAndTextRetrieval - -[[autodoc]] ViltForImageAndTextRetrieval - - forward - -## ViltForTokenClassification - -[[autodoc]] ViltForTokenClassification - - forward diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md new file mode 100644 index 000000000000..0beeaeae108b --- /dev/null +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -0,0 +1,170 @@ + + +# Vision Encoder Decoder Models + +## Overview + +The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text model with any +pretrained Transformer-based vision model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin)) +and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)). + +The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for +example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, +Zhoujun Li, Furu Wei. + +After such a [`VisionEncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like any other models (see the examples below +for more information). + +An example application is image captioning, in which the encoder is used to encode the image, after which an autoregressive language model generates +the caption. Another example is optical character recognition. Refer to [TrOCR](trocr), which is an instance of [`VisionEncoderDecoderModel`]. + +## Randomly initializing `VisionEncoderDecoderModel` from model configurations. + +[`VisionEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`ViTModel`] configuration for the encoder +and the default [`BertForCausalLM`] configuration for the decoder. + +```python +>>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel + +>>> config_encoder = ViTConfig() +>>> config_decoder = BertConfig() + +>>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) +>>> model = VisionEncoderDecoderModel(config=config) +``` + +## Initialising `VisionEncoderDecoderModel` from a pretrained encoder and a pretrained decoder. 
+ +[`VisionEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based vision model, *e.g.* [Swin](swin), can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder. +Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. +Initializing [`VisionEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder). +To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecoderModel.from_encoder_decoder_pretrained`] method. + +```python +>>> from transformers import VisionEncoderDecoderModel + +>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( +... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased" +... ) +``` + +## Loading an existing `VisionEncoderDecoderModel` checkpoint and perform inference. + +To load fine-tuned checkpoints of the `VisionEncoderDecoderModel` class, [`VisionEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers. + +To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. + +```python +>>> import requests +>>> from PIL import Image + +>>> from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel + +>>> # load a fine-tuned image captioning model and corresponding tokenizer and image processor +>>> model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +>>> tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +>>> image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + +>>> # let's perform inference on an image +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values + +>>> # autoregressively generate caption (uses greedy decoding by default) +>>> generated_ids = model.generate(pixel_values) +>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +>>> print(generated_text) +a cat laying on a blanket next to a cat laying on a bed +``` + +## Loading a PyTorch checkpoint into `TFVisionEncoderDecoderModel`. + +[`TFVisionEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a +PyTorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only PyTorch +checkpoints for a particular vision encoder-decoder model, a workaround is: + +```python +>>> from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel + +>>> _model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + +>>> _model.encoder.save_pretrained("./encoder") +>>> _model.decoder.save_pretrained("./decoder") + +>>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( +... 
"./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True +... ) +>>> # This is only for copying some specific attributes of this particular model. +>>> model.config = _model.config +``` + +## Training + +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (image, text) pairs. +As you can see, only 2 inputs are required for the model in order to compute a loss: `pixel_values` (which are the +images) and `labels` (which are the `input_ids` of the encoded target sequence). + +```python +>>> from transformers import ViTImageProcessor, BertTokenizer, VisionEncoderDecoderModel +>>> from datasets import load_dataset + +>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( +... "google/vit-base-patch16-224-in21k", "bert-base-uncased" +... ) + +>>> model.config.decoder_start_token_id = tokenizer.cls_token_id +>>> model.config.pad_token_id = tokenizer.pad_token_id + +>>> dataset = load_dataset("huggingface/cats-image") +>>> image = dataset["test"]["image"][0] +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values + +>>> labels = tokenizer( +... "an image of two cats chilling on a couch", +... return_tensors="pt", +... ).input_ids + +>>> # the forward function automatically creates the correct decoder_input_ids +>>> loss = model(pixel_values=pixel_values, labels=labels).loss +``` + +This model was contributed by [nielsr](https://github.com/nielsrogge). This model's TensorFlow and Flax versions +were contributed by [ydshieh](https://github.com/ydshieh). + +## VisionEncoderDecoderConfig + +[[autodoc]] VisionEncoderDecoderConfig + +## VisionEncoderDecoderModel + +[[autodoc]] VisionEncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + +## TFVisionEncoderDecoderModel + +[[autodoc]] TFVisionEncoderDecoderModel + - call + - from_encoder_decoder_pretrained + +## FlaxVisionEncoderDecoderModel + +[[autodoc]] FlaxVisionEncoderDecoderModel + - __call__ + - from_encoder_decoder_pretrained diff --git a/docs/source/en/model_doc/vision-encoder-decoder.mdx b/docs/source/en/model_doc/vision-encoder-decoder.mdx deleted file mode 100644 index 0241224c0667..000000000000 --- a/docs/source/en/model_doc/vision-encoder-decoder.mdx +++ /dev/null @@ -1,166 +0,0 @@ - - -# Vision Encoder Decoder Models - -## Overview - -The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text model with any -pretrained Transformer-based vision model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin)) -and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)). - -The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for -example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, -Zhoujun Li, Furu Wei. - -After such a [`VisionEncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like any other models (see the examples below -for more information). - -An example application is image captioning, in which the encoder is used to encode the image, after which an autoregressive language model generates -the caption. 
Another example is optical character recognition. Refer to [TrOCR](trocr), which is an instance of [`VisionEncoderDecoderModel`]. - -## Randomly initializing `VisionEncoderDecoderModel` from model configurations. - -[`VisionEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`ViTModel`] configuration for the encoder -and the default [`BertForCausalLM`] configuration for the decoder. - -```python ->>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel - ->>> config_encoder = ViTConfig() ->>> config_decoder = BertConfig() - ->>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) ->>> model = VisionEncoderDecoderModel(config=config) -``` - -## Initialising `VisionEncoderDecoderModel` from a pretrained encoder and a pretrained decoder. - -[`VisionEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based vision model, *e.g.* [Swin](swin), can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder. -Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. -Initializing [`VisionEncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder). -To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecoderModel.from_encoder_decoder_pretrained`] method. - -```python ->>> from transformers import VisionEncoderDecoderModel - ->>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased" -... ) -``` - -## Loading an existing `VisionEncoderDecoderModel` checkpoint and perform inference. - -To load fine-tuned checkpoints of the `VisionEncoderDecoderModel` class, [`VisionEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers. - -To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. 
- -```python ->>> import requests ->>> from PIL import Image - ->>> from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel - ->>> # load a fine-tuned image captioning model and corresponding tokenizer and image processor ->>> model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") ->>> tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning") ->>> image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") - ->>> # let's perform inference on an image ->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) ->>> pixel_values = image_processor(image, return_tensors="pt").pixel_values - ->>> # autoregressively generate caption (uses greedy decoding by default) ->>> generated_ids = model.generate(pixel_values) ->>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] ->>> print(generated_text) -a cat laying on a blanket next to a cat laying on a bed -``` - -## Loading a PyTorch checkpoint into `TFVisionEncoderDecoderModel`. - -[`TFVisionEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a -PyTorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only PyTorch -checkpoints for a particular vision encoder-decoder model, a workaround is: - -```python ->>> from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel - ->>> _model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") - ->>> _model.encoder.save_pretrained("./encoder") ->>> _model.decoder.save_pretrained("./decoder") - ->>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True -... ) ->>> # This is only for copying some specific attributes of this particular model. ->>> model.config = _model.config -``` - -## Training - -Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (image, text) pairs. -As you can see, only 2 inputs are required for the model in order to compute a loss: `pixel_values` (which are the -images) and `labels` (which are the `input_ids` of the encoded target sequence). - -```python ->>> from transformers import ViTImageProcessor, BertTokenizer, VisionEncoderDecoderModel ->>> from datasets import load_dataset - ->>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") ->>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "google/vit-base-patch16-224-in21k", "bert-base-uncased" -... ) - ->>> model.config.decoder_start_token_id = tokenizer.cls_token_id ->>> model.config.pad_token_id = tokenizer.pad_token_id - ->>> dataset = load_dataset("huggingface/cats-image") ->>> image = dataset["test"]["image"][0] ->>> pixel_values = image_processor(image, return_tensors="pt").pixel_values - ->>> labels = tokenizer( -... "an image of two cats chilling on a couch", -... return_tensors="pt", -... ).input_ids - ->>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(pixel_values=pixel_values, labels=labels).loss -``` - -This model was contributed by [nielsr](https://github.com/nielsrogge). 
This model's TensorFlow and Flax versions -were contributed by [ydshieh](https://github.com/ydshieh). - -## VisionEncoderDecoderConfig - -[[autodoc]] VisionEncoderDecoderConfig - -## VisionEncoderDecoderModel - -[[autodoc]] VisionEncoderDecoderModel - - forward - - from_encoder_decoder_pretrained - -## TFVisionEncoderDecoderModel - -[[autodoc]] TFVisionEncoderDecoderModel - - call - - from_encoder_decoder_pretrained - -## FlaxVisionEncoderDecoderModel - -[[autodoc]] FlaxVisionEncoderDecoderModel - - __call__ - - from_encoder_decoder_pretrained diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.md b/docs/source/en/model_doc/vision-text-dual-encoder.md new file mode 100644 index 000000000000..6fa9728cac46 --- /dev/null +++ b/docs/source/en/model_doc/vision-text-dual-encoder.md @@ -0,0 +1,52 @@ + + +# VisionTextDualEncoder + +## Overview + +The [`VisionTextDualEncoderModel`] can be used to initialize a vision-text dual encoder model with +any pretrained vision autoencoding model as the vision encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit)) and any pretrained text autoencoding model as the text encoder (*e.g.* [RoBERTa](roberta), [BERT](bert)). Two projection layers are added on top of both the vision and text encoder to project the output embeddings +to a shared latent space. The projection layers are randomly initialized so the model should be fine-tuned on a +downstream task. This model can be used to align the vision-text embeddings using CLIP like contrastive image-text +training and then can be used for zero-shot vision tasks such image-classification or retrieval. + +In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how +leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement on +new zero-shot vision tasks such as image classification or retrieval. + +## VisionTextDualEncoderConfig + +[[autodoc]] VisionTextDualEncoderConfig + +## VisionTextDualEncoderProcessor + +[[autodoc]] VisionTextDualEncoderProcessor + +## VisionTextDualEncoderModel + +[[autodoc]] VisionTextDualEncoderModel + - forward + +## FlaxVisionTextDualEncoderModel + +[[autodoc]] FlaxVisionTextDualEncoderModel + - __call__ + +## TFVisionTextDualEncoderModel + +[[autodoc]] TFVisionTextDualEncoderModel + - call diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.mdx b/docs/source/en/model_doc/vision-text-dual-encoder.mdx deleted file mode 100644 index c7ee59d77abb..000000000000 --- a/docs/source/en/model_doc/vision-text-dual-encoder.mdx +++ /dev/null @@ -1,43 +0,0 @@ - - -# VisionTextDualEncoder - -## Overview - -The [`VisionTextDualEncoderModel`] can be used to initialize a vision-text dual encoder model with -any pretrained vision autoencoding model as the vision encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit)) and any pretrained text autoencoding model as the text encoder (*e.g.* [RoBERTa](roberta), [BERT](bert)). Two projection layers are added on top of both the vision and text encoder to project the output embeddings -to a shared latent space. The projection layers are randomly initialized so the model should be fine-tuned on a -downstream task. This model can be used to align the vision-text embeddings using CLIP like contrastive image-text -training and then can be used for zero-shot vision tasks such image-classification or retrieval. 
- -In [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991) it is shown how -leveraging pre-trained (locked/frozen) image and text model for contrastive learning yields significant improvement on -new zero-shot vision tasks such as image classification or retrieval. - -## VisionTextDualEncoderConfig - -[[autodoc]] VisionTextDualEncoderConfig - -## VisionTextDualEncoderProcessor - -[[autodoc]] VisionTextDualEncoderProcessor - -## VisionTextDualEncoderModel - -[[autodoc]] VisionTextDualEncoderModel - - forward - -## FlaxVisionTextDualEncoderModel - -[[autodoc]] FlaxVisionTextDualEncoderModel - - __call__ diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md new file mode 100644 index 000000000000..7d84c0d9faec --- /dev/null +++ b/docs/source/en/model_doc/visual_bert.md @@ -0,0 +1,129 @@ + + +# VisualBERT + +## Overview + +The VisualBERT model was proposed in [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +VisualBERT is a neural network trained on a variety of (image, text) pairs. + +The abstract from the paper is the following: + +*We propose VisualBERT, a simple and flexible framework for modeling a broad range of vision-and-language tasks. +VisualBERT consists of a stack of Transformer layers that implicitly align elements of an input text and regions in an +associated input image with self-attention. We further propose two visually-grounded language model objectives for +pre-training VisualBERT on image caption data. Experiments on four vision-and-language tasks including VQA, VCR, NLVR2, +and Flickr30K show that VisualBERT outperforms or rivals with state-of-the-art models while being significantly +simpler. Further analysis demonstrates that VisualBERT can ground elements of language to image regions without any +explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between +verbs and image regions corresponding to their arguments.* + +Tips: + +1. Most of the checkpoints provided work with the [`VisualBertForPreTraining`] configuration. Other + checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR + ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). Hence, if you are not working on these downstream tasks, it is + recommended that you use the pretrained checkpoints. + +2. For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints. + We do not provide the detector and its weights as a part of the package, but it will be available in the research + projects, and the states can be loaded directly into the detector provided. + +## Usage + +VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice, +visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare +embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical +dimension. + +To feed images to the model, each image is passed through a pre-trained object detector and the regions and the +bounding boxes are extracted. The authors use the features generated after passing these regions through a pre-trained +CNN like ResNet as visual embeddings. 
They also add absolute position embeddings, and feed the resulting sequence of +vectors to a standard BERT model. The text input is concatenated in the front of the visual embeddings in the embedding +layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The segment IDs must also be set +appropriately for the textual and visual parts. + +The [`BertTokenizer`] is used to encode the text. A custom detector/image processor must be used +to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models: + +- [VisualBERT VQA demo notebook](https://github.com/huggingface/transformers/tree/main/examples/research_projects/visual_bert) : This notebook + contains an example on VisualBERT VQA. + +- [Generate Embeddings for VisualBERT (Colab Notebook)](https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing) : This notebook contains + an example on how to generate visual embeddings. + +The following example shows how to get the last hidden state using [`VisualBertModel`]: + +```python +>>> import torch +>>> from transformers import BertTokenizer, VisualBertModel + +>>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") +>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + +>>> inputs = tokenizer("What is the man eating?", return_tensors="pt") +>>> # this is a custom function that returns the visual embeddings given the image path +>>> visual_embeds = get_visual_embeddings(image_path) + +>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) +>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) +>>> inputs.update( +... { +... "visual_embeds": visual_embeds, +... "visual_token_type_ids": visual_token_type_ids, +... "visual_attention_mask": visual_attention_mask, +... } +... ) +>>> outputs = model(**inputs) +>>> last_hidden_state = outputs.last_hidden_state +``` + +This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert). + +## VisualBertConfig + +[[autodoc]] VisualBertConfig + +## VisualBertModel + +[[autodoc]] VisualBertModel + - forward + +## VisualBertForPreTraining + +[[autodoc]] VisualBertForPreTraining + - forward + +## VisualBertForQuestionAnswering + +[[autodoc]] VisualBertForQuestionAnswering + - forward + +## VisualBertForMultipleChoice + +[[autodoc]] VisualBertForMultipleChoice + - forward + +## VisualBertForVisualReasoning + +[[autodoc]] VisualBertForVisualReasoning + - forward + +## VisualBertForRegionToPhraseAlignment + +[[autodoc]] VisualBertForRegionToPhraseAlignment + - forward diff --git a/docs/source/en/model_doc/visual_bert.mdx b/docs/source/en/model_doc/visual_bert.mdx deleted file mode 100644 index df8858b1fa67..000000000000 --- a/docs/source/en/model_doc/visual_bert.mdx +++ /dev/null @@ -1,125 +0,0 @@ - - -# VisualBERT - -## Overview - -The VisualBERT model was proposed in [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -VisualBERT is a neural network trained on a variety of (image, text) pairs. - -The abstract from the paper is the following: - -*We propose VisualBERT, a simple and flexible framework for modeling a broad range of vision-and-language tasks. 
-VisualBERT consists of a stack of Transformer layers that implicitly align elements of an input text and regions in an -associated input image with self-attention. We further propose two visually-grounded language model objectives for -pre-training VisualBERT on image caption data. Experiments on four vision-and-language tasks including VQA, VCR, NLVR2, -and Flickr30K show that VisualBERT outperforms or rivals with state-of-the-art models while being significantly -simpler. Further analysis demonstrates that VisualBERT can ground elements of language to image regions without any -explicit supervision and is even sensitive to syntactic relationships, tracking, for example, associations between -verbs and image regions corresponding to their arguments.* - -Tips: - -1. Most of the checkpoints provided work with the [`VisualBertForPreTraining`] configuration. Other - checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR - ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). Hence, if you are not working on these downstream tasks, it is - recommended that you use the pretrained checkpoints. - -2. For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints. - We do not provide the detector and its weights as a part of the package, but it will be available in the research - projects, and the states can be loaded directly into the detector provided. - -## Usage - -VisualBERT is a multi-modal vision and language model. It can be used for visual question answering, multiple choice, -visual reasoning and region-to-phrase correspondence tasks. VisualBERT uses a BERT-like transformer to prepare -embeddings for image-text pairs. Both the text and visual features are then projected to a latent space with identical -dimension. - -To feed images to the model, each image is passed through a pre-trained object detector and the regions and the -bounding boxes are extracted. The authors use the features generated after passing these regions through a pre-trained -CNN like ResNet as visual embeddings. They also add absolute position embeddings, and feed the resulting sequence of -vectors to a standard BERT model. The text input is concatenated in the front of the visual embeddings in the embedding -layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The segment IDs must also be set -appropriately for the textual and visual parts. - -The [`BertTokenizer`] is used to encode the text. A custom detector/image processor must be used -to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models: - -- [VisualBERT VQA demo notebook](https://github.com/huggingface/transformers/tree/main/examples/research_projects/visual_bert) : This notebook - contains an example on VisualBERT VQA. - -- [Generate Embeddings for VisualBERT (Colab Notebook)](https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing) : This notebook contains - an example on how to generate visual embeddings. 
- -The following example shows how to get the last hidden state using [`VisualBertModel`]: - -```python ->>> import torch ->>> from transformers import BertTokenizer, VisualBertModel - ->>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - ->>> inputs = tokenizer("What is the man eating?", return_tensors="pt") ->>> # this is a custom function that returns the visual embeddings given the image path ->>> visual_embeds = get_visual_embeddings(image_path) - ->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) ->>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) ->>> inputs.update( -... { -... "visual_embeds": visual_embeds, -... "visual_token_type_ids": visual_token_type_ids, -... "visual_attention_mask": visual_attention_mask, -... } -... ) ->>> outputs = model(**inputs) ->>> last_hidden_state = outputs.last_hidden_state -``` - -This model was contributed by [gchhablani](https://huggingface.co/gchhablani). The original code can be found [here](https://github.com/uclanlp/visualbert). - -## VisualBertConfig - -[[autodoc]] VisualBertConfig - -## VisualBertModel - -[[autodoc]] VisualBertModel - - forward - -## VisualBertForPreTraining - -[[autodoc]] VisualBertForPreTraining - - forward - -## VisualBertForQuestionAnswering - -[[autodoc]] VisualBertForQuestionAnswering - - forward - -## VisualBertForMultipleChoice - -[[autodoc]] VisualBertForMultipleChoice - - forward - -## VisualBertForVisualReasoning - -[[autodoc]] VisualBertForVisualReasoning - - forward - -## VisualBertForRegionToPhraseAlignment - -[[autodoc]] VisualBertForRegionToPhraseAlignment - - forward diff --git a/docs/source/en/model_doc/vit.md b/docs/source/en/model_doc/vit.md new file mode 100644 index 000000000000..409580d09481 --- /dev/null +++ b/docs/source/en/model_doc/vit.md @@ -0,0 +1,186 @@ + + +# Vision Transformer (ViT) + +## Overview + +The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition +at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk +Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob +Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining +very good results compared to familiar convolutional architectures. + + +The abstract from the paper is the following: + +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its +applications to computer vision remain limited. In vision, attention is either applied in conjunction with +convolutional networks, or used to replace certain components of convolutional networks while keeping their overall +structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to +sequences of image patches can perform very well on image classification tasks. 
When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train.* + +Tips: + +- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). +- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, + which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be + used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of + vectors to a standard Transformer encoder. +- As the Vision Transformer expects each image to be of the same size (resolution), one can use + [`ViTImageProcessor`] to resize (or rescale) and normalize images for the model. +- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of + each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch + resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit). +- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of + 14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million + images and 1,000 classes). +- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to + use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov + et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform + 2D interpolation of the pre-trained position embeddings, according to their location in the original image. +- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed + an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked + language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant + improvement of 2% to training from scratch, but still 4% behind supervised pre-training. + + + + ViT architecture. Taken from the original paper. + +Following the original Vision Transformer, some follow-up works have been made: + +- [DeiT](deit) (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. + The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into [`ViTModel`] or + [`ViTForImageClassification`]. There are 4 variants available (in 3 different sizes): *facebook/deit-tiny-patch16-224*, + *facebook/deit-small-patch16-224*, *facebook/deit-base-patch16-224* and *facebook/deit-base-patch16-384*. Note that one should + use [`DeiTImageProcessor`] in order to prepare images for the model. + +- [BEiT](beit) (BERT pre-training of Image Transformers) by Microsoft Research. 
BEiT models outperform supervised pre-trained + vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE. + +- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using + the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting + objects, without having ever been trained to do so. DINO checkpoints can be found on the [hub](https://huggingface.co/models?other=dino). + +- [MAE](vit_mae) (Masked Autoencoders) by Facebook AI. By pre-training Vision Transformers to reconstruct pixel values for a high portion + (75%) of masked patches (using an asymmetric encoder-decoder architecture), the authors show that this simple method outperforms + supervised pre-training after fine-tuning. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be +found [here](https://github.com/google-research/vision_transformer). + +Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), who already converted the weights from JAX to PyTorch. Credits +go to him! + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. + + + +- [`ViTForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- A blog on fine-tuning [`ViTForImageClassification`] on a custom dataset can be found [here](https://huggingface.co/blog/fine-tune-vit). +- More demo notebooks to fine-tune [`ViTForImageClassification`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). +- [Image classification task guide](../tasks/image_classification) + +Besides that: + +- [`ViTForMaskedImageModeling`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
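+Before diving into the resources below, the tips above can be tied together in a minimal inference sketch. This snippet assumes the `google/vit-base-patch16-224` checkpoint and a sample COCO image; any ViT classification checkpoint should work the same way:
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import ViTForImageClassification, ViTImageProcessor
+
+# load a sample image (any RGB image works)
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# the image processor resizes/rescales and normalizes the image for the model
+processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# this checkpoint is fine-tuned on ImageNet-1k, so there are 1,000 classes
+predicted_class_idx = logits.argmax(-1).item()
+print(model.config.id2label[predicted_class_idx])
+```
+
+For fine-tuning on your own data, the blog posts and notebooks listed below walk through both the [`Trainer`] and PyTorch Lightning workflows.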
+ +`ViTForImageClassification` is supported by: + + +- A blog post on how to [Fine-Tune ViT for Image Classification with Hugging Face Transformers](https://huggingface.co/blog/fine-tune-vit) +- A blog post on [Image Classification with Hugging Face Transformers and `Keras`](https://www.philschmid.de/image-classification-huggingface-transformers-keras) +- A notebook on [Fine-tuning for Image Classification with Hugging Face Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) +- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with the Hugging Face Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) +- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) + +⚗️ Optimization + +- A blog post on how to [Accelerate Vision Transformer (ViT) with Quantization using Optimum](https://www.philschmid.de/optimizing-vision-transformer) + +⚡️ Inference + +- A notebook on [Quick demo: Vision Transformer (ViT) by Google Brain](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Quick_demo_of_HuggingFace_version_of_Vision_Transformer_inference.ipynb) + +🚀 Deploy + +- A blog post on [Deploying Tensorflow Vision Models in Hugging Face with TF Serving](https://huggingface.co/blog/tf-serving-vision) +- A blog post on [Deploying Hugging Face ViT on Vertex AI](https://huggingface.co/blog/deploy-vertex-ai) +- A blog post on [Deploying Hugging Face ViT on Kubernetes with TF Serving](https://huggingface.co/blog/deploy-tfserving-kubernetes) + + +## ViTConfig + +[[autodoc]] ViTConfig + +## ViTFeatureExtractor + +[[autodoc]] ViTFeatureExtractor + - __call__ + + +## ViTImageProcessor + +[[autodoc]] ViTImageProcessor + - preprocess + +## ViTModel + +[[autodoc]] ViTModel + - forward + +## ViTForMaskedImageModeling + +[[autodoc]] ViTForMaskedImageModeling + - forward + +## ViTForImageClassification + +[[autodoc]] ViTForImageClassification + - forward + +## TFViTModel + +[[autodoc]] TFViTModel + - call + +## TFViTForImageClassification + +[[autodoc]] TFViTForImageClassification + - call + +## FlaxVitModel + +[[autodoc]] FlaxViTModel + - __call__ + +## FlaxViTForImageClassification + +[[autodoc]] FlaxViTForImageClassification + - __call__ diff --git a/docs/source/en/model_doc/vit.mdx b/docs/source/en/model_doc/vit.mdx deleted file mode 100644 index de31278dfe75..000000000000 --- a/docs/source/en/model_doc/vit.mdx +++ /dev/null @@ -1,166 +0,0 @@ - - -# Vision Transformer (ViT) - -## Overview - -The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition -at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk -Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob -Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining -very good results compared to familiar convolutional architectures. 
- - -The abstract from the paper is the following: - -*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its -applications to computer vision remain limited. In vision, attention is either applied in conjunction with -convolutional networks, or used to replace certain components of convolutional networks while keeping their overall -structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to -sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of -data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), -Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring -substantially fewer computational resources to train.* - -Tips: - -- Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer). -- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, - which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be - used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of - vectors to a standard Transformer encoder. -- As the Vision Transformer expects each image to be of the same size (resolution), one can use - [`ViTImageProcessor`] to resize (or rescale) and normalize images for the model. -- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of - each checkpoint. For example, `google/vit-base-patch16-224` refers to a base-sized architecture with patch - resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the [hub](https://huggingface.co/models?search=vit). -- The available checkpoints are either (1) pre-trained on [ImageNet-21k](http://www.image-net.org/) (a collection of - 14 million images and 21k classes) only, or (2) also fine-tuned on [ImageNet](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million - images and 1,000 classes). -- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to - use a higher resolution than pre-training [(Touvron et al., 2019)](https://arxiv.org/abs/1906.06423), [(Kolesnikov - et al., 2020)](https://arxiv.org/abs/1912.11370). In order to fine-tune at higher resolution, the authors perform - 2D interpolation of the pre-trained position embeddings, according to their location in the original image. -- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed - an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked - language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant - improvement of 2% to training from scratch, but still 4% behind supervised pre-training. - - - - ViT architecture. Taken from the original paper. - -Following the original Vision Transformer, some follow-up works have been made: - -- [DeiT](deit) (Data-efficient Image Transformers) by Facebook AI. 
DeiT models are distilled vision transformers. - The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into [`ViTModel`] or - [`ViTForImageClassification`]. There are 4 variants available (in 3 different sizes): *facebook/deit-tiny-patch16-224*, - *facebook/deit-small-patch16-224*, *facebook/deit-base-patch16-224* and *facebook/deit-base-patch16-384*. Note that one should - use [`DeiTImageProcessor`] in order to prepare images for the model. - -- [BEiT](beit) (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained - vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE. - -- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using - the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting - objects, without having ever been trained to do so. DINO checkpoints can be found on the [hub](https://huggingface.co/models?other=dino). - -- [MAE](vit_mae) (Masked Autoencoders) by Facebook AI. By pre-training Vision Transformers to reconstruct pixel values for a high portion - (75%) of masked patches (using an asymmetric encoder-decoder architecture), the authors show that this simple method outperforms - supervised pre-training after fine-tuning. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be -found [here](https://github.com/google-research/vision_transformer). - -Note that we converted the weights from Ross Wightman's [timm library](https://github.com/rwightman/pytorch-image-models), who already converted the weights from JAX to PyTorch. Credits -go to him! - - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
- -`ViTForImageClassification` is supported by: - - -- A blog post on how to [Fine-Tune ViT for Image Classification with Hugging Face Transformers](https://huggingface.co/blog/fine-tune-vit) -- A blog post on [Image Classification with Hugging Face Transformers and `Keras`](https://www.philschmid.de/image-classification-huggingface-transformers-keras) -- A notebook on [Fine-tuning for Image Classification with Hugging Face Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) -- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with the Hugging Face Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) -- A notebook on how to [Fine-tune the Vision Transformer on CIFAR-10 with PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) - -⚗️ Optimization - -- A blog post on how to [Accelerate Vision Transformer (ViT) with Quantization using Optimum](https://www.philschmid.de/optimizing-vision-transformer) - -⚡️ Inference - -- A notebook on [Quick demo: Vision Transformer (ViT) by Google Brain](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Quick_demo_of_HuggingFace_version_of_Vision_Transformer_inference.ipynb) - -🚀 Deploy - -- A blog post on [Deploying Tensorflow Vision Models in Hugging Face with TF Serving](https://huggingface.co/blog/tf-serving-vision) -- A blog post on [Deploying Hugging Face ViT on Vertex AI](https://huggingface.co/blog/deploy-vertex-ai) -- A blog post on [Deploying Hugging Face ViT on Kubernetes with TF Serving](https://huggingface.co/blog/deploy-tfserving-kubernetes) - - -## ViTConfig - -[[autodoc]] ViTConfig - -## ViTFeatureExtractor - -[[autodoc]] ViTFeatureExtractor - - __call__ - - -## ViTImageProcessor - -[[autodoc]] ViTImageProcessor - - preprocess - -## ViTModel - -[[autodoc]] ViTModel - - forward - -## ViTForMaskedImageModeling - -[[autodoc]] ViTForMaskedImageModeling - - forward - -## ViTForImageClassification - -[[autodoc]] ViTForImageClassification - - forward - -## TFViTModel - -[[autodoc]] TFViTModel - - call - -## TFViTForImageClassification - -[[autodoc]] TFViTForImageClassification - - call - -## FlaxVitModel - -[[autodoc]] FlaxViTModel - - __call__ - -## FlaxViTForImageClassification - -[[autodoc]] FlaxViTForImageClassification - - __call__ diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md new file mode 100644 index 000000000000..84969cd0f622 --- /dev/null +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -0,0 +1,73 @@ + + +# Hybrid Vision Transformer (ViT Hybrid) + +## Overview + +The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition +at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk +Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob +Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining +very good results compared to familiar convolutional architectures. 
ViT hybrid is a slight variant of the [plain Vision Transformer](vit), +by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer. + + +The abstract from the paper is the following: + +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its +applications to computer vision remain limited. In vision, attention is either applied in conjunction with +convolutional networks, or used to replace certain components of convolutional networks while keeping their overall +structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to +sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train.* + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be +found [here](https://github.com/google-research/vision_transformer). + + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. + + + +- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + +## ViTHybridConfig + +[[autodoc]] ViTHybridConfig + +## ViTHybridImageProcessor + +[[autodoc]] ViTHybridImageProcessor + - preprocess + +## ViTHybridModel + +[[autodoc]] ViTHybridModel + - forward + +## ViTHybridForImageClassification + +[[autodoc]] ViTHybridForImageClassification + - forward diff --git a/docs/source/en/model_doc/vit_hybrid.mdx b/docs/source/en/model_doc/vit_hybrid.mdx deleted file mode 100644 index 8885af0dfe0f..000000000000 --- a/docs/source/en/model_doc/vit_hybrid.mdx +++ /dev/null @@ -1,68 +0,0 @@ - - -# Hybrid Vision Transformer (ViT Hybrid) - -## Overview - -The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition -at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk -Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob -Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining -very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit), -by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer. 
- - -The abstract from the paper is the following: - -*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its -applications to computer vision remain limited. In vision, attention is either applied in conjunction with -convolutional networks, or used to replace certain components of convolutional networks while keeping their overall -structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to -sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of -data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), -Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring -substantially fewer computational resources to train.* - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be -found [here](https://github.com/google-research/vision_transformer). - - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. - - - -- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). - -If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - -## ViTHybridConfig - -[[autodoc]] ViTHybridConfig - -## ViTHybridImageProcessor - -[[autodoc]] ViTHybridImageProcessor - - preprocess - -## ViTHybridModel - -[[autodoc]] ViTHybridModel - - forward - -## ViTHybridForImageClassification - -[[autodoc]] ViTHybridForImageClassification - - forward diff --git a/docs/source/en/model_doc/vit_mae.md b/docs/source/en/model_doc/vit_mae.md new file mode 100644 index 000000000000..c14cc7e57c90 --- /dev/null +++ b/docs/source/en/model_doc/vit_mae.md @@ -0,0 +1,90 @@ + + +# ViTMAE + +## Overview + +The ViTMAE model was proposed in [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v2) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, +Piotr Dollár, Ross Girshick. The paper shows that, by pre-training a Vision Transformer (ViT) to reconstruct pixel values for masked patches, one can get results after +fine-tuning that outperform supervised pre-training. + +The abstract from the paper is the following: + +*This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the +input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates +only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask +tokens. Second, we find that masking a high proportion of the input image, e.g., 75%, yields a nontrivial and meaningful self-supervisory task. 
Coupling these two designs +enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity +models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream +tasks outperforms supervised pre-training and shows promising scaling behavior.* + +Tips: + +- MAE (masked auto encoding) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training objective is relatively simple: +by masking a large portion (75%) of the image patches, the model must reconstruct raw pixel values. One can use [`ViTMAEForPreTraining`] for this purpose. +- After pre-training, one "throws away" the decoder used to reconstruct pixels, and one uses the encoder for fine-tuning/linear probing. This means that after +fine-tuning, one can directly plug in the weights into a [`ViTForImageClassification`]. +- One can use [`ViTImageProcessor`] to prepare images for the model. See the code examples for more info. +- Note that the encoder of MAE is only used to encode the visual patches. The encoded patches are then concatenated with mask tokens, which the decoder (which also +consists of Transformer blocks) takes as input. Each mask token is a shared, learned vector that indicates the presence of a missing patch to be predicted. Fixed +sin/cos position embeddings are added both to the input of the encoder and the decoder. +- For a visual understanding of how MAEs work you can check out this [post](https://keras.io/examples/vision/masked_image_modeling/). + + + + MAE architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [sayakpaul](https://github.com/sayakpaul) and +[ariG23498](https://github.com/ariG23498) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/mae). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMAE. + +- [`ViTMAEForPreTraining`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining), allowing you to pre-train the model from scratch/further pre-train the model on custom data. +- A notebook that illustrates how to visualize reconstructed pixel values with [`ViTMAEForPreTraining`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
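+The tips above translate into a short sketch: the snippet below runs [`ViTMAEForPreTraining`] on a single image and inspects the reconstruction loss and the random patch mask. It assumes the `facebook/vit-mae-base` checkpoint and a sample COCO image:
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import AutoImageProcessor, ViTMAEForPreTraining
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
+model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+loss = outputs.loss  # pixel reconstruction loss, computed on the masked patches only
+mask = outputs.mask  # 1 for masked patches, 0 for visible ones (roughly 75% are masked)
+logits = outputs.logits  # predicted pixel values for each patch
+```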
+ +## ViTMAEConfig + +[[autodoc]] ViTMAEConfig + + +## ViTMAEModel + +[[autodoc]] ViTMAEModel + - forward + + +## ViTMAEForPreTraining + +[[autodoc]] transformers.ViTMAEForPreTraining + - forward + + +## TFViTMAEModel + +[[autodoc]] TFViTMAEModel + - call + + +## TFViTMAEForPreTraining + +[[autodoc]] transformers.TFViTMAEForPreTraining + - call diff --git a/docs/source/en/model_doc/vit_mae.mdx b/docs/source/en/model_doc/vit_mae.mdx deleted file mode 100644 index 454423707046..000000000000 --- a/docs/source/en/model_doc/vit_mae.mdx +++ /dev/null @@ -1,81 +0,0 @@ - - -# ViTMAE - -## Overview - -The ViTMAE model was proposed in [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v2) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, -Piotr Dollár, Ross Girshick. The paper shows that, by pre-training a Vision Transformer (ViT) to reconstruct pixel values for masked patches, one can get results after -fine-tuning that outperform supervised pre-training. - -The abstract from the paper is the following: - -*This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the -input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates -only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask -tokens. Second, we find that masking a high proportion of the input image, e.g., 75%, yields a nontrivial and meaningful self-supervisory task. Coupling these two designs -enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity -models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream -tasks outperforms supervised pre-training and shows promising scaling behavior.* - -Tips: - -- MAE (masked auto encoding) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training objective is relatively simple: -by masking a large portion (75%) of the image patches, the model must reconstruct raw pixel values. One can use [`ViTMAEForPreTraining`] for this purpose. -- An example Python script that illustrates how to pre-train [`ViTMAEForPreTraining`] from scratch can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). -One can easily tweak it for their own use case. -- A notebook that illustrates how to visualize reconstructed pixel values with [`ViTMAEForPreTraining`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb). -- After pre-training, one "throws away" the decoder used to reconstruct pixels, and one uses the encoder for fine-tuning/linear probing. This means that after -fine-tuning, one can directly plug in the weights into a [`ViTForImageClassification`]. -- One can use [`ViTImageProcessor`] to prepare images for the model. See the code examples for more info. -- Note that the encoder of MAE is only used to encode the visual patches. The encoded patches are then concatenated with mask tokens, which the decoder (which also -consists of Transformer blocks) takes as input. 
Each mask token is a shared, learned vector that indicates the presence of a missing patch to be predicted. Fixed -sin/cos position embeddings are added both to the input of the encoder and the decoder. -- For a visual understanding of how MAEs work you can check out this [post](https://keras.io/examples/vision/masked_image_modeling/). - - - - MAE architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [sayakpaul](https://github.com/sayakpaul) and -[ariG23498](https://github.com/ariG23498) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/mae). - - -## ViTMAEConfig - -[[autodoc]] ViTMAEConfig - - -## ViTMAEModel - -[[autodoc]] ViTMAEModel - - forward - - -## ViTMAEForPreTraining - -[[autodoc]] transformers.ViTMAEForPreTraining - - forward - - -## TFViTMAEModel - -[[autodoc]] TFViTMAEModel - - call - - -## TFViTMAEForPreTraining - -[[autodoc]] transformers.TFViTMAEForPreTraining - - call diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md new file mode 100644 index 000000000000..ded0245194f8 --- /dev/null +++ b/docs/source/en/model_doc/vit_msn.md @@ -0,0 +1,78 @@ + + +# ViTMSN + +## Overview + +The ViTMSN model was proposed in [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, +Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. The paper presents a joint-embedding architecture to match the prototypes +of masked patches with that of the unmasked patches. With this setup, their method yields excellent performance in the low-shot and extreme low-shot +regimes. + +The abstract from the paper is the following: + +*We propose Masked Siamese Networks (MSN), a self-supervised learning framework for learning image representations. Our +approach matches the representation of an image view containing randomly masked patches to the representation of the original +unmasked image. This self-supervised pre-training strategy is particularly scalable when applied to Vision Transformers since only the +unmasked patches are processed by the network. As a result, MSNs improve the scalability of joint-embedding architectures, +while producing representations of a high semantic level that perform competitively on low-shot image classification. For instance, +on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, +and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark.* + +Tips: + +- MSN (masked siamese networks) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training +objective is to match the prototypes assigned to the unmasked views of the images to that of the masked views of the same images. +- The authors have only released pre-trained weights of the backbone (ImageNet-1k pre-training). So, to use that on your own image classification dataset, +use the [`ViTMSNForImageClassification`] class which is initialized from [`ViTMSNModel`]. Follow +[this notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) for a detailed tutorial on fine-tuning. +- MSN is particularly useful in the low-shot and extreme low-shot regimes. 
Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K +labels when fine-tuned. + + +drawing + + MSN architecture. Taken from the original paper. + +This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT MSN. + + + +- [`ViTMSNForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## ViTMSNConfig + +[[autodoc]] ViTMSNConfig + + +## ViTMSNModel + +[[autodoc]] ViTMSNModel + - forward + + +## ViTMSNForImageClassification + +[[autodoc]] ViTMSNForImageClassification + - forward diff --git a/docs/source/en/model_doc/vit_msn.mdx b/docs/source/en/model_doc/vit_msn.mdx deleted file mode 100644 index 07faed51e6cb..000000000000 --- a/docs/source/en/model_doc/vit_msn.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# ViTMSN - -## Overview - -The ViTMSN model was proposed in [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, -Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. The paper presents a joint-embedding architecture to match the prototypes -of masked patches with that of the unmasked patches. With this setup, their method yields excellent performance in the low-shot and extreme low-shot -regimes. - -The abstract from the paper is the following: - -*We propose Masked Siamese Networks (MSN), a self-supervised learning framework for learning image representations. Our -approach matches the representation of an image view containing randomly masked patches to the representation of the original -unmasked image. This self-supervised pre-training strategy is particularly scalable when applied to Vision Transformers since only the -unmasked patches are processed by the network. As a result, MSNs improve the scalability of joint-embedding architectures, -while producing representations of a high semantic level that perform competitively on low-shot image classification. For instance, -on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, -and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark.* - -Tips: - -- MSN (masked siamese networks) is a method for self-supervised pre-training of Vision Transformers (ViTs). The pre-training -objective is to match the prototypes assigned to the unmasked views of the images to that of the masked views of the same images. -- The authors have only released pre-trained weights of the backbone (ImageNet-1k pre-training). So, to use that on your own image classification dataset, -use the [`ViTMSNForImageClassification`] class which is initialized from [`ViTMSNModel`]. 
Follow -[this notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) for a detailed tutorial on fine-tuning. -- MSN is particularly useful in the low-shot and extreme low-shot regimes. Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K -labels when fine-tuned. - - -drawing - - MSN architecture. Taken from the original paper. - -This model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/facebookresearch/msn). - - -## ViTMSNConfig - -[[autodoc]] ViTMSNConfig - - -## ViTMSNModel - -[[autodoc]] ViTMSNModel - - forward - - -## ViTMSNForImageClassification - -[[autodoc]] ViTMSNForImageClassification - - forward diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md new file mode 100644 index 000000000000..657e467ee319 --- /dev/null +++ b/docs/source/en/model_doc/vitdet.md @@ -0,0 +1,39 @@ + + +# ViTDet + +## Overview + +The ViTDet model was proposed in [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. +VitDet leverages the plain [Vision Transformer](vit) for the task of object detection. + +The abstract from the paper is the following: + +*We explore the plain, non-hierarchical Vision Transformer (ViT) as a backbone network for object detection. This design enables the original ViT architecture to be fine-tuned for object detection without needing to redesign a hierarchical backbone for pre-training. With minimal adaptations for fine-tuning, our plain-backbone detector can achieve competitive results. Surprisingly, we observe: (i) it is sufficient to build a simple feature pyramid from a single-scale feature map (without the common FPN design) and (ii) it is sufficient to use window attention (without shifting) aided with very few cross-window propagation blocks. With plain ViT backbones pre-trained as Masked Autoencoders (MAE), our detector, named ViTDet, can compete with the previous leading methods that were all based on hierarchical backbones, reaching up to 61.3 AP_box on the COCO dataset using only ImageNet-1K pre-training. We hope our study will draw attention to research on plain-backbone detectors.* + +Tips: + +- For the moment, only the backbone is available. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet). + + +## VitDetConfig + +[[autodoc]] VitDetConfig + +## VitDetModel + +[[autodoc]] VitDetModel + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md new file mode 100644 index 000000000000..479b398f8066 --- /dev/null +++ b/docs/source/en/model_doc/vitmatte.md @@ -0,0 +1,55 @@ + + +# ViTMatte + +## Overview + +The ViTMatte model was proposed in [Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. +ViTMatte leverages plain [Vision Transformers](vit) for the task of image matting, which is the process of accurately estimating the foreground object in images and videos. + +The abstract from the paper is the following: + +*Recently, plain vision Transformers (ViTs) have shown impressive performance on various computer vision tasks, thanks to their strong modeling capacity and large-scale pretraining. 
However, they have not yet conquered the problem of image matting. We hypothesize that image matting could also be boosted by ViTs and present a new efficient and robust ViT-based matting system, named ViTMatte. Our method utilizes (i) a hybrid attention mechanism combined with a convolution neck to help ViTs achieve an excellent performance-computation trade-off in matting tasks. (ii) Additionally, we introduce the detail capture module, which just consists of simple lightweight convolutions to complement the detailed information required by matting. To the best of our knowledge, ViTMatte is the first work to unleash the potential of ViT on image matting with concise adaptation. It inherits many superior properties from ViT to matting, including various pretraining strategies, concise architecture design, and flexible inference strategies. We evaluate ViTMatte on Composition-1k and Distinctions-646, the most commonly used benchmark for image matting, our method achieves state-of-the-art performance and outperforms prior matting works by a large margin.* + +Tips: + +- The model expects both the image and trimap (concatenated) as input. One can use [`ViTMatteImageProcessor`] for this purpose. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/hustvl/ViTMatte). + + + + ViTMatte high-level overview. Taken from the original paper. + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMatte. + +- A demo notebook regarding inference with [`VitMatteForImageMatting`], including background replacement, can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViTMatte). + + +## VitMatteConfig + +[[autodoc]] VitMatteConfig + +## VitMatteImageProcessor + +[[autodoc]] VitMatteImageProcessor + - preprocess + +## VitMatteForImageMatting + +[[autodoc]] VitMatteForImageMatting + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md new file mode 100644 index 000000000000..1b57df4027dd --- /dev/null +++ b/docs/source/en/model_doc/vits.md @@ -0,0 +1,162 @@ + + +# VITS + +## Overview + +The VITS model was proposed in [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. + + +VITS (**V**ariational **I**nference with adversarial learning for end-to-end **T**ext-to-**S**peech) is an end-to-end +speech synthesis model that predicts a speech waveform conditional on an input text sequence. It is a conditional variational +autoencoder (VAE) comprised of a posterior encoder, decoder, and conditional prior. + +A set of spectrogram-based acoustic features are predicted by the flow-based module, which is formed of a Transformer-based +text encoder and multiple coupling layers. The spectrogram is decoded using a stack of transposed convolutional layers, +much in the same style as the HiFi-GAN vocoder. Motivated by the one-to-many nature of the TTS problem, where the same text +input can be spoken in multiple ways, the model also includes a stochastic duration predictor, which allows the model to +synthesise speech with different rhythms from the same input text. + +The model is trained end-to-end with a combination of losses derived from variational lower bound and adversarial training. 
+To improve the expressiveness of the model, normalizing flows are applied to the conditional prior distribution. During +inference, the text encodings are up-sampled based on the duration prediction module, and then mapped into the +waveform using a cascade of the flow module and HiFi-GAN decoder. Due to the stochastic nature of the duration predictor, +the model is non-deterministic, and thus requires a fixed seed to generate the same speech waveform. + +The abstract from the paper is the following: + +*Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel sampling have been proposed, but their sample quality does not match that of two-stage TTS systems. In this work, we present a parallel end-to-end TTS method that generates more natural sounding audio than current two-stage models. Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS) on the LJ Speech, a single speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth.* + +This model can also be used with TTS checkpoints from [Massively Multilingual Speech (MMS)](https://arxiv.org/abs/2305.13516) +as these checkpoints use the same architecture and a slightly modified tokenizer. + +This model was contributed by [Matthijs](https://huggingface.co/Matthijs) and [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original code can be found [here](https://github.com/jaywalnut310/vits). + +## Model Usage + +Both the VITS and MMS-TTS checkpoints can be used with the same API. Since the flow-based model is non-deterministic, it +is good practice to set a seed to ensure reproducibility of the outputs. For languages with a Roman alphabet, +such as English or French, the tokenizer can be used directly to pre-process the text inputs. The following code example +runs a forward pass using the MMS-TTS English checkpoint: + +```python +import torch +from transformers import VitsTokenizer, VitsModel, set_seed + +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +model = VitsModel.from_pretrained("facebook/mms-tts-eng") + +inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt") + +set_seed(555) # make deterministic + +with torch.no_grad(): + outputs = model(**inputs) + +waveform = outputs.waveform[0] +``` + +The resulting waveform can be saved as a `.wav` file: + +```python +import scipy + +scipy.io.wavfile.write("techno.wav", rate=model.config.sampling_rate, data=waveform) +``` + +Or displayed in a Jupyter Notebook / Google Colab: + +```python +from IPython.display import Audio + +Audio(waveform, rate=model.config.sampling_rate) +``` + +For certain languages with a non-Roman alphabet, such as Arabic, Mandarin or Hindi, the [`uroman`](https://github.com/isi-nlp/uroman) +perl package is required to pre-process the text inputs to the Roman alphabet. 
+ +You can check whether you require the `uroman` package for your language by inspecting the `is_uroman` attribute of +the pre-trained `tokenizer`: + +```python +from transformers import VitsTokenizer + +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +print(tokenizer.is_uroman) +``` + +If required, you should apply the uroman package to your text inputs **prior** to passing them to the `VitsTokenizer`, +since currently the tokenizer does not support performing the pre-processing itself. + +To do this, first clone the uroman repository to your local machine and set the bash variable `UROMAN` to the local path: + +```bash +git clone https://github.com/isi-nlp/uroman.git +cd uroman +export UROMAN=$(pwd) +``` + +You can then pre-process the text input using the following code snippet. You can either rely on using the bash variable +`UROMAN` to point to the uroman repository, or you can pass the uroman directory as an argument to the `uromaize` function: + +```python +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +import os +import subprocess + +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor") +model = VitsModel.from_pretrained("facebook/mms-tts-kor") + +def uromanize(input_string, uroman_path): + """Convert non-Roman strings to Roman using the `uroman` perl package.""" + script_path = os.path.join(uroman_path, "bin", "uroman.pl") + + command = ["perl", script_path] + + process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # Execute the perl command + stdout, stderr = process.communicate(input=input_string.encode()) + + if process.returncode != 0: + raise ValueError(f"Error {process.returncode}: {stderr.decode()}") + + # Return the output as a string and skip the new-line character at the end + return stdout.decode()[:-1] + +text = "이봐 무슨 일이야" +uromaized_text = uromanize(text, uroman_path=os.environ["UROMAN"]) + +inputs = tokenizer(text=uromaized_text, return_tensors="pt") + +set_seed(555) # make deterministic +with torch.no_grad(): + outputs = model(inputs["input_ids"]) + +waveform = outputs.waveform[0] +``` + +## VitsConfig + +[[autodoc]] VitsConfig + +## VitsTokenizer + +[[autodoc]] VitsTokenizer + - __call__ + - save_vocabulary + +## VitsModel + +[[autodoc]] VitsModel + - forward diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md new file mode 100644 index 000000000000..755629a76752 --- /dev/null +++ b/docs/source/en/model_doc/vivit.md @@ -0,0 +1,44 @@ + + +# Video Vision Transformer (ViViT) + +## Overview + +The Vivit model was proposed in [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. +The paper proposes one of the first successful pure-transformer based set of models for video understanding. + +The abstract from the paper is the following: + +*We present pure-transformer based models for video classification, drawing upon the recent success of such models in image classification. Our model extracts spatio-temporal tokens from the input video, which are then encoded by a series of transformer layers. In order to handle the long sequences of tokens encountered in video, we propose several, efficient variants of our model which factorise the spatial- and temporal-dimensions of the input. 
Although transformer-based models are known to only be effective when large training datasets are available, we show how we can effectively regularise the model during training and leverage pretrained image models to be able to train on comparatively small datasets. We conduct thorough ablation studies, and achieve state-of-the-art results on multiple video classification benchmarks including Kinetics 400 and 600, Epic Kitchens, Something-Something v2 and Moments in Time, outperforming prior methods based on deep 3D convolutional networks.* + + +This model was contributed by [jegormeister](https://huggingface.co/jegormeister). The original code (written in JAX) can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit). + +## VivitConfig + +[[autodoc]] VivitConfig + +## VivitImageProcessor + +[[autodoc]] VivitImageProcessor + - preprocess + +## VivitModel + +[[autodoc]] VivitModel + - forward + +## VivitForVideoClassification + +[[autodoc]] transformers.VivitForVideoClassification + - forward diff --git a/docs/source/en/model_doc/wav2vec2-conformer.md b/docs/source/en/model_doc/wav2vec2-conformer.md new file mode 100644 index 000000000000..87e255cd0c6e --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2-conformer.md @@ -0,0 +1,81 @@ + + +# Wav2Vec2-Conformer + +## Overview + +The Wav2Vec2-Conformer was added to an updated version of [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. + +The official results of the model can be found in Table 3 and Table 4 of the paper. + +The Wav2Vec2-Conformer weights were released by the Meta AI team within the [Fairseq library](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md#pre-trained-models). + +Tips: + +- Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block + as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100). +- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields +an improved word error rate. +- Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2. +- Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or + rotary position embeddings by setting the correct `config.position_embeddings_type`. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). +The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). 
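+
+As a minimal sketch of the tips above, a Wav2Vec2-Conformer CTC checkpoint can be used exactly like a Wav2Vec2 one.
+The checkpoint name below is an assumption; any Wav2Vec2-Conformer CTC checkpoint from the Hub follows the same pattern:
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Wav2Vec2ConformerForCTC
+
+# assumed checkpoint; the processor is the same tokenizer/feature extractor pair as Wav2Vec2
+checkpoint = "facebook/wav2vec2-conformer-rope-large-960h-ft"
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = Wav2Vec2ConformerForCTC.from_pretrained(checkpoint)
+
+# load a 16 kHz example utterance
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio = dataset[0]["audio"]["array"]
+
+inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding back to text
+predicted_ids = torch.argmax(logits, dim=-1)
+transcription = processor.batch_decode(predicted_ids)[0]
+```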
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## Wav2Vec2ConformerConfig + +[[autodoc]] Wav2Vec2ConformerConfig + +## Wav2Vec2Conformer specific outputs + +[[autodoc]] models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForPreTrainingOutput + +## Wav2Vec2ConformerModel + +[[autodoc]] Wav2Vec2ConformerModel + - forward + +## Wav2Vec2ConformerForCTC + +[[autodoc]] Wav2Vec2ConformerForCTC + - forward + +## Wav2Vec2ConformerForSequenceClassification + +[[autodoc]] Wav2Vec2ConformerForSequenceClassification + - forward + +## Wav2Vec2ConformerForAudioFrameClassification + +[[autodoc]] Wav2Vec2ConformerForAudioFrameClassification + - forward + +## Wav2Vec2ConformerForXVector + +[[autodoc]] Wav2Vec2ConformerForXVector + - forward + +## Wav2Vec2ConformerForPreTraining + +[[autodoc]] Wav2Vec2ConformerForPreTraining + - forward diff --git a/docs/source/en/model_doc/wav2vec2-conformer.mdx b/docs/source/en/model_doc/wav2vec2-conformer.mdx deleted file mode 100644 index 2cfb38553f1e..000000000000 --- a/docs/source/en/model_doc/wav2vec2-conformer.mdx +++ /dev/null @@ -1,73 +0,0 @@ - - -# Wav2Vec2-Conformer - -## Overview - -The Wav2Vec2-Conformer was added to an updated version of [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. - -The official results of the model can be found in Table 3 and Table 4 of the paper. - -The Wav2Vec2-Conformer weights were released by the Meta AI team within the [Fairseq library](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md#pre-trained-models). - -Tips: - -- Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block - as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100). -- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields -an improved word error rate. -- Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2. -- Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or - rotary position embeddings by setting the correct `config.position_embeddings_type`. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). -The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). 
- - -## Wav2Vec2ConformerConfig - -[[autodoc]] Wav2Vec2ConformerConfig - -## Wav2Vec2Conformer specific outputs - -[[autodoc]] models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForPreTrainingOutput - -## Wav2Vec2ConformerModel - -[[autodoc]] Wav2Vec2ConformerModel - - forward - -## Wav2Vec2ConformerForCTC - -[[autodoc]] Wav2Vec2ConformerForCTC - - forward - -## Wav2Vec2ConformerForSequenceClassification - -[[autodoc]] Wav2Vec2ConformerForSequenceClassification - - forward - -## Wav2Vec2ConformerForAudioFrameClassification - -[[autodoc]] Wav2Vec2ConformerForAudioFrameClassification - - forward - -## Wav2Vec2ConformerForXVector - -[[autodoc]] Wav2Vec2ConformerForXVector - - forward - -## Wav2Vec2ConformerForPreTraining - -[[autodoc]] Wav2Vec2ConformerForPreTraining - - forward diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md new file mode 100644 index 000000000000..3a67f66d9d1f --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2.md @@ -0,0 +1,229 @@ + + +# Wav2Vec2 + +## Overview + +The Wav2Vec2 model was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. + +The abstract from the paper is the following: + +*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on +transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks +the speech input in the latent space and solves a contrastive task defined over a quantization of the latent +representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the +clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state +of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and +pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech +recognition with limited amounts of labeled data.* + +Tips: + +- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded + using [`Wav2Vec2CTCTokenizer`]. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + +- A notebook on how to [leverage a pretrained Wav2Vec2 model for emotion classification](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb). 🌎 +- [`Wav2Vec2ForCTC`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). 
+- [Audio classification task guide](../tasks/audio_classification) + + + +- A blog post on [boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram). +- A blog post on how to [finetune Wav2Vec2 for English ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-wav2vec2-english). +- A blog post on [finetuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2). +- A notebook on how to [create YouTube captions from any video by transcribing audio with Wav2Vec2](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb). 🌎 +- [`Wav2Vec2ForCTC`] is supported by a notebook on [how to finetune a speech recognition model in English](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb), and [how to finetune a speech recognition model in any language](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb). +- [Automatic speech recognition task guide](../tasks/asr) + +🚀 Deploy + +- A blog post on how to deploy Wav2Vec2 for [Automatic Speech Recogntion with Hugging Face's Transformers & Amazon SageMaker](https://www.philschmid.de/automatic-speech-recognition-sagemaker). + +## Wav2Vec2Config + +[[autodoc]] Wav2Vec2Config + +## Wav2Vec2CTCTokenizer + +[[autodoc]] Wav2Vec2CTCTokenizer + - __call__ + - save_vocabulary + - decode + - batch_decode + - set_target_lang + +## Wav2Vec2FeatureExtractor + +[[autodoc]] Wav2Vec2FeatureExtractor + - __call__ + +## Wav2Vec2Processor + +[[autodoc]] Wav2Vec2Processor + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## Wav2Vec2ProcessorWithLM + +[[autodoc]] Wav2Vec2ProcessorWithLM + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +### Decoding multiple audios + +If you are planning to decode multiple batches of audios, you should consider using [`~Wav2Vec2ProcessorWithLM.batch_decode`] and passing an instantiated `multiprocessing.Pool`. +Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower than calling [`~Wav2Vec2ProcessorWithLM.decode`] for each audio individually, as it internally instantiates a new `Pool` for every call. See the example below: + +```python +>>> # Let's see how to use a user-managed pool for batch decoding multiple audios +>>> from multiprocessing import get_context +>>> from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC +>>> from datasets import load_dataset +>>> import datasets +>>> import torch + +>>> # import model, feature extractor, tokenizer +>>> model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm").to("cuda") +>>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") + +>>> # load example dataset +>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) + + +>>> def map_to_array(batch): +... batch["speech"] = batch["audio"]["array"] +... return batch + + +>>> # prepare speech data for batch inference +>>> dataset = dataset.map(map_to_array, remove_columns=["audio"]) + + +>>> def map_to_pred(batch, pool): +... inputs = processor(batch["speech"], sampling_rate=16_000, padding=True, return_tensors="pt") +... 
inputs = {k: v.to("cuda") for k, v in inputs.items()} + +... with torch.no_grad(): +... logits = model(**inputs).logits + +... transcription = processor.batch_decode(logits.cpu().numpy(), pool).text +... batch["transcription"] = transcription +... return batch + + +>>> # note: pool should be instantiated *after* `Wav2Vec2ProcessorWithLM`. +>>> # otherwise, the LM won't be available to the pool's sub-processes +>>> # select number of processes and batch_size based on number of CPU cores available and on dataset size +>>> with get_context("fork").Pool(processes=2) as pool: +... result = dataset.map( +... map_to_pred, batched=True, batch_size=2, fn_kwargs={"pool": pool}, remove_columns=["speech"] +... ) + +>>> result["transcription"][:2] +['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL', "NOR IS MISTER COULTER'S MANNER LESS INTERESTING THAN HIS MATTER"] +``` + +## Wav2Vec2 specific outputs + +[[autodoc]] models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput + +[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput + +[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput + +[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput + +[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput + +## Wav2Vec2Model + +[[autodoc]] Wav2Vec2Model + - forward + +## Wav2Vec2ForCTC + +[[autodoc]] Wav2Vec2ForCTC + - forward + - load_adapter + +## Wav2Vec2ForSequenceClassification + +[[autodoc]] Wav2Vec2ForSequenceClassification + - forward + +## Wav2Vec2ForAudioFrameClassification + +[[autodoc]] Wav2Vec2ForAudioFrameClassification + - forward + +## Wav2Vec2ForXVector + +[[autodoc]] Wav2Vec2ForXVector + - forward + +## Wav2Vec2ForPreTraining + +[[autodoc]] Wav2Vec2ForPreTraining + - forward + +## TFWav2Vec2Model + +[[autodoc]] TFWav2Vec2Model + - call + +## TFWav2Vec2ForSequenceClassification + +[[autodoc]] TFWav2Vec2ForSequenceClassification + - call + +## TFWav2Vec2ForCTC + +[[autodoc]] TFWav2Vec2ForCTC + - call + +## FlaxWav2Vec2Model + +[[autodoc]] FlaxWav2Vec2Model + - __call__ + +## FlaxWav2Vec2ForCTC + +[[autodoc]] FlaxWav2Vec2ForCTC + - __call__ + +## FlaxWav2Vec2ForPreTraining + +[[autodoc]] FlaxWav2Vec2ForPreTraining + - __call__ diff --git a/docs/source/en/model_doc/wav2vec2.mdx b/docs/source/en/model_doc/wav2vec2.mdx deleted file mode 100644 index 3acf176a27a8..000000000000 --- a/docs/source/en/model_doc/wav2vec2.mdx +++ /dev/null @@ -1,216 +0,0 @@ - - -# Wav2Vec2 - -## Overview - -The Wav2Vec2 model was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. - -The abstract from the paper is the following: - -*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on -transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks -the speech input in the latent space and solves a contrastive task defined over a quantization of the latent -representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the -clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state -of the art on the 100 hour subset while using 100 times less labeled data. 
Using just ten minutes of labeled data and -pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech -recognition with limited amounts of labeled data.* - -Tips: - -- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded - using [`Wav2Vec2CTCTokenizer`]. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A notebook on how to [leverage a pretrained Wav2Vec2 model for emotion classification](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb). 🌎 -- [`Wav2Vec2ForCTC`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). - - - -- A blog post on [boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram). -- A blog post on how to [finetune Wav2Vec2 for English ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-wav2vec2-english). -- A blog post on [finetuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2). -- A notebook on how to [create YouTube captions from any video by transcribing audio with Wav2Vec2](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb). 🌎 -- [`Wav2Vec2ForCTC`] is supported by a notebook on [how to finetune a speech recognition model in English](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb), and [how to finetune a speech recognition model in any language](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb). - -🚀 Deploy - -- A blog post on how to deploy Wav2Vec2 for [Automatic Speech Recogntion with Hugging Face's Transformers & Amazon SageMaker](https://www.philschmid.de/automatic-speech-recognition-sagemaker). - -## Wav2Vec2Config - -[[autodoc]] Wav2Vec2Config - -## Wav2Vec2CTCTokenizer - -[[autodoc]] Wav2Vec2CTCTokenizer - - __call__ - - save_vocabulary - - decode - - batch_decode - -## Wav2Vec2FeatureExtractor - -[[autodoc]] Wav2Vec2FeatureExtractor - - __call__ - -## Wav2Vec2Processor - -[[autodoc]] Wav2Vec2Processor - - __call__ - - pad - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## Wav2Vec2ProcessorWithLM - -[[autodoc]] Wav2Vec2ProcessorWithLM - - __call__ - - pad - - from_pretrained - - save_pretrained - - batch_decode - - decode - -### Decoding multiple audios - -If you are planning to decode multiple batches of audios, you should consider using [`~Wav2Vec2ProcessorWithLM.batch_decode`] and passing an instantiated `multiprocessing.Pool`. 
-Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower than calling [`~Wav2Vec2ProcessorWithLM.decode`] for each audio individually, as it internally instantiates a new `Pool` for every call. See the example below: - -```python ->>> # Let's see how to use a user-managed pool for batch decoding multiple audios ->>> from multiprocessing import get_context ->>> from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC ->>> from datasets import load_dataset ->>> import datasets ->>> import torch - ->>> # import model, feature extractor, tokenizer ->>> model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm").to("cuda") ->>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") - ->>> # load example dataset ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) - - ->>> def map_to_array(batch): -... batch["speech"] = batch["audio"]["array"] -... return batch - - ->>> # prepare speech data for batch inference ->>> dataset = dataset.map(map_to_array, remove_columns=["audio"]) - - ->>> def map_to_pred(batch, pool): -... inputs = processor(batch["speech"], sampling_rate=16_000, padding=True, return_tensors="pt") -... inputs = {k: v.to("cuda") for k, v in inputs.items()} - -... with torch.no_grad(): -... logits = model(**inputs).logits - -... transcription = processor.batch_decode(logits.cpu().numpy(), pool).text -... batch["transcription"] = transcription -... return batch - - ->>> # note: pool should be instantiated *after* `Wav2Vec2ProcessorWithLM`. ->>> # otherwise, the LM won't be available to the pool's sub-processes ->>> # select number of processes and batch_size based on number of CPU cores available and on dataset size ->>> with get_context("fork").Pool(processes=2) as pool: -... result = dataset.map( -... map_to_pred, batched=True, batch_size=2, fn_kwargs={"pool": pool}, remove_columns=["speech"] -... 
) - ->>> result["transcription"][:2] -['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL', "NOR IS MISTER COULTER'S MANNER LESS INTERESTING THAN HIS MATTER"] -``` - -## Wav2Vec2 specific outputs - -[[autodoc]] models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput - -[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput - -[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput - -[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput - -[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput - -## Wav2Vec2Model - -[[autodoc]] Wav2Vec2Model - - forward - -## Wav2Vec2ForCTC - -[[autodoc]] Wav2Vec2ForCTC - - forward - -## Wav2Vec2ForSequenceClassification - -[[autodoc]] Wav2Vec2ForSequenceClassification - - forward - -## Wav2Vec2ForAudioFrameClassification - -[[autodoc]] Wav2Vec2ForAudioFrameClassification - - forward - -## Wav2Vec2ForXVector - -[[autodoc]] Wav2Vec2ForXVector - - forward - -## Wav2Vec2ForPreTraining - -[[autodoc]] Wav2Vec2ForPreTraining - - forward - -## TFWav2Vec2Model - -[[autodoc]] TFWav2Vec2Model - - call - -## TFWav2Vec2ForCTC - -[[autodoc]] TFWav2Vec2ForCTC - - call - -## FlaxWav2Vec2Model - -[[autodoc]] FlaxWav2Vec2Model - - __call__ - -## FlaxWav2Vec2ForCTC - -[[autodoc]] FlaxWav2Vec2ForCTC - - __call__ - -## FlaxWav2Vec2ForPreTraining - -[[autodoc]] FlaxWav2Vec2ForPreTraining - - __call__ diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.md b/docs/source/en/model_doc/wav2vec2_phoneme.md new file mode 100644 index 000000000000..a852bef637b2 --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2_phoneme.md @@ -0,0 +1,60 @@ + + +# Wav2Vec2Phoneme + +## Overview + +The Wav2Vec2Phoneme model was proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., +2021](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. + +The abstract from the paper is the following: + +*Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech +recognition systems without any labeled data. However, in many cases there is labeled data available for related +languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer +learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by +mapping phonemes of the training languages to the target language using articulatory features. Experiments show that +this simple method significantly outperforms prior work which introduced task-specific architectures and used only part +of a monolingually pretrained model.* + +Tips: + +- Wav2Vec2Phoneme uses the exact same architecture as Wav2Vec2 +- Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- Wav2Vec2Phoneme model was trained using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2PhonemeCTCTokenizer`]. +- Wav2Vec2Phoneme can be fine-tuned on multiple language at once and decode unseen languages in a single forward pass + to a sequence of phonemes +- By default the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one + should make use of a dictionary and language model. 
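+
+A minimal sketch of this recognition loop is shown below. The checkpoint name is an assumption (any phoneme-recognition
+checkpoint from the Hub follows the same pattern), and the decoded output is a sequence of phonemes rather than words:
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Wav2Vec2ForCTC
+
+# assumed checkpoint name; the associated tokenizer is a Wav2Vec2PhonemeCTCTokenizer
+checkpoint = "facebook/wav2vec2-lv-60-espeak-cv-ft"
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
+
+# load a 16 kHz example utterance
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio = dataset[0]["audio"]["array"]
+
+inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# greedy CTC decoding yields a string of phoneme symbols rather than words
+predicted_ids = torch.argmax(logits, dim=-1)
+phonemes = processor.batch_decode(predicted_ids)[0]
+```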
+ +Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten) + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). + +Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, so one can refer to [`Wav2Vec2`]'s documentation page except for the tokenizer. + + +## Wav2Vec2PhonemeCTCTokenizer + +[[autodoc]] Wav2Vec2PhonemeCTCTokenizer + - __call__ + - batch_decode + - decode + - phonemize diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.mdx b/docs/source/en/model_doc/wav2vec2_phoneme.mdx deleted file mode 100644 index b39cf66ce136..000000000000 --- a/docs/source/en/model_doc/wav2vec2_phoneme.mdx +++ /dev/null @@ -1,56 +0,0 @@ - - -# Wav2Vec2Phoneme - -## Overview - -The Wav2Vec2Phoneme model was proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., -2021](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. - -The abstract from the paper is the following: - -*Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech -recognition systems without any labeled data. However, in many cases there is labeled data available for related -languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer -learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by -mapping phonemes of the training languages to the target language using articulatory features. Experiments show that -this simple method significantly outperforms prior work which introduced task-specific architectures and used only part -of a monolingually pretrained model.* - -Tips: - -- Wav2Vec2Phoneme uses the exact same architecture as Wav2Vec2 -- Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- Wav2Vec2Phoneme model was trained using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2PhonemeCTCTokenizer`]. -- Wav2Vec2Phoneme can be fine-tuned on multiple language at once and decode unseen languages in a single forward pass - to a sequence of phonemes -- By default the model outputs a sequence of phonemes. In order to transform the phonemes to a sequence of words one - should make use of a dictionary and language model. - -Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten) - -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). - -Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, so one can refer to [`Wav2Vec2`]'s documentation page except for the tokenizer. 
- - -## Wav2Vec2PhonemeCTCTokenizer - -[[autodoc]] Wav2Vec2PhonemeCTCTokenizer - - __call__ - - batch_decode - - decode - - phonemize diff --git a/docs/source/en/model_doc/wavlm.md b/docs/source/en/model_doc/wavlm.md new file mode 100644 index 000000000000..2754304d8264 --- /dev/null +++ b/docs/source/en/model_doc/wavlm.md @@ -0,0 +1,83 @@ + + +# WavLM + +## Overview + +The WavLM model was proposed in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, +Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, +Michael Zeng, Furu Wei. + +The abstract from the paper is the following: + +*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been +attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker +identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is +challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. +WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity +preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on +recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where +additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up +the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB +benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* + +Tips: + +- WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use + [`Wav2Vec2Processor`] for the feature extraction. +- WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [`Wav2Vec2CTCTokenizer`]. +- WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks. + +Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. + +This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be +found [here](https://github.com/microsoft/unilm/tree/master/wavlm). 
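+
+As a minimal sketch (assuming the `microsoft/wavlm-base-plus` checkpoint), the Wav2Vec2 feature extractor prepares the
+raw waveform and [`WavLMModel`] returns frame-level speech representations that the task-specific heads build on:
+
+```python
+import torch
+from datasets import load_dataset
+from transformers import AutoFeatureExtractor, WavLMModel
+
+feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base-plus")
+model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus")
+
+# load a 16 kHz example utterance
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio = dataset[0]["audio"]["array"]
+
+inputs = feature_extractor(audio, sampling_rate=16_000, return_tensors="pt")
+with torch.no_grad():
+    hidden_states = model(**inputs).last_hidden_state  # shape: (batch, frames, hidden_size)
+```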
+ +## Documentation resources + +- [Audio classification task guide](../tasks/audio_classification) +- [Automatic speech recognition task guide](../tasks/asr) + +## WavLMConfig + +[[autodoc]] WavLMConfig + +## WavLMModel + +[[autodoc]] WavLMModel + - forward + +## WavLMForCTC + +[[autodoc]] WavLMForCTC + - forward + +## WavLMForSequenceClassification + +[[autodoc]] WavLMForSequenceClassification + - forward + +## WavLMForAudioFrameClassification + +[[autodoc]] WavLMForAudioFrameClassification + - forward + +## WavLMForXVector + +[[autodoc]] WavLMForXVector + - forward diff --git a/docs/source/en/model_doc/wavlm.mdx b/docs/source/en/model_doc/wavlm.mdx deleted file mode 100644 index 8e2138a61187..000000000000 --- a/docs/source/en/model_doc/wavlm.mdx +++ /dev/null @@ -1,75 +0,0 @@ - - -# WavLM - -## Overview - -The WavLM model was proposed in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, -Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, -Michael Zeng, Furu Wei. - -The abstract from the paper is the following: - -*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been -attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker -identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is -challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. -WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity -preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on -recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where -additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up -the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB -benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* - -Tips: - -- WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please use - [`Wav2Vec2Processor`] for the feature extraction. -- WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded - using [`Wav2Vec2CTCTokenizer`]. -- WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks. - -Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. - -This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The Authors' code can be -found [here](https://github.com/microsoft/unilm/tree/master/wavlm). 
- - -## WavLMConfig - -[[autodoc]] WavLMConfig - -## WavLMModel - -[[autodoc]] WavLMModel - - forward - -## WavLMForCTC - -[[autodoc]] WavLMForCTC - - forward - -## WavLMForSequenceClassification - -[[autodoc]] WavLMForSequenceClassification - - forward - -## WavLMForAudioFrameClassification - -[[autodoc]] WavLMForAudioFrameClassification - - forward - -## WavLMForXVector - -[[autodoc]] WavLMForXVector - - forward diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md new file mode 100644 index 000000000000..fbf806cd41df --- /dev/null +++ b/docs/source/en/model_doc/whisper.md @@ -0,0 +1,117 @@ + + +# Whisper + +## Overview + +The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. + +The abstract from the paper is the following: + +*We study the capabilities of speech processing systems trained simply to predict large amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual and multitask supervision, the resulting models generalize well to standard benchmarks and are often competitive with prior fully supervised results but in a zeroshot transfer setting without the need for any finetuning. When compared to humans, the models approach their accuracy and robustness. We are releasing models and inference code to serve as a foundation for further work on robust speech processing.* + + +Tips: + +- The model usually performs well without requiring any finetuning. +- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`~generation.GenerationMixin.generate`] function for inference. +- Inference is currently only implemented for short-form i.e. audio is pre-segmented into <=30s segments. Long-form (including timestamps) will be implemented in a future release. +- One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text. + +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). +The original code can be found [here](https://github.com/openai/whisper). 
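+
+A minimal transcription sketch, assuming the small `openai/whisper-tiny.en` checkpoint: [`WhisperProcessor`] turns the
+raw waveform into log-Mel input features and decodes the generated token ids back into text:
+
+```python
+from datasets import load_dataset
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+
+# load a 16 kHz example utterance (audio must be pre-segmented into <=30s chunks)
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+sample = dataset[0]["audio"]
+
+inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
+
+# encoder-decoder inference goes through `generate`
+predicted_ids = model.generate(inputs.input_features)
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+```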
+ + +## WhisperConfig + +[[autodoc]] WhisperConfig + +## WhisperTokenizer + +[[autodoc]] WhisperTokenizer + - set_prefix_tokens + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## WhisperTokenizerFast + +[[autodoc]] WhisperTokenizerFast + - set_prefix_tokens + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## WhisperFeatureExtractor + +[[autodoc]] WhisperFeatureExtractor + - __call__ + +## WhisperProcessor + +[[autodoc]] WhisperProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## WhisperModel + +[[autodoc]] WhisperModel + - forward + - _mask_input_features + +## WhisperForConditionalGeneration + +[[autodoc]] WhisperForConditionalGeneration + - forward + +## WhisperForAudioClassification + +[[autodoc]] WhisperForAudioClassification + - forward + + +## TFWhisperModel + +[[autodoc]] TFWhisperModel + - call + +## TFWhisperForConditionalGeneration + +[[autodoc]] TFWhisperForConditionalGeneration + - call + + +## FlaxWhisperModel + +[[autodoc]] FlaxWhisperModel + - __call__ + +## FlaxWhisperForConditionalGeneration + +[[autodoc]] FlaxWhisperForConditionalGeneration + - __call__ + +## FlaxWhisperForAudioClassification + +[[autodoc]] FlaxWhisperForAudioClassification + - __call__ + diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx deleted file mode 100644 index 4b7a60286184..000000000000 --- a/docs/source/en/model_doc/whisper.mdx +++ /dev/null @@ -1,81 +0,0 @@ - - -# Whisper - -## Overview - -The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. - -The abstract from the paper is the following: - -*We study the capabilities of speech processing systems trained simply to predict large amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual and multitask supervision, the resulting models generalize well to standard benchmarks and are often competitive with prior fully supervised results but in a zeroshot transfer setting without the need for any finetuning. When compared to humans, the models approach their accuracy and robustness. We are releasing models and inference code to serve as a foundation for further work on robust speech processing.* - - -Tips: - -- The model usually performs well without requiring any finetuning. -- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`~generation.GenerationMixin.generate`] function for inference. -- Inference is currently only implemented for short-form i.e. audio is pre-segmented into <=30s segments. Long-form (including timestamps) will be implemented in a future release. -- One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text. - -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). -The original code can be found [here](https://github.com/openai/whisper). 
- - -## WhisperConfig - -[[autodoc]] WhisperConfig - -## WhisperTokenizer - -[[autodoc]] WhisperTokenizer - - set_prefix_tokens - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## WhisperFeatureExtractor - -[[autodoc]] WhisperFeatureExtractor - - __call__ - -## WhisperProcessor - -[[autodoc]] WhisperProcessor - - __call__ - - from_pretrained - - save_pretrained - - batch_decode - - decode - -## WhisperModel - -[[autodoc]] WhisperModel - - forward - -## WhisperForConditionalGeneration - -[[autodoc]] WhisperForConditionalGeneration - - forward - - -## TFWhisperModel - -[[autodoc]] TFWhisperModel - - call - -## TFWhisperForConditionalGeneration - -[[autodoc]] TFWhisperForConditionalGeneration - - call diff --git a/docs/source/en/model_doc/xclip.md b/docs/source/en/model_doc/xclip.md new file mode 100644 index 000000000000..45c4c3db749b --- /dev/null +++ b/docs/source/en/model_doc/xclip.md @@ -0,0 +1,80 @@ + + +# X-CLIP + +## Overview + +The X-CLIP model was proposed in [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +X-CLIP is a minimal extension of [CLIP](clip) for video. The model consists of a text encoder, a cross-frame vision encoder, a multi-frame integration Transformer, and a video-specific prompt generator. + +The abstract from the paper is the following: + +*Contrastive language-image pretraining has shown great success in learning visual-textual joint representation from web-scale data, demonstrating remarkable "zero-shot" generalization ability for various image tasks. However, how to effectively expand such new language-image pretraining methods to video domains is still an open problem. In this work, we present a simple yet effective approach that adapts the pretrained language-image models to video recognition directly, instead of pretraining a new model from scratch. More concretely, to capture the long-range dependencies of frames along the temporal dimension, we propose a cross-frame attention mechanism that explicitly exchanges information across frames. Such module is lightweight and can be plugged into pretrained language-image models seamlessly. Moreover, we propose a video-specific prompting scheme, which leverages video content information for generating discriminative textual prompts. Extensive experiments demonstrate that our approach is effective and can be generalized to different video recognition scenarios. In particular, under fully-supervised settings, our approach achieves a top-1 accuracy of 87.1% on Kinectics-400, while using 12 times fewer FLOPs compared with Swin-L and ViViT-H. In zero-shot experiments, our approach surpasses the current state-of-the-art methods by +7.6% and +14.9% in terms of top-1 accuracy under two popular protocols. In few-shot scenarios, our approach outperforms previous best methods by +32.1% and +23.1% when the labeled data is extremely limited.* + +Tips: + +- Usage of X-CLIP is identical to [CLIP](clip). + + + + X-CLIP architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with X-CLIP. 
+ +- Demo notebooks for X-CLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/X-CLIP). + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## XCLIPProcessor + +[[autodoc]] XCLIPProcessor + +## XCLIPConfig + +[[autodoc]] XCLIPConfig + - from_text_vision_configs + +## XCLIPTextConfig + +[[autodoc]] XCLIPTextConfig + +## XCLIPVisionConfig + +[[autodoc]] XCLIPVisionConfig + +## XCLIPModel + +[[autodoc]] XCLIPModel + - forward + - get_text_features + - get_video_features + +## XCLIPTextModel + +[[autodoc]] XCLIPTextModel + - forward + +## XCLIPVisionModel + +[[autodoc]] XCLIPVisionModel + - forward diff --git a/docs/source/en/model_doc/xclip.mdx b/docs/source/en/model_doc/xclip.mdx deleted file mode 100644 index 96832f46e5b8..000000000000 --- a/docs/source/en/model_doc/xclip.mdx +++ /dev/null @@ -1,70 +0,0 @@ - - -# X-CLIP - -## Overview - -The X-CLIP model was proposed in [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. -X-CLIP is a minimal extension of [CLIP](clip) for video. The model consists of a text encoder, a cross-frame vision encoder, a multi-frame integration Transformer, and a video-specific prompt generator. - -The abstract from the paper is the following: - -*Contrastive language-image pretraining has shown great success in learning visual-textual joint representation from web-scale data, demonstrating remarkable "zero-shot" generalization ability for various image tasks. However, how to effectively expand such new language-image pretraining methods to video domains is still an open problem. In this work, we present a simple yet effective approach that adapts the pretrained language-image models to video recognition directly, instead of pretraining a new model from scratch. More concretely, to capture the long-range dependencies of frames along the temporal dimension, we propose a cross-frame attention mechanism that explicitly exchanges information across frames. Such module is lightweight and can be plugged into pretrained language-image models seamlessly. Moreover, we propose a video-specific prompting scheme, which leverages video content information for generating discriminative textual prompts. Extensive experiments demonstrate that our approach is effective and can be generalized to different video recognition scenarios. In particular, under fully-supervised settings, our approach achieves a top-1 accuracy of 87.1% on Kinectics-400, while using 12 times fewer FLOPs compared with Swin-L and ViViT-H. In zero-shot experiments, our approach surpasses the current state-of-the-art methods by +7.6% and +14.9% in terms of top-1 accuracy under two popular protocols. In few-shot scenarios, our approach outperforms previous best methods by +32.1% and +23.1% when the labeled data is extremely limited.* - -Tips: - -- Usage of X-CLIP is identical to [CLIP](clip). -- Demo notebooks for X-CLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/X-CLIP). - - - - X-CLIP architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). 
-The original code can be found [here](https://github.com/microsoft/VideoX/tree/master/X-CLIP). - - -## XCLIPProcessor - -[[autodoc]] XCLIPProcessor - -## XCLIPConfig - -[[autodoc]] XCLIPConfig - - from_text_vision_configs - -## XCLIPTextConfig - -[[autodoc]] XCLIPTextConfig - -## XCLIPVisionConfig - -[[autodoc]] XCLIPVisionConfig - -## XCLIPModel - -[[autodoc]] XCLIPModel - - forward - - get_text_features - - get_video_features - -## XCLIPTextModel - -[[autodoc]] XCLIPTextModel - - forward - -## XCLIPVisionModel - -[[autodoc]] XCLIPVisionModel - - forward diff --git a/docs/source/en/model_doc/xglm.md b/docs/source/en/model_doc/xglm.md new file mode 100644 index 000000000000..1b184c17e803 --- /dev/null +++ b/docs/source/en/model_doc/xglm.md @@ -0,0 +1,93 @@ + + +# XGLM + +## Overview + +The XGLM model was proposed in [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) +by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, +Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, +Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. + +The abstract from the paper is the following: + +*Large-scale autoregressive language models such as GPT-3 are few-shot learners that can perform a wide range of language +tasks without fine-tuning. While these models are known to be able to jointly represent many different languages, +their training data is dominated by English, potentially limiting their cross-lingual generalization. +In this work, we train multilingual autoregressive language models on a balanced corpus covering a diverse set of languages, +and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters +sets new state of the art in few-shot learning in more than 20 representative languages, outperforming GPT-3 of comparable size +in multilingual commonsense reasoning (with +7.4% absolute accuracy improvement in 0-shot settings and +9.4% in 4-shot settings) +and natural language inference (+5.4% in each of 0-shot and 4-shot settings). On the FLORES-101 machine translation benchmark, +our model outperforms GPT-3 on 171 out of 182 translation directions with 32 training examples, while surpassing the +official supervised baseline in 45 directions. We present a detailed analysis of where the model succeeds and fails, +showing in particular that it enables cross-lingual in-context learning on some tasks, while there is still room for improvement +on surface form robustness and adaptation to tasks that do not have a natural cloze form. Finally, we evaluate our models +in social value tasks such as hate speech detection in five languages and find it has limitations similar to comparable sized GPT-3 models.* + + +This model was contributed by [Suraj](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/xglm). 
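+
+A minimal generation sketch, assuming the `facebook/xglm-564M` checkpoint; XGLM is a decoder-only model, so inference
+goes through `generate`:
+
+```python
+from transformers import AutoTokenizer, XGLMForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
+model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+
+# greedy continuation of a short prompt
+inputs = tokenizer("The weather in Paris today is", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```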
+ +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) + +## XGLMConfig + +[[autodoc]] XGLMConfig + +## XGLMTokenizer + +[[autodoc]] XGLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XGLMTokenizerFast + +[[autodoc]] XGLMTokenizerFast + +## XGLMModel + +[[autodoc]] XGLMModel + - forward + +## XGLMForCausalLM + +[[autodoc]] XGLMForCausalLM + - forward + +## TFXGLMModel + +[[autodoc]] TFXGLMModel + - call + +## TFXGLMForCausalLM + +[[autodoc]] TFXGLMForCausalLM + - call + +## FlaxXGLMModel + +[[autodoc]] FlaxXGLMModel + - __call__ + +## FlaxXGLMForCausalLM + +[[autodoc]] FlaxXGLMForCausalLM + - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/xglm.mdx b/docs/source/en/model_doc/xglm.mdx deleted file mode 100644 index e35bab25f89c..000000000000 --- a/docs/source/en/model_doc/xglm.mdx +++ /dev/null @@ -1,85 +0,0 @@ - - -# XGLM - -## Overview - -The XGLM model was proposed in [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) -by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, -Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, -Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. - -The abstract from the paper is the following: - -*Large-scale autoregressive language models such as GPT-3 are few-shot learners that can perform a wide range of language -tasks without fine-tuning. While these models are known to be able to jointly represent many different languages, -their training data is dominated by English, potentially limiting their cross-lingual generalization. -In this work, we train multilingual autoregressive language models on a balanced corpus covering a diverse set of languages, -and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters -sets new state of the art in few-shot learning in more than 20 representative languages, outperforming GPT-3 of comparable size -in multilingual commonsense reasoning (with +7.4% absolute accuracy improvement in 0-shot settings and +9.4% in 4-shot settings) -and natural language inference (+5.4% in each of 0-shot and 4-shot settings). On the FLORES-101 machine translation benchmark, -our model outperforms GPT-3 on 171 out of 182 translation directions with 32 training examples, while surpassing the -official supervised baseline in 45 directions. We present a detailed analysis of where the model succeeds and fails, -showing in particular that it enables cross-lingual in-context learning on some tasks, while there is still room for improvement -on surface form robustness and adaptation to tasks that do not have a natural cloze form. Finally, we evaluate our models -in social value tasks such as hate speech detection in five languages and find it has limitations similar to comparable sized GPT-3 models.* - - -This model was contributed by [Suraj](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/xglm). 
- -## XGLMConfig - -[[autodoc]] XGLMConfig - -## XGLMTokenizer - -[[autodoc]] XGLMTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XGLMTokenizerFast - -[[autodoc]] XGLMTokenizerFast - -## XGLMModel - -[[autodoc]] XGLMModel - - forward - -## XGLMForCausalLM - -[[autodoc]] XGLMForCausalLM - - forward - -## TFXGLMModel - -[[autodoc]] TFXGLMModel - - call - -## TFXGLMForCausalLM - -[[autodoc]] TFXGLMForCausalLM - - call - -## FlaxXGLMModel - -[[autodoc]] FlaxXGLMModel - - __call__ - -## FlaxXGLMForCausalLM - -[[autodoc]] FlaxXGLMForCausalLM - - __call__ \ No newline at end of file diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md new file mode 100644 index 000000000000..5e7ba5b7e3f5 --- /dev/null +++ b/docs/source/en/model_doc/xlm-prophetnet.md @@ -0,0 +1,91 @@ + + +# XLM-ProphetNet + +
+ +**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign +@patrickvonplaten + + +## Overview + +The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei +Zhang, Ming Zhou on 13 Jan, 2020. + +XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of +just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual +"wiki100" Wikipedia dump. + +The abstract from the paper is the following: + +*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel +self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of +the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by +n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time +step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent +overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale +dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for +abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new +state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* + +The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). + +Tips: + +- XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE. + +## Documentation resources + +- [Causal language modeling task guide](../tasks/language_modeling) +- [Translation task guide](../tasks/translation) +- [Summarization task guide](../tasks/summarization) + +## XLMProphetNetConfig + +[[autodoc]] XLMProphetNetConfig + +## XLMProphetNetTokenizer + +[[autodoc]] XLMProphetNetTokenizer + +## XLMProphetNetModel + +[[autodoc]] XLMProphetNetModel + +## XLMProphetNetEncoder + +[[autodoc]] XLMProphetNetEncoder + +## XLMProphetNetDecoder + +[[autodoc]] XLMProphetNetDecoder + +## XLMProphetNetForConditionalGeneration + +[[autodoc]] XLMProphetNetForConditionalGeneration + +## XLMProphetNetForCausalLM + +[[autodoc]] XLMProphetNetForCausalLM diff --git a/docs/source/en/model_doc/xlm-prophetnet.mdx b/docs/source/en/model_doc/xlm-prophetnet.mdx deleted file mode 100644 index af4a3bb6e87e..000000000000 --- a/docs/source/en/model_doc/xlm-prophetnet.mdx +++ /dev/null @@ -1,68 +0,0 @@ - - -# XLM-ProphetNet - -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign -@patrickvonplaten - - -## Overview - -The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei -Zhang, Ming Zhou on 13 Jan, 2020. 
- -XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of -just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual -"wiki100" Wikipedia dump. - -The abstract from the paper is the following: - -*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel -self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of -the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by -n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time -step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent -overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale -dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for -abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new -state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.* - -The Authors' code can be found [here](https://github.com/microsoft/ProphetNet). - -## XLMProphetNetConfig - -[[autodoc]] XLMProphetNetConfig - -## XLMProphetNetTokenizer - -[[autodoc]] XLMProphetNetTokenizer - -## XLMProphetNetModel - -[[autodoc]] XLMProphetNetModel - -## XLMProphetNetEncoder - -[[autodoc]] XLMProphetNetEncoder - -## XLMProphetNetDecoder - -[[autodoc]] XLMProphetNetDecoder - -## XLMProphetNetForConditionalGeneration - -[[autodoc]] XLMProphetNetForConditionalGeneration - -## XLMProphetNetForCausalLM - -[[autodoc]] XLMProphetNetForCausalLM diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md new file mode 100644 index 000000000000..b65929460706 --- /dev/null +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -0,0 +1,81 @@ + + +# XLM-RoBERTa-XL + +## Overview + +The XLM-RoBERTa-XL model was proposed in [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. + +The abstract from the paper is the following: + +*Recent work has demonstrated the effectiveness of cross-lingual language model pretraining for cross-lingual understanding. In this study, we present the results of two larger multilingual masked language models, with 3.5B and 10.7B parameters. Our two new models dubbed XLM-R XL and XLM-R XXL outperform XLM-R by 1.8% and 2.4% average accuracy on XNLI. Our model also outperforms the RoBERTa-Large model on several English tasks of the GLUE benchmark by 0.3% on average while handling 99 more languages. This suggests pretrained models with larger capacity may obtain both strong performance on high-resource languages while greatly improving low-resource languages. We make our code and models publicly available.* + +Tips: + +- XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does + not require `lang` tensors to understand which language is used, and should be able to determine the correct + language from the input ids. 
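+The tip above can be checked with a minimal masked language modeling sketch. This is only an illustration and assumes the `facebook/xlm-roberta-xl` checkpoint (about 3.5B parameters, so it needs a correspondingly large amount of memory):

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Assumption: the facebook/xlm-roberta-xl checkpoint; other XLM-RoBERTa-XL checkpoints work the same way.
tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = AutoModelForMaskedLM.from_pretrained("facebook/xlm-roberta-xl")

# No `lang` tensor is passed; the language is inferred from the input ids alone.
inputs = tokenizer("Paris is the <mask> of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring token for the masked position.
mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```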
+ +This model was contributed by [Soonhwan-Kwon](https://github.com/Soonhwan-Kwon) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XLMRobertaXLConfig + +[[autodoc]] XLMRobertaXLConfig + +## XLMRobertaXLModel + +[[autodoc]] XLMRobertaXLModel + - forward + +## XLMRobertaXLForCausalLM + +[[autodoc]] XLMRobertaXLForCausalLM + - forward + +## XLMRobertaXLForMaskedLM + +[[autodoc]] XLMRobertaXLForMaskedLM + - forward + +## XLMRobertaXLForSequenceClassification + +[[autodoc]] XLMRobertaXLForSequenceClassification + - forward + +## XLMRobertaXLForMultipleChoice + +[[autodoc]] XLMRobertaXLForMultipleChoice + - forward + +## XLMRobertaXLForTokenClassification + +[[autodoc]] XLMRobertaXLForTokenClassification + - forward + +## XLMRobertaXLForQuestionAnswering + +[[autodoc]] XLMRobertaXLForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/xlm-roberta-xl.mdx b/docs/source/en/model_doc/xlm-roberta-xl.mdx deleted file mode 100644 index 01829a128c00..000000000000 --- a/docs/source/en/model_doc/xlm-roberta-xl.mdx +++ /dev/null @@ -1,69 +0,0 @@ - - -# XLM-RoBERTa-XL - -## Overview - -The XLM-RoBERTa-XL model was proposed in [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. - -The abstract from the paper is the following: - -*Recent work has demonstrated the effectiveness of cross-lingual language model pretraining for cross-lingual understanding. In this study, we present the results of two larger multilingual masked language models, with 3.5B and 10.7B parameters. Our two new models dubbed XLM-R XL and XLM-R XXL outperform XLM-R by 1.8% and 2.4% average accuracy on XNLI. Our model also outperforms the RoBERTa-Large model on several English tasks of the GLUE benchmark by 0.3% on average while handling 99 more languages. This suggests pretrained models with larger capacity may obtain both strong performance on high-resource languages while greatly improving low-resource languages. We make our code and models publicly available.* - -Tips: - -- XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does - not require `lang` tensors to understand which language is used, and should be able to determine the correct - language from the input ids. - -This model was contributed by [Soonhwan-Kwon](https://github.com/Soonhwan-Kwon) and [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). 
- - -## XLMRobertaXLConfig - -[[autodoc]] XLMRobertaXLConfig - -## XLMRobertaXLModel - -[[autodoc]] XLMRobertaXLModel - - forward - -## XLMRobertaXLForCausalLM - -[[autodoc]] XLMRobertaXLForCausalLM - - forward - -## XLMRobertaXLForMaskedLM - -[[autodoc]] XLMRobertaXLForMaskedLM - - forward - -## XLMRobertaXLForSequenceClassification - -[[autodoc]] XLMRobertaXLForSequenceClassification - - forward - -## XLMRobertaXLForMultipleChoice - -[[autodoc]] XLMRobertaXLForMultipleChoice - - forward - -## XLMRobertaXLForTokenClassification - -[[autodoc]] XLMRobertaXLForTokenClassification - - forward - -## XLMRobertaXLForQuestionAnswering - -[[autodoc]] XLMRobertaXLForQuestionAnswering - - forward diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md new file mode 100644 index 000000000000..935003156fd1 --- /dev/null +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -0,0 +1,232 @@ + + +# XLM-RoBERTa + +
+
+## Overview
+
+The XLM-RoBERTa model was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume
+Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's
+RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl
+data.
+
+The abstract from the paper is the following:
+
+*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a
+wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
+languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
+outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on
+XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
+low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We
+also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
+trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
+languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
+per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
+will make XLM-R code, data, and models publicly available.*
+
+Tips:
+
+- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
+  not require `lang` tensors to understand which language is used, and should be able to determine the correct
+  language from the input ids (a short usage sketch is shown after the resources below).
+- Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses masked language modeling on sentences coming from one language.
+- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples
+  as well as the information relative to the inputs and outputs.
+
+This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with XLM-RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A blog post on how to [finetune XLM RoBERTa for multiclass classification with Habana Gaudi on AWS](https://www.philschmid.de/habana-distributed-training)
+- [`XLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
+- [`TFXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). +- [`FlaxXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). +- [Text classification](https://huggingface.co/docs/transformers/tasks/sequence_classification) chapter of the 🤗 Hugging Face Task Guides. +- [Text classification task guide](../tasks/sequence_classification) + + + +- [`XLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). +- [`TFXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). +- [`FlaxXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). +- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Token classification task guide](../tasks/token_classification) + + + +- [`XLMRobertaForCausalLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). +- [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) chapter of the 🤗 Hugging Face Task Guides. +- [Causal language modeling task guide](../tasks/language_modeling) + + + +- [`XLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). +- [`TFXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). +- [`FlaxXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). +- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. 
+- [Masked language modeling](../tasks/masked_language_modeling) + + + +- [`XLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). +- [`TFXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). +- [`FlaxXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). +- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. +- [Question answering task guide](../tasks/question_answering) + +**Multiple choice** + +- [`XLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). +- [`TFXLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). +- [Multiple choice task guide](../tasks/multiple_choice) + +🚀 Deploy + +- A blog post on how to [Deploy Serverless XLM RoBERTa on AWS Lambda](https://www.philschmid.de/multilingual-serverless-xlm-roberta-with-huggingface). 
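+Before the API reference, here is a minimal inference sketch. It is only an illustration and assumes the `xlm-roberta-base` checkpoint; because the model determines the language from the input ids, the same pipeline handles inputs in any of the covered languages:

```python
from transformers import pipeline

# Assumption: the base-sized checkpoint; larger XLM-RoBERTa checkpoints are used identically.
unmasker = pipeline("fill-mask", model="xlm-roberta-base")

# French and German inputs, with no language id or extra configuration.
print(unmasker("Bonjour, je suis un modèle <mask>.")[0]["token_str"])
print(unmasker("Hallo, ich bin ein <mask> Modell.")[0]["token_str"])
```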
+ +## XLMRobertaConfig + +[[autodoc]] XLMRobertaConfig + +## XLMRobertaTokenizer + +[[autodoc]] XLMRobertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XLMRobertaTokenizerFast + +[[autodoc]] XLMRobertaTokenizerFast + +## XLMRobertaModel + +[[autodoc]] XLMRobertaModel + - forward + +## XLMRobertaForCausalLM + +[[autodoc]] XLMRobertaForCausalLM + - forward + +## XLMRobertaForMaskedLM + +[[autodoc]] XLMRobertaForMaskedLM + - forward + +## XLMRobertaForSequenceClassification + +[[autodoc]] XLMRobertaForSequenceClassification + - forward + +## XLMRobertaForMultipleChoice + +[[autodoc]] XLMRobertaForMultipleChoice + - forward + +## XLMRobertaForTokenClassification + +[[autodoc]] XLMRobertaForTokenClassification + - forward + +## XLMRobertaForQuestionAnswering + +[[autodoc]] XLMRobertaForQuestionAnswering + - forward + +## TFXLMRobertaModel + +[[autodoc]] TFXLMRobertaModel + - call + +## TFXLMRobertaForCausalLM + +[[autodoc]] TFXLMRobertaForCausalLM + - call + +## TFXLMRobertaForMaskedLM + +[[autodoc]] TFXLMRobertaForMaskedLM + - call + +## TFXLMRobertaForSequenceClassification + +[[autodoc]] TFXLMRobertaForSequenceClassification + - call + +## TFXLMRobertaForMultipleChoice + +[[autodoc]] TFXLMRobertaForMultipleChoice + - call + +## TFXLMRobertaForTokenClassification + +[[autodoc]] TFXLMRobertaForTokenClassification + - call + +## TFXLMRobertaForQuestionAnswering + +[[autodoc]] TFXLMRobertaForQuestionAnswering + - call + +## FlaxXLMRobertaModel + +[[autodoc]] FlaxXLMRobertaModel + - __call__ + +## FlaxXLMRobertaForCausalLM + +[[autodoc]] FlaxXLMRobertaForCausalLM + - __call__ + +## FlaxXLMRobertaForMaskedLM + +[[autodoc]] FlaxXLMRobertaForMaskedLM + - __call__ + +## FlaxXLMRobertaForSequenceClassification + +[[autodoc]] FlaxXLMRobertaForSequenceClassification + - __call__ + +## FlaxXLMRobertaForMultipleChoice + +[[autodoc]] FlaxXLMRobertaForMultipleChoice + - __call__ + +## FlaxXLMRobertaForTokenClassification + +[[autodoc]] FlaxXLMRobertaForTokenClassification + - __call__ + +## FlaxXLMRobertaForQuestionAnswering + +[[autodoc]] FlaxXLMRobertaForQuestionAnswering + - __call__ diff --git a/docs/source/en/model_doc/xlm-roberta.mdx b/docs/source/en/model_doc/xlm-roberta.mdx deleted file mode 100644 index 941feac9d35a..000000000000 --- a/docs/source/en/model_doc/xlm-roberta.mdx +++ /dev/null @@ -1,202 +0,0 @@ - - -# XLM-RoBERTa - -## Overview - -The XLM-RoBERTa model was proposed in [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume -Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's -RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl -data. - -The abstract from the paper is the following: - -*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a -wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred -languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly -outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on -XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. 
XLM-R performs particularly well on -low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We -also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the -trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource -languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing -per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We -will make XLM-R code, data, and models publicly available.* - -Tips: - -- XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does - not require `lang` tensors to understand which language is used, and should be able to determine the correct - language from the input ids. -- This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples - as well as the information relative to the inputs and outputs. - -This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). - -## Resources - -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with XLM-RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. - - - -- A blog post on how to [finetune XLM RoBERTa for multiclass classification with Habana Gaudi on AWS](https://www.philschmid.de/habana-distributed-training) -- [`XLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb). -- [`TFXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). -- [`FlaxXLMRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb). -- [Text classification](https://huggingface.co/docs/transformers/tasks/sequence_classification) chapter of the 🤗 Hugging Face Task Guides. - - - -- [`XLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb). -- [`TFXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). 
-- [`FlaxXLMRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification). -- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- [`XLMRobertaForCausalLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). -- [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) chapter of the 🤗 Hugging Face Task Guides. - - - -- [`XLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb). -- [`TFXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). -- [`FlaxXLMRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb). -- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course. - - - -- [`XLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb). -- [`TFXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). -- [`FlaxXLMRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering). -- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course. - -**Multiple choice** - -- [`XLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb). -- [`TFXLMRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). - -🚀 Deploy - -- A blog post on how to [Deploy Serveless XLM RoBERTa on AWS Lambda](https://www.philschmid.de/multilingual-serverless-xlm-roberta-with-huggingface). 
- -## XLMRobertaConfig - -[[autodoc]] XLMRobertaConfig - -## XLMRobertaTokenizer - -[[autodoc]] XLMRobertaTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XLMRobertaTokenizerFast - -[[autodoc]] XLMRobertaTokenizerFast - -## XLMRobertaModel - -[[autodoc]] XLMRobertaModel - - forward - -## XLMRobertaForCausalLM - -[[autodoc]] XLMRobertaForCausalLM - - forward - -## XLMRobertaForMaskedLM - -[[autodoc]] XLMRobertaForMaskedLM - - forward - -## XLMRobertaForSequenceClassification - -[[autodoc]] XLMRobertaForSequenceClassification - - forward - -## XLMRobertaForMultipleChoice - -[[autodoc]] XLMRobertaForMultipleChoice - - forward - -## XLMRobertaForTokenClassification - -[[autodoc]] XLMRobertaForTokenClassification - - forward - -## XLMRobertaForQuestionAnswering - -[[autodoc]] XLMRobertaForQuestionAnswering - - forward - -## TFXLMRobertaModel - -[[autodoc]] TFXLMRobertaModel - - call - -## TFXLMRobertaForMaskedLM - -[[autodoc]] TFXLMRobertaForMaskedLM - - call - -## TFXLMRobertaForSequenceClassification - -[[autodoc]] TFXLMRobertaForSequenceClassification - - call - -## TFXLMRobertaForMultipleChoice - -[[autodoc]] TFXLMRobertaForMultipleChoice - - call - -## TFXLMRobertaForTokenClassification - -[[autodoc]] TFXLMRobertaForTokenClassification - - call - -## TFXLMRobertaForQuestionAnswering - -[[autodoc]] TFXLMRobertaForQuestionAnswering - - call - -## FlaxXLMRobertaModel - -[[autodoc]] FlaxXLMRobertaModel - - __call__ - -## FlaxXLMRobertaForMaskedLM - -[[autodoc]] FlaxXLMRobertaForMaskedLM - - __call__ - -## FlaxXLMRobertaForSequenceClassification - -[[autodoc]] FlaxXLMRobertaForSequenceClassification - - __call__ - -## FlaxXLMRobertaForMultipleChoice - -[[autodoc]] FlaxXLMRobertaForMultipleChoice - - __call__ - -## FlaxXLMRobertaForTokenClassification - -[[autodoc]] FlaxXLMRobertaForTokenClassification - - __call__ - -## FlaxXLMRobertaForQuestionAnswering - -[[autodoc]] FlaxXLMRobertaForQuestionAnswering - - __call__ diff --git a/docs/source/en/model_doc/xlm-v.md b/docs/source/en/model_doc/xlm-v.md new file mode 100644 index 000000000000..38bed0dc46b5 --- /dev/null +++ b/docs/source/en/model_doc/xlm-v.md @@ -0,0 +1,47 @@ + + +# XLM-V + +## Overview + +XLM-V is multilingual language model with a one million token vocabulary trained on 2.5TB of data from Common Crawl (same as XLM-R). +It was introduced in the [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) +paper by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer and Madian Khabsa. + +From the abstract of the XLM-V paper: + +*Large multilingual language models typically rely on a single vocabulary shared across 100+ languages. +As these models have increased in parameter count and depth, vocabulary size has remained largely unchanged. +This vocabulary bottleneck limits the representational capabilities of multilingual models like XLM-R. +In this paper, we introduce a new approach for scaling to very large multilingual vocabularies by +de-emphasizing token sharing between languages with little lexical overlap and assigning vocabulary capacity +to achieve sufficient coverage for each individual language. Tokenizations using our vocabulary are typically +more semantically meaningful and shorter compared to XLM-R. Leveraging this improved vocabulary, we train XLM-V, +a multilingual language model with a one million token vocabulary. 
XLM-V outperforms XLM-R on every task we
+tested on ranging from natural language inference (XNLI), question answering (MLQA, XQuAD, TyDiQA), and
+named entity recognition (WikiAnn) to low-resource tasks (Americas NLI, MasakhaNER).*
+
+Tips:
+
+- XLM-V is compatible with the XLM-RoBERTa model architecture; only the model weights from the [`fairseq`](https://github.com/facebookresearch/fairseq)
+  library had to be converted.
+- The `XLMTokenizer` implementation is used to load the vocabulary and perform tokenization.
+
+An XLM-V (base size) model is available under the [`facebook/xlm-v-base`](https://huggingface.co/facebook/xlm-v-base) identifier.
+
+This model was contributed by [stefan-it](https://huggingface.co/stefan-it), including detailed experiments with XLM-V on downstream tasks.
+The experiments repository can be found [here](https://github.com/stefan-it/xlm-v-experiments).
diff --git a/docs/source/en/model_doc/xlm.md b/docs/source/en/model_doc/xlm.md
new file mode 100644
index 000000000000..8b5b31a2dbef
--- /dev/null
+++ b/docs/source/en/model_doc/xlm.md
@@ -0,0 +1,150 @@
+
+
+# XLM
+
+
+## Overview
+
+The XLM model was proposed in [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by
+Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the following objectives:
+
+- a causal language modeling (CLM) objective (next token prediction),
+- a masked language modeling (MLM) objective (BERT-like), or
+- a Translation Language Modeling (TLM) objective (an extension of BERT's MLM to multiple language inputs)
+
+The abstract from the paper is the following:
+
+*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
+In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We
+propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
+data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
+state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our
+approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we
+obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised
+machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the
+previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
+
+Tips:
+
+- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
+  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
+- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the [multi-lingual](../multilingual) page for more information, and see the short sketch below.
+- A transformer model trained on several languages. There are three different types of training for this model and the library provides checkpoints for all of them:
+
+  * Causal language modeling (CLM), which is the traditional autoregressive training (so this model could be in the previous section as well). One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens that may span several documents in one of those languages.
+  * Masked language modeling (MLM), which is like RoBERTa. One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens that may span several documents in one of those languages, with dynamic masking of the tokens.
+  * A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two different languages, with random masking. To predict one of the masked tokens, the model can use both the surrounding context in language 1 and the context given by language 2.
+
+This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/).
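+As a minimal sketch of the `lang` mechanism described above (assuming the English/French CLM checkpoint `xlm-clm-enfr-1024`), language ids are looked up in `tokenizer.lang2id` and passed to the model as a tensor with the same shape as the input ids:

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

# Assumption: the xlm-clm-enfr-1024 checkpoint (CLM objective, English/French).
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1

# tokenizer.lang2id maps language codes to the ids the model expects.
language_id = tokenizer.lang2id["en"]
langs = torch.tensor([language_id] * input_ids.shape[1]).view(1, -1)  # same shape as input_ids

outputs = model(input_ids, langs=langs)
```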
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XLMConfig + +[[autodoc]] XLMConfig + +## XLMTokenizer + +[[autodoc]] XLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XLM specific outputs + +[[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput + +## XLMModel + +[[autodoc]] XLMModel + - forward + +## XLMWithLMHeadModel + +[[autodoc]] XLMWithLMHeadModel + - forward + +## XLMForSequenceClassification + +[[autodoc]] XLMForSequenceClassification + - forward + +## XLMForMultipleChoice + +[[autodoc]] XLMForMultipleChoice + - forward + +## XLMForTokenClassification + +[[autodoc]] XLMForTokenClassification + - forward + +## XLMForQuestionAnsweringSimple + +[[autodoc]] XLMForQuestionAnsweringSimple + - forward + +## XLMForQuestionAnswering + +[[autodoc]] XLMForQuestionAnswering + - forward + +## TFXLMModel + +[[autodoc]] TFXLMModel + - call + +## TFXLMWithLMHeadModel + +[[autodoc]] TFXLMWithLMHeadModel + - call + +## TFXLMForSequenceClassification + +[[autodoc]] TFXLMForSequenceClassification + - call + +## TFXLMForMultipleChoice + +[[autodoc]] TFXLMForMultipleChoice + - call + +## TFXLMForTokenClassification + +[[autodoc]] TFXLMForTokenClassification + - call + +## TFXLMForQuestionAnsweringSimple + +[[autodoc]] TFXLMForQuestionAnsweringSimple + - call diff --git a/docs/source/en/model_doc/xlm.mdx b/docs/source/en/model_doc/xlm.mdx deleted file mode 100644 index a441c64c86c9..000000000000 --- a/docs/source/en/model_doc/xlm.mdx +++ /dev/null @@ -1,124 +0,0 @@ - - -# XLM - -## Overview - -The XLM model was proposed in [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by -Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the following objectives: - -- a causal language modeling (CLM) objective (next token prediction), -- a masked language modeling (MLM) objective (BERT-like), or -- a Translation Language Modeling (TLM) object (extension of BERT's MLM to multiple language inputs) - -The abstract from the paper is the following: - -*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding. -In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We -propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual -data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain -state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our -approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we -obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised -machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the -previous best approach by more than 4 BLEU. 
Our code and pretrained models will be made publicly available.* - -Tips: - -- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to - select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation). -- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the [multi-lingual](../multilingual) page for more information. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/facebookresearch/XLM/). - - -## XLMConfig - -[[autodoc]] XLMConfig - -## XLMTokenizer - -[[autodoc]] XLMTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XLM specific outputs - -[[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput - -## XLMModel - -[[autodoc]] XLMModel - - forward - -## XLMWithLMHeadModel - -[[autodoc]] XLMWithLMHeadModel - - forward - -## XLMForSequenceClassification - -[[autodoc]] XLMForSequenceClassification - - forward - -## XLMForMultipleChoice - -[[autodoc]] XLMForMultipleChoice - - forward - -## XLMForTokenClassification - -[[autodoc]] XLMForTokenClassification - - forward - -## XLMForQuestionAnsweringSimple - -[[autodoc]] XLMForQuestionAnsweringSimple - - forward - -## XLMForQuestionAnswering - -[[autodoc]] XLMForQuestionAnswering - - forward - -## TFXLMModel - -[[autodoc]] TFXLMModel - - call - -## TFXLMWithLMHeadModel - -[[autodoc]] TFXLMWithLMHeadModel - - call - -## TFXLMForSequenceClassification - -[[autodoc]] TFXLMForSequenceClassification - - call - -## TFXLMForMultipleChoice - -[[autodoc]] TFXLMForMultipleChoice - - call - -## TFXLMForTokenClassification - -[[autodoc]] TFXLMForTokenClassification - - call - -## TFXLMForQuestionAnsweringSimple - -[[autodoc]] TFXLMForQuestionAnsweringSimple - - call diff --git a/docs/source/en/model_doc/xlnet.md b/docs/source/en/model_doc/xlnet.md new file mode 100644 index 000000000000..3685728cd72e --- /dev/null +++ b/docs/source/en/model_doc/xlnet.md @@ -0,0 +1,176 @@ + + +# XLNet + +
+ +## Overview + +The XLNet model was proposed in [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, +Quoc V. Le. XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method to learn +bidirectional contexts by maximizing the expected likelihood over all permutations of the input sequence factorization +order. + +The abstract from the paper is the following: + +*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves +better performance than pretraining approaches based on autoregressive language modeling. However, relying on +corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a +pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive +pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all +permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive +formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into +pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large +margin, including question answering, natural language inference, sentiment analysis, and document ranking.* + +Tips: + +- The specific attention pattern can be controlled at training and test time using the `perm_mask` input. +- Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained + using only a sub-set of the output tokens as target which are selected with the `target_mapping` input. +- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and + `target_mapping` inputs to control the attention span and outputs (see examples in + *examples/pytorch/text-generation/run_generation.py*) +- XLNet is one of the few models that has no sequence length limit. +- XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,…,sequence length. +- XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies. + +This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/). 
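+As a minimal sketch of the `perm_mask` and `target_mapping` inputs mentioned in the tips (assuming the `xlnet-base-cased` checkpoint), the snippet below predicts only the last token of the sequence while hiding it from every position:

```python
import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer

# Assumption: the xlnet-base-cased checkpoint.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)

# perm_mask[b, i, j] = 1.0 means token i may not attend to token j in batch b.
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # hide the last token from every position

# target_mapping selects the positions to predict (here: only the last token).
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
target_mapping[0, 0, -1] = 1.0

outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs.logits  # shape (1, 1, vocab_size): scores for the masked position
```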
+ +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XLNetConfig + +[[autodoc]] XLNetConfig + +## XLNetTokenizer + +[[autodoc]] XLNetTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## XLNetTokenizerFast + +[[autodoc]] XLNetTokenizerFast + +## XLNet specific outputs + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetModelOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput + +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput + +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput + +## XLNetModel + +[[autodoc]] XLNetModel + - forward + +## XLNetLMHeadModel + +[[autodoc]] XLNetLMHeadModel + - forward + +## XLNetForSequenceClassification + +[[autodoc]] XLNetForSequenceClassification + - forward + +## XLNetForMultipleChoice + +[[autodoc]] XLNetForMultipleChoice + - forward + +## XLNetForTokenClassification + +[[autodoc]] XLNetForTokenClassification + - forward + +## XLNetForQuestionAnsweringSimple + +[[autodoc]] XLNetForQuestionAnsweringSimple + - forward + +## XLNetForQuestionAnswering + +[[autodoc]] XLNetForQuestionAnswering + - forward + +## TFXLNetModel + +[[autodoc]] TFXLNetModel + - call + +## TFXLNetLMHeadModel + +[[autodoc]] TFXLNetLMHeadModel + - call + +## TFXLNetForSequenceClassification + +[[autodoc]] TFXLNetForSequenceClassification + - call + +## TFLNetForMultipleChoice + +[[autodoc]] TFXLNetForMultipleChoice + - call + +## TFXLNetForTokenClassification + +[[autodoc]] TFXLNetForTokenClassification + - call + +## TFXLNetForQuestionAnsweringSimple + +[[autodoc]] TFXLNetForQuestionAnsweringSimple + - call diff --git a/docs/source/en/model_doc/xlnet.mdx b/docs/source/en/model_doc/xlnet.mdx deleted file mode 100644 index ca30574690c5..000000000000 --- a/docs/source/en/model_doc/xlnet.mdx +++ /dev/null @@ -1,154 +0,0 @@ - - -# XLNet - -## Overview - -The XLNet model was proposed in [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, -Quoc V. Le. XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method to learn -bidirectional contexts by maximizing the expected likelihood over all permutations of the input sequence factorization -order. 
- -The abstract from the paper is the following: - -*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves -better performance than pretraining approaches based on autoregressive language modeling. However, relying on -corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a -pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive -pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all -permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive -formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into -pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large -margin, including question answering, natural language inference, sentiment analysis, and document ranking.* - -Tips: - -- The specific attention pattern can be controlled at training and test time using the `perm_mask` input. -- Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained - using only a sub-set of the output tokens as target which are selected with the `target_mapping` input. -- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and - `target_mapping` inputs to control the attention span and outputs (see examples in - *examples/pytorch/text-generation/run_generation.py*) -- XLNet is one of the few models that has no sequence length limit. - -This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/zihangdai/xlnet/). 
- - -## XLNetConfig - -[[autodoc]] XLNetConfig - -## XLNetTokenizer - -[[autodoc]] XLNetTokenizer - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - save_vocabulary - -## XLNetTokenizerFast - -[[autodoc]] XLNetTokenizerFast - -## XLNet specific outputs - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetModelOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput - -[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput - -[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput - -## XLNetModel - -[[autodoc]] XLNetModel - - forward - -## XLNetLMHeadModel - -[[autodoc]] XLNetLMHeadModel - - forward - -## XLNetForSequenceClassification - -[[autodoc]] XLNetForSequenceClassification - - forward - -## XLNetForMultipleChoice - -[[autodoc]] XLNetForMultipleChoice - - forward - -## XLNetForTokenClassification - -[[autodoc]] XLNetForTokenClassification - - forward - -## XLNetForQuestionAnsweringSimple - -[[autodoc]] XLNetForQuestionAnsweringSimple - - forward - -## XLNetForQuestionAnswering - -[[autodoc]] XLNetForQuestionAnswering - - forward - -## TFXLNetModel - -[[autodoc]] TFXLNetModel - - call - -## TFXLNetLMHeadModel - -[[autodoc]] TFXLNetLMHeadModel - - call - -## TFXLNetForSequenceClassification - -[[autodoc]] TFXLNetForSequenceClassification - - call - -## TFLNetForMultipleChoice - -[[autodoc]] TFXLNetForMultipleChoice - - call - -## TFXLNetForTokenClassification - -[[autodoc]] TFXLNetForTokenClassification - - call - -## TFXLNetForQuestionAnsweringSimple - -[[autodoc]] TFXLNetForQuestionAnsweringSimple - - call diff --git a/docs/source/en/model_doc/xls_r.md b/docs/source/en/model_doc/xls_r.md new file mode 100644 index 000000000000..8e22004244ca --- /dev/null +++ b/docs/source/en/model_doc/xls_r.md @@ -0,0 +1,47 @@ + + +# XLS-R + +## Overview + +The XLS-R model was proposed in [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman +Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. + +The abstract from the paper is the following: + +*This paper presents XLS-R, a large-scale model for cross-lingual speech representation learning based on wav2vec 2.0. +We train models with up to 2B parameters on nearly half a million hours of publicly available speech audio in 128 +languages, an order of magnitude more public data than the largest known prior work. Our evaluation covers a wide range +of tasks, domains, data regimes and languages, both high and low-resource. 
On the CoVoST-2 speech translation +benchmark, we improve the previous state of the art by an average of 7.4 BLEU over 21 translation directions into +English. For speech recognition, XLS-R improves over the best known prior work on BABEL, MLS, CommonVoice as well as +VoxPopuli, lowering error rates by 14-34% relative on average. XLS-R also sets a new state of the art on VoxLingua107 +language identification. Moreover, we show that with sufficient model size, cross-lingual pretraining can outperform +English-only pretraining when translating English speech into other languages, a setting which favors monolingual +pretraining. We hope XLS-R can help to improve speech processing tasks for many more languages of the world.* + +Tips: + +- XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- XLS-R model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using + [`Wav2Vec2CTCTokenizer`]. + +Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. + +XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xls_r.mdx b/docs/source/en/model_doc/xls_r.mdx deleted file mode 100644 index 82a7e3b8afbd..000000000000 --- a/docs/source/en/model_doc/xls_r.mdx +++ /dev/null @@ -1,43 +0,0 @@ - - -# XLS-R - -## Overview - -The XLS-R model was proposed in [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman -Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. - -The abstract from the paper is the following: - -*This paper presents XLS-R, a large-scale model for cross-lingual speech representation learning based on wav2vec 2.0. -We train models with up to 2B parameters on nearly half a million hours of publicly available speech audio in 128 -languages, an order of magnitude more public data than the largest known prior work. Our evaluation covers a wide range -of tasks, domains, data regimes and languages, both high and low-resource. On the CoVoST-2 speech translation -benchmark, we improve the previous state of the art by an average of 7.4 BLEU over 21 translation directions into -English. For speech recognition, XLS-R improves over the best known prior work on BABEL, MLS, CommonVoice as well as -VoxPopuli, lowering error rates by 14-34% relative on average. XLS-R also sets a new state of the art on VoxLingua107 -language identification. Moreover, we show that with sufficient model size, cross-lingual pretraining can outperform -English-only pretraining when translating English speech into other languages, a setting which favors monolingual -pretraining. We hope XLS-R can help to improve speech processing tasks for many more languages of the world.* - -Tips: - -- XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- XLS-R model was trained using connectionist temporal classification (CTC) so the model output has to be decoded using - [`Wav2Vec2CTCTokenizer`]. - -Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. 
- -XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). - -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xlsr_wav2vec2.md b/docs/source/en/model_doc/xlsr_wav2vec2.md new file mode 100644 index 000000000000..643d37416d38 --- /dev/null +++ b/docs/source/en/model_doc/xlsr_wav2vec2.md @@ -0,0 +1,45 @@ + + +# XLSR-Wav2Vec2 + +## Overview + +The XLSR-Wav2Vec2 model was proposed in [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael +Auli. + +The abstract from the paper is the following: + +*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw +waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over +masked latent speech representations and jointly learns a quantization of the latents shared across languages. The +resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly +outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction +of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to +a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong +individual models. Analysis shows that the latent discrete speech representations are shared across languages with +increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing +XLSR-53, a large model pretrained in 53 languages.* + +Tips: + +- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. +- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be + decoded using [`Wav2Vec2CTCTokenizer`]. + +XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). + +The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xlsr_wav2vec2.mdx b/docs/source/en/model_doc/xlsr_wav2vec2.mdx deleted file mode 100644 index 32229f28b147..000000000000 --- a/docs/source/en/model_doc/xlsr_wav2vec2.mdx +++ /dev/null @@ -1,41 +0,0 @@ - - -# XLSR-Wav2Vec2 - -## Overview - -The XLSR-Wav2Vec2 model was proposed in [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael -Auli. - -The abstract from the paper is the following: - -*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw -waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over -masked latent speech representations and jointly learns a quantization of the latents shared across languages. The -resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly -outperforms monolingual pretraining. 
On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction -of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to -a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong -individual models. Analysis shows that the latent discrete speech representations are shared across languages with -increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing -XLSR-53, a large model pretrained in 53 languages.* - -Tips: - -- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. -- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be - decoded using [`Wav2Vec2CTCTokenizer`]. - -XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to [Wav2Vec2's documentation page](wav2vec2). - -The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec). diff --git a/docs/source/en/model_doc/xmod.md b/docs/source/en/model_doc/xmod.md new file mode 100644 index 000000000000..5a3409bbc4c3 --- /dev/null +++ b/docs/source/en/model_doc/xmod.md @@ -0,0 +1,131 @@ + + +# X-MOD + +## Overview + +The X-MOD model was proposed in [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, and Mikel Artetxe. +X-MOD extends multilingual masked language models like [XLM-R](xlm-roberta) to include language-specific modular components (_language adapters_) during pre-training. For fine-tuning, the language adapters in each transformer layer are frozen. + +The abstract from the paper is the following: + +*Multilingual pre-trained models are known to suffer from the curse of multilinguality, which causes per-language performance to drop as they cover more languages. We address this issue by introducing language-specific modules, which allows us to grow the total capacity of the model, while keeping the total number of trainable parameters per language constant. In contrast with prior work that learns language-specific components post-hoc, we pre-train the modules of our Cross-lingual Modular (X-MOD) models from the start. Our experiments on natural language inference, named entity recognition and question answering show that our approach not only mitigates the negative interference between languages, but also enables positive transfer, resulting in improved monolingual and cross-lingual performance. Furthermore, our approach enables adding languages post-hoc with no measurable drop in performance, no longer limiting the model usage to the set of pre-trained languages.* + +Tips: +- X-MOD is similar to [XLM-R](xlm-roberta), but a difference is that the input language needs to be specified so that the correct language adapter can be activated. +- The main models – base and large – have adapters for 81 languages. + +This model was contributed by [jvamvas](https://huggingface.co/jvamvas). +The original code can be found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/fairseq/models/xmod) and the original documentation is found [here](https://github.com/facebookresearch/fairseq/tree/58cc6cca18f15e6d56e3f60c959fe4f878960a60/examples/xmod). 
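+
+Since the correct language adapter has to be activated for each input, it can be useful to check which language codes a
+given checkpoint ships adapters for. A minimal sketch, assuming the checkpoint lists its adapter languages in the
+`languages` field of its configuration:
+
+```python
+from transformers import AutoConfig
+
+config = AutoConfig.from_pretrained("facebook/xmod-base")
+# Language codes with a pre-trained adapter, e.g. "en_XX", "de_DE", ...
+print(len(config.languages))
+print(list(config.languages)[:5])
+```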
+ +## Adapter Usage + +### Input language + +There are two ways to specify the input language: +1. By setting a default language before using the model: + +```python +from transformers import XmodModel + +model = XmodModel.from_pretrained("facebook/xmod-base") +model.set_default_language("en_XX") +``` + +2. By explicitly passing the index of the language adapter for each sample: + +```python +import torch + +input_ids = torch.tensor( + [ + [0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2], + [0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2], + ] +) +lang_ids = torch.LongTensor( + [ + 0, # en_XX + 8, # de_DE + ] +) +output = model(input_ids, lang_ids=lang_ids) +``` + +### Fine-tuning +The paper recommends that the embedding layer and the language adapters are frozen during fine-tuning. A method for doing this is provided: + +```python +model.freeze_embeddings_and_language_adapters() +# Fine-tune the model ... +``` + +### Cross-lingual transfer +After fine-tuning, zero-shot cross-lingual transfer can be tested by activating the language adapter of the target language: + +```python +model.set_default_language("de_DE") +# Evaluate the model on German examples ... +``` + +## Resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Causal language modeling task guide](../tasks/language_modeling) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## XmodConfig + +[[autodoc]] XmodConfig + +## XmodModel + +[[autodoc]] XmodModel + - forward + +## XmodForCausalLM + +[[autodoc]] XmodForCausalLM + - forward + +## XmodForMaskedLM + +[[autodoc]] XmodForMaskedLM + - forward + +## XmodForSequenceClassification + +[[autodoc]] XmodForSequenceClassification + - forward + +## XmodForMultipleChoice + +[[autodoc]] XmodForMultipleChoice + - forward + +## XmodForTokenClassification + +[[autodoc]] XmodForTokenClassification + - forward + +## XmodForQuestionAnswering + +[[autodoc]] XmodForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md new file mode 100644 index 000000000000..6185c3a06757 --- /dev/null +++ b/docs/source/en/model_doc/yolos.md @@ -0,0 +1,78 @@ + + +# YOLOS + +## Overview + +The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN. + +The abstract from the paper is the following: + +*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. 
We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.* + +Tips: + +- One can use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created. + + + + YOLOS architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS. + + + +- All example notebooks illustrating inference + fine-tuning [`YolosForObjectDetection`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS). +- See also: [Object detection task guide](../tasks/object_detection) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## YolosConfig + +[[autodoc]] YolosConfig + +## YolosImageProcessor + +[[autodoc]] YolosImageProcessor + - preprocess + - pad + - post_process_object_detection + +## YolosFeatureExtractor + +[[autodoc]] YolosFeatureExtractor + - __call__ + - pad + - post_process_object_detection + + +## YolosModel + +[[autodoc]] YolosModel + - forward + + +## YolosForObjectDetection + +[[autodoc]] YolosForObjectDetection + - forward diff --git a/docs/source/en/model_doc/yolos.mdx b/docs/source/en/model_doc/yolos.mdx deleted file mode 100644 index 838517ea765e..000000000000 --- a/docs/source/en/model_doc/yolos.mdx +++ /dev/null @@ -1,64 +0,0 @@ - - -# YOLOS - -## Overview - -The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN. - -The abstract from the paper is the following: - -*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. 
We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.* - -Tips: - -- One can use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created. -- Demo notebooks (regarding inference and fine-tuning on custom data) can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS). - - - - YOLOS architecture. Taken from the original paper. - -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS). - -## YolosConfig - -[[autodoc]] YolosConfig - -## YolosImageProcessor - -[[autodoc]] YolosImageProcessor - - preprocess - - pad - - post_process_object_detection - -## YolosFeatureExtractor - -[[autodoc]] YolosFeatureExtractor - - __call__ - - pad - - post_process_object_detection - - -## YolosModel - -[[autodoc]] YolosModel - - forward - - -## YolosForObjectDetection - -[[autodoc]] YolosForObjectDetection - - forward diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md new file mode 100644 index 000000000000..4b98cd348c9a --- /dev/null +++ b/docs/source/en/model_doc/yoso.md @@ -0,0 +1,102 @@ + + +# YOSO + +## Overview + +The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) +by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. YOSO approximates standard softmax self-attention +via a Bernoulli sampling scheme based on Locality Sensitive Hashing (LSH). In principle, all the Bernoulli random variables can be sampled with +a single hash. + +The abstract from the paper is the following: + +*Transformer-based models are widely used in natural language processing (NLP). Central to the transformer model is +the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically +on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling +attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear. +We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random +variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant). +This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of +LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence +length where we see favorable performance relative to a standard pretrained Transformer. On the Long Range Arena (LRA) benchmark, +for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable +speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL* + +Tips: + +- The YOSO attention algorithm is implemented through custom CUDA kernels, functions written in CUDA C++ that can be executed multiple times +in parallel on a GPU. 
+- The kernels provide a `fast_hash` function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. Using these +hash codes, the `lsh_cumulation` function approximates self-attention via LSH-based Bernoulli sampling. +- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully, +the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and +does not require compiling CUDA kernels. + + + + YOSO Attention Algorithm. Taken from the original paper. + +This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). + +## Documentation resources + +- [Text classification task guide](../tasks/sequence_classification) +- [Token classification task guide](../tasks/token_classification) +- [Question answering task guide](../tasks/question_answering) +- [Masked language modeling task guide](../tasks/masked_language_modeling) +- [Multiple choice task guide](../tasks/multiple_choice) + +## YosoConfig + +[[autodoc]] YosoConfig + + +## YosoModel + +[[autodoc]] YosoModel + - forward + + +## YosoForMaskedLM + +[[autodoc]] YosoForMaskedLM + - forward + + +## YosoForSequenceClassification + +[[autodoc]] YosoForSequenceClassification + - forward + +## YosoForMultipleChoice + +[[autodoc]] YosoForMultipleChoice + - forward + + +## YosoForTokenClassification + +[[autodoc]] YosoForTokenClassification + - forward + + +## YosoForQuestionAnswering + +[[autodoc]] YosoForQuestionAnswering + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/yoso.mdx b/docs/source/en/model_doc/yoso.mdx deleted file mode 100644 index 997ab4d09416..000000000000 --- a/docs/source/en/model_doc/yoso.mdx +++ /dev/null @@ -1,91 +0,0 @@ - - -# YOSO - -## Overview - -The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) -by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. YOSO approximates standard softmax self-attention -via a Bernoulli sampling scheme based on Locality Sensitive Hashing (LSH). In principle, all the Bernoulli random variables can be sampled with -a single hash. - -The abstract from the paper is the following: - -*Transformer-based models are widely used in natural language processing (NLP). Central to the transformer model is -the self-attention mechanism, which captures the interactions of token pairs in the input sequences and depends quadratically -on the sequence length. Training such models on longer sequences is expensive. In this paper, we show that a Bernoulli sampling -attention mechanism based on Locality Sensitive Hashing (LSH), decreases the quadratic complexity of such models to linear. -We bypass the quadratic cost by considering self-attention as a sum of individual tokens associated with Bernoulli random -variables that can, in principle, be sampled at once by a single hash (although in practice, this number may be a small constant). -This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of -LSH (to enable deployment on GPU architectures). We evaluate our algorithm on the GLUE benchmark with standard 512 sequence -length where we see favorable performance relative to a standard pretrained Transformer. 
On the Long Range Arena (LRA) benchmark, -for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable -speed-ups and memory savings and often outperforms other efficient self-attention methods. Our code is available at this https URL* - -Tips: - -- The YOSO attention algorithm is implemented through custom CUDA kernels, functions written in CUDA C++ that can be executed multiple times -in parallel on a GPU. -- The kernels provide a `fast_hash` function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. Using these -hash codes, the `lsh_cumulation` function approximates self-attention via LSH-based Bernoulli sampling. -- To use the custom kernels, the user should set `config.use_expectation = False`. To ensure that the kernels are compiled successfully, -the user must install the correct version of PyTorch and cudatoolkit. By default, `config.use_expectation = True`, which uses YOSO-E and -does not require compiling CUDA kernels. - - - - YOSO Attention Algorithm. Taken from the original paper. - -This model was contributed by [novice03](https://huggingface.co/novice03). The original code can be found [here](https://github.com/mlpen/YOSO). - - -## YosoConfig - -[[autodoc]] YosoConfig - - -## YosoModel - -[[autodoc]] YosoModel - - forward - - -## YosoForMaskedLM - -[[autodoc]] YosoForMaskedLM - - forward - - -## YosoForSequenceClassification - -[[autodoc]] YosoForSequenceClassification - - forward - -## YosoForMultipleChoice - -[[autodoc]] YosoForMultipleChoice - - forward - - -## YosoForTokenClassification - -[[autodoc]] YosoForTokenClassification - - forward - - -## YosoForQuestionAnswering - -[[autodoc]] YosoForQuestionAnswering - - forward \ No newline at end of file diff --git a/docs/source/en/model_memory_anatomy.md b/docs/source/en/model_memory_anatomy.md new file mode 100644 index 000000000000..0a0d5bb5b8bf --- /dev/null +++ b/docs/source/en/model_memory_anatomy.md @@ -0,0 +1,272 @@ + + +# Model training anatomy + +To understand performance optimization techniques that one can apply to improve efficiency of model training +speed and memory utilization, it's helpful to get familiar with how GPU is utilized during training, and how compute +intensity varies depending on an operation performed. + +Let's start by exploring a motivating example of GPU utilization and the training run of a model. For the demonstration, +we'll need to install a few libraries: + +```bash +pip install transformers datasets accelerate nvidia-ml-py3 +``` + +The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar +with the `nvidia-smi` command in the terminal - this library allows to access the same information in Python directly. + +Then, we create some dummy data: random token IDs between 100 and 30000 and binary labels for a classifier. +In total, we get 512 sequences each with length 512 and store them in a [`~datasets.Dataset`] with PyTorch format. + + +```py +>>> import numpy as np +>>> from datasets import Dataset + + +>>> seq_len, dataset_size = 512, 512 +>>> dummy_data = { +... "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), +... "labels": np.random.randint(0, 1, (dataset_size)), +... 
} +>>> ds = Dataset.from_dict(dummy_data) +>>> ds.set_format("pt") +``` + +To print summary statistics for the GPU utilization and the training run with the [`Trainer`] we define two helper functions: + +```py +>>> from pynvml import * + + +>>> def print_gpu_utilization(): +... nvmlInit() +... handle = nvmlDeviceGetHandleByIndex(0) +... info = nvmlDeviceGetMemoryInfo(handle) +... print(f"GPU memory occupied: {info.used//1024**2} MB.") + + +>>> def print_summary(result): +... print(f"Time: {result.metrics['train_runtime']:.2f}") +... print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") +... print_gpu_utilization() +``` + +Let's verify that we start with a free GPU memory: + +```py +>>> print_gpu_utilization() +GPU memory occupied: 0 MB. +``` + +That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on +your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by +the user. When a model is loaded to the GPU the kernels are also loaded, which can take up 1-2GB of memory. To see how +much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well. + +```py +>>> import torch + + +>>> torch.ones((1, 1)).to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 1343 MB. +``` + +We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how much space the model uses. + +## Load Model + +First, we load the `bert-large-uncased` model. We load the model weights directly to the GPU so that we can check +how much space just the weights use. + + +```py +>>> from transformers import AutoModelForSequenceClassification + + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 2631 MB. +``` + +We can see that the model weights alone take up 1.3 GB of GPU memory. The exact number depends on the specific +GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an +optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result +as with `nvidia-smi` CLI: + + +```bash +nvidia-smi +``` + +```bash +Tue Jan 11 08:58:05 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | +| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB | ++-----------------------------------------------------------------------------+ +``` + +We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. 
So now we can +start training the model and see how the GPU memory consumption changes. First, we set up a few standard training +arguments: + +```py +default_args = { + "output_dir": "tmp", + "evaluation_strategy": "steps", + "num_train_epochs": 1, + "log_level": "error", + "report_to": "none", +} +``` + + + + If you plan to run multiple experiments, in order to properly clear the memory between experiments, restart the Python + kernel between experiments. + + + +## Memory utilization at vanilla training + +Let's use the [`Trainer`] and train the model without using any GPU performance optimization techniques and a batch size of 4: + +```py +>>> from transformers import TrainingArguments, Trainer, logging + +>>> logging.set_verbosity_error() + + +>>> training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +>>> trainer = Trainer(model=model, args=training_args, train_dataset=ds) +>>> result = trainer.train() +>>> print_summary(result) +``` + +``` +Time: 57.82 +Samples/second: 8.86 +GPU memory occupied: 14949 MB. +``` + +We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size +can often result in faster model convergence or better end performance. So ideally we want to tune the batch size to our +model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. +To understand a bit better why this is the case let's have a look at a model's operations and memory needs. + +## Anatomy of Model's Operations + +Transformers architecture includes 3 main groups of operations grouped below by compute-intensity. + +1. **Tensor Contractions** + + Linear layers and components of Multi-Head Attention all do batched **matrix-matrix multiplications**. These operations are the most compute-intensive part of training a transformer. + +2. **Statistical Normalizations** + + Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more **reduction operations**, the result of which is then applied via a map. + +3. **Element-wise Operators** + + These are the remaining operators: **biases, dropout, activations, and residual connections**. These are the least compute-intensive operations. + +This knowledge can be helpful to know when analyzing performance bottlenecks. + +This summary is derived from [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://arxiv.org/abs/2007.00072) + + +## Anatomy of Model's Memory + +We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there +are many components during training that use GPU memory. The components on GPU memory are the following: + +1. model weights +2. optimizer states +3. gradients +4. forward activations saved for gradient computation +5. temporary buffers +6. functionality-specific memory + +A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For +inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per +model parameter for mixed precision inference, plus activation memory. + +Let's look at the details. 
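+
+As a rough sanity check before breaking these numbers down, the short sketch below applies the per-parameter estimates
+above to the `bert-large-uncased` model loaded earlier (the `estimate_gb` helper is only illustrative, not part of the
+library). The estimate covers weights, optimizer states and gradients only; the memory measured during training above is
+higher because activations, temporary buffers and the CUDA kernels come on top.
+
+```py
+# Back-of-the-envelope estimate using the byte counts discussed above.
+# `model` is the AutoModelForSequenceClassification instance loaded earlier.
+num_params = sum(p.numel() for p in model.parameters())
+
+
+def estimate_gb(bytes_per_param):
+    # weights + optimizer states + gradients only; activations come on top
+    return num_params * bytes_per_param / 1024**3
+
+
+print(f"Parameters: {num_params / 1e6:.0f}M")
+print(f"Mixed precision training with AdamW: ~{estimate_gb(18):.1f} GB + activations")
+print(f"Mixed precision inference: ~{estimate_gb(6):.1f} GB + activations")
+```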
+
+**Model Weights:**
+
+- 4 bytes * number of parameters for fp32 training
+- 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
+
+**Optimizer States:**
+
+- 8 bytes * number of parameters for normal AdamW (maintains 2 states)
+- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+- 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
+
+**Gradients:**
+
+- 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32)
+
+**Forward Activations:**
+
+- size depends on many factors, the key ones being sequence length, hidden size and batch size.
+
+This includes the inputs and outputs passed to and returned by the forward and backward functions, as well as the
+forward activations saved for gradient computation.
+
+**Temporary Memory**
+
+Additionally, there are all kinds of temporary variables that are released once the calculation is done, but which can
+require additional memory in the meantime and push the run out of memory (OOM). Therefore, when coding it's crucial to think
+strategically about such temporary variables and sometimes to explicitly free them as soon as they are no longer needed.
+
+**Functionality-specific memory**
+
+Your software may also have special memory needs. For example, when generating text using beam search, the software
+needs to maintain multiple copies of inputs and outputs.
+
+**`forward` vs `backward` Execution Speed**
+
+For convolutions and linear layers there are 2x the flops in the backward pass compared to the forward pass, which generally
+translates into a backward pass that is roughly 2x slower (sometimes more, because sizes in the backward tend to be more awkward).
+Activations are usually bandwidth-limited, and it’s typical for an activation to have to read more data in the backward pass than
+in the forward pass (e.g. the activation forward reads once and writes once, while the activation backward reads twice, gradOutput
+and the output of the forward, and writes once, gradInput).
+
+As you can see, there are potentially a few places where we could save GPU memory or speed up operations.
+Now that you understand what affects GPU utilization and computation speed, refer to
+the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) documentation page to learn about
+performance optimization techniques. diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md new file mode 100644 index 000000000000..84d287570da1 --- /dev/null +++ b/docs/source/en/model_sharing.md @@ -0,0 +1,232 @@
+
+
+# Share a model
+
+The last two tutorials showed how you can fine-tune a model with PyTorch, Keras, and 🤗 Accelerate for distributed setups. The next step is to share your model with the community! At Hugging Face, we believe in openly sharing knowledge and resources to democratize artificial intelligence for everyone. We encourage you to consider sharing your model with the community to help others save time and resources.
+
+In this tutorial, you will learn two methods for sharing a trained or fine-tuned model on the [Model Hub](https://huggingface.co/models):
+
+- Programmatically push your files to the Hub.
+- Drag-and-drop your files to the Hub with the web interface.
+
+
+
+
+To share a model with the community, you need an account on [huggingface.co](https://huggingface.co/join). You can also join an existing organization or create a new one.
+ + + +## Repository features + +Each repository on the Model Hub behaves like a typical GitHub repository. Our repositories offer versioning, commit history, and the ability to visualize differences. + +The Model Hub's built-in versioning is based on git and [git-lfs](https://git-lfs.github.com/). In other words, you can treat one model as one repository, enabling greater access control and scalability. Version control allows *revisions*, a method for pinning a specific version of a model with a commit hash, tag or branch. + +As a result, you can load a specific model version with the `revision` parameter: + +```py +>>> model = AutoModel.from_pretrained( +... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... ) +``` + +Files are also easily edited in a repository, and you can view the commit history as well as the difference: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Setup + +Before sharing a model to the Hub, you will need your Hugging Face credentials. If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. This will store your access token in your Hugging Face cache folder (`~/.cache/` by default): + +```bash +huggingface-cli login +``` + +If you are using a notebook like Jupyter or Colaboratory, make sure you have the [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) library installed. This library allows you to programmatically interact with the Hub. + +```bash +pip install huggingface_hub +``` + +Then use `notebook_login` to sign-in to the Hub, and follow the link [here](https://huggingface.co/settings/token) to generate a token to login with: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Convert a model for all frameworks + +To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. + +Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. + + + +Specify `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +>>> pt_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +Specify `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +``` + +Then you can save your new TensorFlow model with its new checkpoint: + +```py +>>> tf_model.save_pretrained("path/to/awesome-name-you-picked") +``` + + +If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/to/awesome-name-you-picked", from_pt=True +... ) +``` + + + +## Push a model during training + + + + + +Sharing a model to the Hub is as simple as adding an extra parameter or callback. 
Remember from the [fine-tuning tutorial](training), the [`TrainingArguments`] class is where you specify hyperparameters and additional training options. One of these training options includes the ability to push a model directly to the Hub. Set `push_to_hub=True` in your [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +``` + +Pass your training arguments as usual to [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +After you fine-tune your model, call [`~transformers.Trainer.push_to_hub`] on [`Trainer`] to push the trained model to the Hub. 🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! + +```py +>>> trainer.push_to_hub() +``` + + +Share a model to the Hub with [`PushToHubCallback`]. In the [`PushToHubCallback`] function, add: + +- An output directory for your model. +- A tokenizer. +- The `hub_model_id`, which is your Hub username and model name. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +... ) +``` + +Add the callback to [`fit`](https://keras.io/api/models/model_training_apis/), and 🤗 Transformers will push the trained model to the Hub: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + + + +## Use the `push_to_hub` function + +You can also call `push_to_hub` directly on your model to upload it to the Hub. + +Specify your model name in `push_to_hub`: + +```py +>>> pt_model.push_to_hub("my-awesome-model") +``` + +This creates a repository under your username with the model name `my-awesome-model`. Users can now load your model with the `from_pretrained` function: + +```py +>>> from transformers import AutoModel + +>>> model = AutoModel.from_pretrained("your_username/my-awesome-model") +``` + +If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`: + +```py +>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") +``` + +The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository: + +```py +>>> tokenizer.push_to_hub("my-awesome-model") +``` + +Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: + +```py +>>> tf_model.push_to_hub("my-awesome-model") +``` + +Now when you navigate to your Hugging Face profile, you should see your newly created model repository. Clicking on the **Files** tab will display all the files you've uploaded to the repository. + +For more details on how to create and upload files to a repository, refer to the Hub documentation [here](https://huggingface.co/docs/hub/how-to-upstream). + +## Upload with the web interface + +Users who prefer a no-code approach are able to upload a model through the Hub's web interface. Visit [huggingface.co/new](https://huggingface.co/new) to create a new repository: + +![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) + +From here, add some information about your model: + +- Select the **owner** of the repository. 
This can be yourself or any of the organizations you belong to. +- Pick a name for your model, which will also be the repository name. +- Choose whether your model is public or private. +- Specify the license usage for your model. + +Now click on the **Files** tab and click on the **Add file** button to upload a new file to your repository. Then drag-and-drop a file to upload and add a commit message. + +![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) + +## Add a model card + +To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. The model card is defined in the `README.md` file. You can add a model card by: + +* Manually creating and uploading a `README.md` file. +* Clicking on the **Edit model card** button in your model repository. + +Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/en/model_sharing.mdx b/docs/source/en/model_sharing.mdx deleted file mode 100644 index e6bd7fc4a6af..000000000000 --- a/docs/source/en/model_sharing.mdx +++ /dev/null @@ -1,228 +0,0 @@ - - -# Share a model - -The last two tutorials showed how you can fine-tune a model with PyTorch, Keras, and 🤗 Accelerate for distributed setups. The next step is to share your model with the community! At Hugging Face, we believe in openly sharing knowledge and resources to democratize artificial intelligence for everyone. We encourage you to consider sharing your model with the community to help others save time and resources. - -In this tutorial, you will learn two methods for sharing a trained or fine-tuned model on the [Model Hub](https://huggingface.co/models): - -- Programmatically push your files to the Hub. -- Drag-and-drop your files to the Hub with the web interface. - - - - - -To share a model with the community, you need an account on [huggingface.co](https://huggingface.co/join). You can also join an existing organization or create a new one. - - - -## Repository features - -Each repository on the Model Hub behaves like a typical GitHub repository. Our repositories offer versioning, commit history, and the ability to visualize differences. - -The Model Hub's built-in versioning is based on git and [git-lfs](https://git-lfs.github.com/). In other words, you can treat one model as one repository, enabling greater access control and scalability. Version control allows *revisions*, a method for pinning a specific version of a model with a commit hash, tag or branch. - -As a result, you can load a specific model version with the `revision` parameter: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash -... ) -``` - -Files are also easily edited in a repository, and you can view the commit history as well as the difference: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Setup - -Before sharing a model to the Hub, you will need your Hugging Face credentials. 
If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. This will store your access token in your Hugging Face cache folder (`~/.cache/` by default): - -```bash -huggingface-cli login -``` - -If you are using a notebook like Jupyter or Colaboratory, make sure you have the [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) library installed. This library allows you to programmatically interact with the Hub. - -```bash -pip install huggingface_hub -``` - -Then use `notebook_login` to sign-in to the Hub, and follow the link [here](https://huggingface.co/settings/token) to generate a token to login with: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Convert a model for all frameworks - -To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. - -Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. - - - -Specify `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Specify `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) -``` - -Then you can save your new TensorFlow model with it's new checkpoint: - -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - - - -## Push a model during training - - - - - -Sharing a model to the Hub is as simple as adding an extra parameter or callback. Remember from the [fine-tuning tutorial](training), the [`TrainingArguments`] class is where you specify hyperparameters and additional training options. One of these training options includes the ability to push a model directly to the Hub. Set `push_to_hub=True` in your [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) -``` - -Pass your training arguments as usual to [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -After you fine-tune your model, call [`~transformers.Trainer.push_to_hub`] on [`Trainer`] to push the trained model to the Hub. 🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! - -```py ->>> trainer.push_to_hub() -``` - - -Share a model to the Hub with [`PushToHubCallback`]. In the [`PushToHubCallback`] function, add: - -- An output directory for your model. 
-- A tokenizer. -- The `hub_model_id`, which is your Hub username and model name. - -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... ) -``` - -Add the callback to [`fit`](https://keras.io/api/models/model_training_apis/), and 🤗 Transformers will push the trained model to the Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - - -## Use the `push_to_hub` function - -You can also call `push_to_hub` directly on your model to upload it to the Hub. - -Specify your model name in `push_to_hub`: - -```py ->>> pt_model.push_to_hub("my-awesome-model") -``` - -This creates a repository under your username with the model name `my-awesome-model`. Users can now load your model with the `from_pretrained` function: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") -``` - -If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`: - -```py ->>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") -``` - -The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository: - -```py ->>> tokenizer.push_to_hub("my-awesome-model") -``` - -Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: - -```py ->>> tf_model.push_to_hub("my-awesome-model") -``` - -Now when you navigate to the your Hugging Face profile, you should see your newly created model repository. Clicking on the **Files** tab will display all the files you've uploaded to the repository. - -For more details on how to create and upload files to a repository, refer to the Hub documentation [here](https://huggingface.co/docs/hub/how-to-upstream). - -## Upload with the web interface - -Users who prefer a no-code approach are able to upload a model through the Hub's web interface. Visit [huggingface.co/new](https://huggingface.co/new) to create a new repository: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -From here, add some information about your model: - -- Select the **owner** of the repository. This can be yourself or any of the organizations you belong to. -- Pick a name for your model, which will also be the repository name. -- Choose whether your model is public or private. -- Specify the license usage for your model. - -Now click on the **Files** tab and click on the **Add file** button to upload a new file to your repository. Then drag-and-drop a file to upload and add a commit message. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Add a model card - -To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. The model card is defined in the `README.md` file. You can add a model card by: - -* Manually creating and uploading a `README.md` file. -* Clicking on the **Edit model card** button in your model repository. - -Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. 
For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/en/model_summary.md b/docs/source/en/model_summary.md new file mode 100644 index 000000000000..10acb4c50210 --- /dev/null +++ b/docs/source/en/model_summary.md @@ -0,0 +1,107 @@ + + +# The Transformer model family + +Since its introduction in 2017, the [original Transformer](https://arxiv.org/abs/1706.03762) model has inspired many new and exciting models that extend beyond natural language processing (NLP) tasks. There are models for [predicting the folded structure of proteins](https://huggingface.co/blog/deep-learning-with-proteins), [training a cheetah to run](https://huggingface.co/blog/train-decision-transformers), and [time series forecasting](https://huggingface.co/blog/time-series-transformers). With so many Transformer variants available, it can be easy to miss the bigger picture. What all these models have in common is they're based on the original Transformer architecture. Some models only use the encoder or decoder, while others use both. This provides a useful taxonomy to categorize and examine the high-level differences within models in the Transformer family, and it'll help you understand Transformers you haven't encountered before. + +If you aren't familiar with the original Transformer model or need a refresher, check out the [How do Transformers work](https://huggingface.co/course/chapter1/4?fw=pt) chapter from the Hugging Face course. + +
+ +
+ +## Computer vision + + + +### Convolutional network + +For a long time, convolutional networks (CNNs) were the dominant paradigm for computer vision tasks until the [Vision Transformer](https://arxiv.org/abs/2010.11929) demonstrated its scalability and efficiency. Even then, some of a CNN's best qualities, like translation invariance, are so powerful (especially for certain tasks) that some Transformers incorporate convolutions in their architecture. [ConvNeXt](model_doc/convnext) flipped this exchange around and incorporated design choices from Transformers to modernize a CNN. For example, ConvNeXt uses non-overlapping sliding windows to patchify an image and a larger kernel to increase its global receptive field. ConvNeXt also makes several layer design choices to be more memory-efficient and improve performance, so it competes favorably with Transformers! + +### Encoder[[cv-encoder]] + +The [Vision Transformer (ViT)](model_doc/vit) opened the door to computer vision tasks without convolutions. ViT uses a standard Transformer encoder, but its main breakthrough was how it treated an image. It splits an image into fixed-size patches and uses them to create an embedding, just like how a sentence is split into tokens. ViT capitalized on the Transformers' efficient architecture to demonstrate competitive results with the CNNs at the time while requiring fewer resources to train. ViT was soon followed by other vision models that could also handle dense vision tasks like segmentation as well as detection. + +One of these models is the [Swin](model_doc/swin) Transformer. It builds hierarchical feature maps (like a CNN 👀 and unlike ViT) from smaller-sized patches and merges them with neighboring patches in deeper layers. Attention is only computed within a local window, and the window is shifted between attention layers to create connections to help the model learn better. Since the Swin Transformer can produce hierarchical feature maps, it is a good candidate for dense prediction tasks like segmentation and detection. The [SegFormer](model_doc/segformer) also uses a Transformer encoder to build hierarchical feature maps, but it adds a simple multilayer perceptron (MLP) decoder on top to combine all the feature maps and make a prediction. + +Other vision models, like BeIT and ViTMAE, drew inspiration from BERT's pretraining objective. [BeIT](model_doc/beit) is pretrained by *masked image modeling (MIM)*; the image patches are randomly masked, and the image is also tokenized into visual tokens. BeIT is trained to predict the visual tokens corresponding to the masked patches. [ViTMAE](model_doc/vitmae) has a similar pretraining objective, except it must predict the pixels instead of visual tokens. What's unusual is 75% of the image patches are masked! The decoder reconstructs the pixels from the masked tokens and encoded patches. After pretraining, the decoder is thrown away, and the encoder is ready to be used in downstream tasks. + +### Decoder[[cv-decoder]] + +Decoder-only vision models are rare because most vision models rely on an encoder to learn an image representation. But for use cases like image generation, the decoder is a natural fit, as we've seen from text generation models like GPT-2. [ImageGPT](model_doc/imagegpt) uses the same architecture as GPT-2, but instead of predicting the next token in a sequence, it predicts the next pixel in an image. In addition to image generation, ImageGPT could also be finetuned for image classification. 
+ +### Encoder-decoder[[cv-encoder-decoder]] + +Vision models commonly use an encoder (also known as a backbone) to extract important image features before passing them to a Transformer decoder. [DETR](model_doc/detr) has a pretrained backbone, but it also uses the complete Transformer encoder-decoder architecture for object detection. The encoder learns image representations and combines them with object queries (each object query is a learned embedding that focuses on a region or object in an image) in the decoder. DETR predicts the bounding box coordinates and class label for each object query. + +## Natural language processing + + + +### Encoder[[nlp-encoder]] + +[BERT](model_doc/bert) is an encoder-only Transformer that randomly masks certain tokens in the input to avoid seeing other tokens, which would allow it to "cheat". The pretraining objective is to predict the masked token based on the context. This allows BERT to fully use the left and right contexts to help it learn a deeper and richer representation of the inputs. However, there was still room for improvement in BERT's pretraining strategy. [RoBERTa](model_doc/roberta) improved upon this by introducing a new pretraining recipe that includes training for longer and on larger batches, randomly masking tokens at each epoch instead of just once during preprocessing, and removing the next-sentence prediction objective. + +The dominant strategy to improve performance is to increase the model size. But training large models is computationally expensive. One way to reduce computational costs is using a smaller model like [DistilBERT](model_doc/distilbert). DistilBERT uses [knowledge distillation](https://arxiv.org/abs/1503.02531) - a compression technique - to create a smaller version of BERT while keeping nearly all of its language understanding capabilities. + +However, most Transformer models continued to trend towards more parameters, leading to new models focused on improving training efficiency. [ALBERT](model_doc/albert) reduces memory consumption by lowering the number of parameters in two ways: separating the larger vocabulary embedding into two smaller matrices and allowing layers to share parameters. [DeBERTa](model_doc/deberta) added a disentangled attention mechanism where the word and its position are separately encoded in two vectors. The attention is computed from these separate vectors instead of a single vector containing the word and position embeddings. [Longformer](model_doc/longformer) also focused on making attention more efficient, especially for processing documents with longer sequence lengths. It uses a combination of local windowed attention (attention only calculated from fixed window size around each token) and global attention (only for specific task tokens like `[CLS]` for classification) to create a sparse attention matrix instead of a full attention matrix. + +### Decoder[[nlp-decoder]] + +[GPT-2](model_doc/gpt2) is a decoder-only Transformer that predicts the next word in the sequence. It masks tokens to the right so the model can't "cheat" by looking ahead. By pretraining on a massive body of text, GPT-2 became really good at generating text, even if the text is only sometimes accurate or true. But GPT-2 lacked the bidirectional context from BERT's pretraining, which made it unsuitable for certain tasks. [XLNET](model_doc/xlnet) combines the best of both BERT and GPT-2's pretraining objectives by using a permutation language modeling objective (PLM) that allows it to learn bidirectionally. 
+ +After GPT-2, language models grew even bigger and are now known as *large language models (LLMs)*. LLMs demonstrate few- or even zero-shot learning if pretrained on a large enough dataset. [GPT-J](model_doc/gptj) is an LLM with 6B parameters trained on 400B tokens. GPT-J was followed by [OPT](model_doc/opt), a family of decoder-only models, the largest of which has 175B parameters and is trained on 180B tokens. [BLOOM](model_doc/bloom) was released around the same time, and the largest model in the family has 176B parameters and is trained on 366B tokens in 46 languages and 13 programming languages. + +### Encoder-decoder[[nlp-encoder-decoder]] + +[BART](model_doc/bart) keeps the original Transformer architecture, but it modifies the pretraining objective with *text infilling* corruption, where some text spans are replaced with a single `mask` token. The decoder predicts the uncorrupted tokens (future tokens are masked) and uses the encoder's hidden states to help it. [Pegasus](model_doc/pegasus) is similar to BART, but Pegasus masks entire sentences instead of text spans. In addition to masked language modeling, Pegasus is pretrained by gap sentence generation (GSG). The GSG objective masks whole sentences that are important to a document, replacing them with a `mask` token. The decoder must generate the output from the remaining sentences. [T5](model_doc/t5) takes a more unified approach and casts all NLP tasks into a text-to-text problem using specific prefixes. For example, the prefix `Summarize:` indicates a summarization task. T5 is pretrained with both supervised training (on the GLUE and SuperGLUE benchmarks) and self-supervised training (randomly sampling and dropping out 15% of tokens). + +## Audio + + + +### Encoder[[audio-encoder]] + +[Wav2Vec2](model_doc/wav2vec2) uses a Transformer encoder to learn speech representations directly from raw audio waveforms. It is pretrained with a contrastive task to distinguish the true speech representation from a set of false ones. [HuBERT](model_doc/hubert) is similar to Wav2Vec2 but has a different training process. Target labels are created by a clustering step in which segments of similar audio are assigned to a cluster, which becomes a hidden unit. The hidden unit is mapped to an embedding to make a prediction. + +### Encoder-decoder[[audio-encoder-decoder]] + +[Speech2Text](model_doc/speech_to_text) is a speech model designed for automatic speech recognition (ASR) and speech translation. The model accepts log mel-filter bank features extracted from the audio waveform and is pretrained autoregressively to generate a transcript or translation. [Whisper](model_doc/whisper) is also an ASR model, but unlike many other speech models, it is pretrained on a massive amount of ✨ labeled ✨ audio transcription data for zero-shot performance. A large chunk of the dataset also contains non-English languages, meaning Whisper can also be used for low-resource languages. Structurally, Whisper is similar to Speech2Text. The audio signal is converted to a log-mel spectrogram, which is encoded by the encoder. The decoder generates the transcript autoregressively from the encoder's hidden states and the previous tokens. + +## Multimodal + + + +### Encoder[[mm-encoder]] + +[VisualBERT](model_doc/visual_bert) is a multimodal model for vision-language tasks released shortly after BERT. It combines BERT and a pretrained object detection system to extract image features into visual embeddings, which are passed alongside the text embeddings to BERT. 
VisualBERT predicts the masked text based on the unmasked text and the visual embeddings, and it also has to predict whether the text is aligned with the image. When ViT was released, [ViLT](model_doc/vilt) adopted ViT in its architecture because it was easier to get the image embeddings this way. The image embeddings are jointly processed with the text embeddings. From there, ViLT is pretrained by image-text matching, masked language modeling, and whole word masking. + +[CLIP](model_doc/clip) takes a different approach and makes a prediction over (`image`, `text`) pairs. An image encoder (ViT) and a text encoder (Transformer) are jointly trained on a 400 million (`image`, `text`) pair dataset to maximize the similarity between the image and text embeddings of the (`image`, `text`) pairs. After pretraining, you can use natural language to instruct CLIP to predict the text given an image or vice versa. [OWL-ViT](model_doc/owlvit) builds on top of CLIP by using it as its backbone for zero-shot object detection. After pretraining, an object detection head is added to make a set prediction over the (`class`, `bounding box`) pairs. + +### Encoder-decoder[[mm-encoder-decoder]] + +Optical character recognition (OCR) is a long-standing text recognition task that typically involves several components to understand the image and generate the text. [TrOCR](model_doc/trocr) simplifies the process using an end-to-end Transformer. The encoder is a ViT-style model for image understanding and processes the image as fixed-size patches. The decoder accepts the encoder's hidden states and autoregressively generates text. [Donut](model_doc/donut) is a more general visual document understanding model that doesn't rely on OCR-based approaches. It uses a Swin Transformer as the encoder and multilingual BART as the decoder. Donut is pretrained to read text by predicting the next word based on the image and text annotations. The decoder generates a token sequence given a prompt. The prompt is represented by a special token for each downstream task. For example, document parsing has a special `parsing` token that is combined with the encoder hidden states to parse the document into a structured output format (JSON). + +## Reinforcement learning + + + +### Decoder[[rl-decoder]] + +The Decision Transformer and Trajectory Transformer cast the state, action, and reward as a sequence modeling problem. The [Decision Transformer](model_doc/decision_transformer) generates a series of actions that lead to a future desired return based on returns-to-go, past states, and actions. For the last *K* timesteps, each of the three modalities is converted into token embeddings and processed by a GPT-like model to predict a future action token. [Trajectory Transformer](model_doc/trajectory_transformer) also tokenizes the states, actions, and rewards and processes them with a GPT architecture. Unlike the Decision Transformer, which is focused on reward conditioning, the Trajectory Transformer generates future actions with beam search. \ No newline at end of file diff --git a/docs/source/en/model_summary.mdx b/docs/source/en/model_summary.mdx deleted file mode 100644 index b9799ab59129..000000000000 --- a/docs/source/en/model_summary.mdx +++ /dev/null @@ -1,955 +0,0 @@ - - -# Summary of the models - -This is a summary of the most downloaded models in 🤗 Transformers. Click on the large outermost bubble of each pretrained model category (encoder, decoder, encoder-decoder) to zoom in and out to see the most popular models within a modality. 
The size of each bubble corresponds to the number of downloads of each model. - - - -It assumes you're familiar with the original [transformer -model](https://arxiv.org/abs/1706.03762). For a gentle introduction check the [annotated transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html). Here we focus on the high-level differences between the -models. You can check them more in detail in their respective documentation. Also check out [the Model Hub](https://huggingface.co/models) where you can filter the checkpoints by model architecture. - -Each one of the models in the library falls into one of the following categories: - -- [autoregressive-models](#autoregressive-models) -- [autoencoding-models](#autoencoding-models) -- [seq-to-seq-models](#seq-to-seq-models) -- [multimodal-models](#multimodal-models) -- [retrieval-based-models](#retrieval-based-models) - - - -Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the -previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full -sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those -models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. A -typical example of such models is GPT. - -Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original -sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the -full inputs without any mask. Those models usually build a bidirectional representation of the whole sentence. They can -be fine-tuned and achieve great results on many tasks such as text generation, but their most natural application is -sentence classification or token classification. A typical example of such models is BERT. - -Note that the only difference between autoregressive models and autoencoding models is in the way the model is -pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given -model has been used for both types of pretraining, we have put it in the category corresponding to the article where it -was first introduced. - -Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation -tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their -most natural applications are translation, summarization and question answering. The original transformer model is an -example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks. - -Multimodal models mix text inputs with other kinds (e.g. images) and are more specific to a given task. - - - -## Decoders or autoregressive models - -As mentioned before, these models rely on the decoder part of the original transformer and use an attention mask so -that at each position, the model can only look at the tokens before the attention heads. - - - -### Original GPT - -
- -Models - - -Doc - - -Spaces - -
- - -[Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf), Alec Radford et al. - -The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset. - -The library provides versions of the model for language modeling and multitask language modeling/multiple choice -classification. - -### GPT-2 - -
- -Models - - -Doc - - -Spaces - -
- - -[Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf), -Alec Radford et al. - -A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or -more). - -The library provides versions of the model for language modeling and multitask language modeling/multiple choice -classification. - -### CTRL - -
- -Models - - -Doc - - -Spaces - -
- - -[CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858), -Nitish Shirish Keskar et al. - -Same as the GPT model but adds the idea of control codes. Text is generated from a prompt (can be empty) and one (or -several) of those control codes which are then used to influence the text generation: generate with the style of -wikipedia article, a book or a movie review. - -The library provides a version of the model for language modeling only. - -### Transformer-XL - -
- -Models - - -Doc - - -Spaces - -
- - -[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860), Zihang -Dai et al. - -Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular -RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that -may span across multiple documents, and segments are fed in order to the model. - -Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention -scores. This allows the model to pay attention to information that was in the previous segment as well as the current -one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. - -This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would -give the same results in the current input and the current hidden state at a given position) and needs to make some -adjustments in the way attention scores are computed. - -The library provides a version of the model for language modeling only. - - - -### Reformer - -
- -Models - - -Doc - - -Spaces - -
- - -[Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451), Nikita Kitaev et al . - -An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks -include: - -- Use [Axial position encoding](#axial-pos-encoding) (see below for more details). It’s a mechanism to avoid - having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller - matrices. -- Replace traditional attention by [LSH (local-sensitive hashing) attention](#lsh-attention) (see below for more - details). It's a technique to avoid computing the full product query-key in the attention layers. -- Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during - the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them - for results inside a given layer (less efficient than storing them but saves memory). -- Compute the feedforward operations by chunks and not on the whole batch. - -With those tricks, the model can be fed much larger sentences than traditional transformer autoregressive models. - - - -This model could be very well be used in an autoencoding setting, there is no checkpoint for such a -pretraining yet, though. - - - -The library provides a version of the model for language modeling only. - -### XLNet - -
- -Models - - -Doc - - -Spaces - -
- - -[XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237), Zhilin -Yang et al. - -XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the -tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done -with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens -for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length. - -XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies. - -The library provides a version of the model for language modeling, token classification, sentence classification, -multiple choice classification and question answering. - - - -## Encoders or autoencoding models - -As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can -look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their -corrupted versions. - - - -### BERT - -
- -Models - - -Doc - - -Spaces - -
- -[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805), -Jacob Devlin et al. - -Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually -15%) is masked by: - -- a special mask token with probability 0.8 -- a random token different from the one masked with probability 0.1 -- the same token with probability 0.1 - -The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a -separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% -they are not related. The model has to predict if the sentences are consecutive or not. - -The library provides a version of the model for language modeling (traditional or masked), next sentence prediction, -token classification, sentence classification, multiple choice classification and question answering. - -### ALBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), -Zhenzhong Lan et al. - -Same as BERT but with a few tweaks: - -- Embedding size E is different from hidden size H justified because the embeddings are context independent (one - embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a - sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V - being the vocab size). If E < H, it has less parameters. -- Layers are split in groups that share parameters (to save memory). -- Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and - B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have - been swapped or not. - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - -### RoBERTa - -
- -Models - - -Doc - - -Spaces - -
- - -[RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692), Yinhan Liu et al. - -Same as BERT with better pretraining tricks: - -- dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all -- no NSP (next sentence prediction) loss and instead of putting just two sentences together, put a chunk of - contiguous texts together to reach 512 tokens (so the sentences are in an order than may span several documents) -- train with larger batches -- use BPE with bytes as a subunit and not characters (because of unicode characters) - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - -### DistilBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108), -Victor Sanh et al. - -Same as BERT but smaller. Trained by distillation of the pretrained BERT model, meaning it's been trained to predict -the same probabilities as the larger model. The actual objective is a combination of: - -- finding the same probabilities as the teacher model -- predicting the masked tokens correctly (but no next-sentence objective) -- a cosine similarity between the hidden states of the student and the teacher model - -The library provides a version of the model for masked language modeling, token classification, sentence classification -and question answering. - -### ConvBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496), Zihang Jiang, -Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. - -Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural -language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large -memory footprint and computation cost. Although all its attention heads query on the whole input sequence for -generating the attention map from a global perspective, we observe some heads only need to learn local dependencies, -which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to -replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the -rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context -learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that -ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and -fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while -using less than 1/4 training cost. - -The library provides a version of the model for masked language modeling, token classification, sentence classification -and question answering. - -### XLM - -
- -Models - - -Doc - - -Spaces - -
- - -[Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291), Guillaume Lample and Alexis Conneau - -A transformer model trained on several languages. There are three different type of training for this model and the -library provides checkpoints for all of them: - -- Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the - previous section as well). One of the languages is selected for each training sample, and the model input is a - sentence of 256 tokens, that may span over several documents in one of those languages. -- Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample, - and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages, - with dynamic masking of the tokens. -- A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two - different languages, with random masking. To predict one of the masked tokens, the model can use both, the - surrounding context in language 1 and the context given by language 2. - -Checkpoints refer to which method was used for pretraining by having *clm*, *mlm* or *mlm-tlm* in their names. On top -of positional embeddings, the model has language embeddings. When training using MLM/CLM, this gives the model an -indication of the language used, and when training using MLM+TLM, an indication of the language used for each part. - -The library provides a version of the model for language modeling, token classification, sentence classification and -question answering. - -### XLM-RoBERTa - -
- -Models - - -Doc - - -Spaces - -
- - -[Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116), Alexis Conneau et -al. - -Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses -masked language modeling on sentences coming from one language. However, the model is trained on many more languages -(100) and doesn't use the language embeddings, so it's capable of detecting the input language by itself. - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - -### FlauBERT - -
- -Models - - -Doc - - -Spaces - -
- - -[FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372), Hang Le et al. - -Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective). - -The library provides a version of the model for language modeling and sentence classification. - -### ELECTRA - -
- -Models - - -Doc - - -Spaces - -
- - -[ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://arxiv.org/abs/2003.10555), -Kevin Clark et al. - -ELECTRA is a transformer model pretrained with the use of another (small) masked language model. The inputs are -corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA -has to predict which token is an original and which one has been replaced. Like for GAN training, the small language -model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a -traditional GAN setting) then the ELECTRA model is trained for a few steps. - -The library provides a version of the model for masked language modeling, token classification and sentence -classification. - -### Funnel Transformer - -
- -Models - - -Doc - - -Spaces - -
- - -[Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236), Zihang Dai et al. - -Funnel Transformer is a transformer model using pooling, a bit like a ResNet model: layers are grouped in blocks, and -at the beginning of each block (except the first one), the hidden states are pooled among the sequence dimension. This -way, their length is divided by 2, which speeds up the computation of the next hidden states. All pretrained models -have three blocks, which means the final hidden state has a sequence length that is one fourth of the original sequence -length. - -For tasks such as classification, this is not a problem, but for tasks like masked language modeling or token -classification, we need a hidden state with the same sequence length as the original input. In those cases, the final -hidden states are upsampled to the input sequence length and go through two additional layers. That's why there are two -versions of each checkpoint. The version suffixed with "-base" contains only the three blocks, while the version -without that suffix contains the three blocks and the upsampling head with its additional layers. - -The pretrained models available use the same pretraining objective as ELECTRA. - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - - - -### Longformer - -
- -Models - - -Doc - - -Spaces - -
- - -[Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150), Iz Beltagy et al. - -A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g., -what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are -still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the -[local attention section](#local-attention) for more information. - -It is pretrained the same way a RoBERTa otherwise. - - - -This model could be very well be used in an autoregressive setting, there is no checkpoint for such a -pretraining yet, though. - - - -The library provides a version of the model for masked language modeling, token classification, sentence -classification, multiple choice classification and question answering. - - - -## Sequence-to-sequence models - -As mentioned before, these models keep both the encoder and the decoder of the original transformer. - - - -### BART - -
- -Models - - -Doc - - -Spaces - -
- - -[BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461), Mike Lewis et al. - -Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is -fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of -the following transformations are applied on the pretraining tasks for the encoder: - -- mask random tokens (like in BERT) -- delete random tokens -- mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token) -- permute sentences -- rotate the document to make it start at a specific token - -The library provides a version of this model for conditional generation and sequence classification. - -### Pegasus - -
- -Models - - -Doc - - -Spaces - -
- - -[PEGASUS: Pre-training with Extracted Gap-sentences forAbstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf), Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. - -Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on -two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining -objective, called Gap Sentence Generation (GSG). - -- MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in - BERT) -- GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a - causal mask to hide the future words like a regular auto-regressive transformer decoder. - -In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are -masked and are generated together as one output sequence from the remaining sentences, similar to an extractive -summary. - -The library provides a version of this model for conditional generation, which should be used for summarization. - - -### MarianMT - -
- -Models - - -Doc - - -Spaces - -
- - -[Marian: Fast Neural Machine Translation in C++](https://arxiv.org/abs/1804.00344), Marcin Junczys-Dowmunt et al. - -A framework for translation models, using the same models as BART - -The library provides a version of this model for conditional generation. - - -### T5 - -
- -Models - - -Doc - - -Spaces - -
- - -[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683), Colin Raffel et al. - -Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at each -layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific -prefixes: “summarize: ”, “question: ”, “translate English to German: ” and so forth. - -The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream -tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above). - -Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with -individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a -single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original -sentence and the target is then the dropped out tokens delimited by their sentinel tokens. - -For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and -"cute", the encoder input becomes “My very .” and the target input becomes “ dog is cute .” - -The library provides a version of this model for conditional generation. - - -### MT5 - -
- -Models - - -Doc - - -Spaces - -
- - -[mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934), Linting Xue -et al. - -The model architecture is same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's -supervised training. mT5 is trained on 101 languages. - -The library provides a version of this model for conditional generation. - - -### MBart - -
- -Models - - -Doc - - -Spaces - -
- - -[Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, -Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. - -The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended -for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete -sequence-to-sequence model by denoising full texts in multiple languages, - -The library provides a version of this model for conditional generation. - -The [mbart-large-en-ro checkpoint](https://huggingface.co/facebook/mbart-large-en-ro) can be used for english -> -romanian translation. - -The [mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) checkpoint can be finetuned for other -translation and summarization tasks, using code in ```examples/pytorch/translation/``` , but is not very useful without -finetuning. - - -### ProphetNet - -
- -Models - - -Doc - - -Spaces - -
- - -[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by -Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou. - -ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In -future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each -time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model -to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on -the original Transformer, but replaces the "standard" self-attention mechanism in the decoder by a a main -self-attention mechanism and a self and n-stream (predict) self-attention mechanism. - -The library provides a pre-trained version of this model for conditional generation and a fine-tuned version for -summarization. - -### XLM-ProphetNet - -
- -Models - - -Doc - - -Spaces - -
- - -[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://arxiv.org/abs/2001.04063) by -Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou. - -XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained -on the cross-lingual dataset [XGLUE](https://arxiv.org/abs/2004.01401). - -The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned -versions for headline generation and question generation, respectively. - - - -## Multimodal models - -There is one multimodal model in the library which has not been pretrained in the self-supervised fashion like the -others. - -### MMBT - -[Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/abs/1909.02950), Douwe Kiela -et al. - -A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer -model takes as inputs the embeddings of the tokenized text and the final activations of a pretrained on images resnet -(after the pooling layer) that goes through a linear layer (to go from number of features at the end of the resnet to -the hidden state dimension of the transformer). - -The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the -model know which part of the input vector corresponds to the text and which to the image. - -The pretrained model only works for classification. - - - - - -## Retrieval-based models - -Some models use documents retrieval during (pre)training and inference for open-domain question answering, for example. - - -### DPR - -
- -Models - - -Doc - - -Spaces - -
- - -[Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906), Vladimir Karpukhin et -al. - -Dense Passage Retrieval (DPR) - is a set of tools and models for state-of-the-art open-domain question-answering -research. - - -DPR consists in three models: - -- Question encoder: encode questions as vectors -- Context encoder: encode contexts as vectors -- Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the - inferred span actually answers the question). - -DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and -then it calls the reader with the question and the retrieved documents to get the answer. - -### RAG - -
- -Models - - -Doc - -
- -[Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401), Patrick Lewis, -Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau -Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela - -Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq -models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and -seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation -to adapt to downstream tasks. - -The two models RAG-Token and RAG-Sequence are available for generation. - -## More technical aspects - -### Full vs sparse attention - -Most transformer models use full attention in the sense that the attention matrix is square. It can be a big -computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and -use a sparse version of the attention matrix to speed up training. - - - -**LSH attention** - -[Reformer](#reformer) uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax -dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only -the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is -modified to mask the current token (except at the first position), because it will give a query and a key equal (so -very similar to each other). Since the hash can be a bit random, several hash functions are used in practice -(determined by a n_rounds parameter) and then are averaged together. - - - -**Local attention** - -[Longformer](#longformer) uses local attention: often, the local context (e.g., what are the two tokens to the -left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small -window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a -representation of the whole sentence. - -Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access -all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in -their local window). This is shown in Figure 2d of the paper, see below for a sample attention mask: - - - -Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence -length. - -### Other tricks - - - -**Axial positional encodings** - -[Reformer](#reformer) uses axial positional encodings: in traditional transformer models, the positional encoding -E is a matrix of size \\(l\\) by \\(d\\), \\(l\\) being the sequence length and \\(d\\) the dimension of the -hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate -that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with -dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and -\\(d_{1} + d_{2} = d\\) (with the product for the lengths, this ends up being way smaller). The embedding for time -step \\(j\\) in E is obtained by concatenating the embeddings for timestep \\(j \% l1\\) in E1 and \\(j // l1\\) -in E2. 
diff --git a/docs/source/en/multilingual.md b/docs/source/en/multilingual.md new file mode 100644 index 000000000000..9bf904a3b373 --- /dev/null +++ b/docs/source/en/multilingual.md @@ -0,0 +1,179 @@ + + +# Multilingual models for inference + +[[open-in-colab]] + +There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. Some models, like [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. + +## XLM + +XLM has ten different checkpoints, only one of which is monolingual. The nine remaining model checkpoints can be split into two categories: the checkpoints that use language embeddings and those that don't. + +### XLM with language embeddings + +The following XLM models use language embeddings to specify the language used at inference: + +- `xlm-mlm-ende-1024` (Masked language modeling, English-German) +- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) +- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) +- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) +- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) +- `xlm-clm-enfr-1024` (Causal language modeling, English-French) +- `xlm-clm-ende-1024` (Causal language modeling, English-German) + +Language embeddings are represented as a tensor of the same shape as the `input_ids` passed to the model. The values in these tensors depend on the language used and are identified by the tokenizer's `lang2id` and `id2lang` attributes. + +In this example, load the `xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +The `lang2id` attribute of the tokenizer displays this model's languages and their ids: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +Next, create an example input: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +Set the language id as `"en"` and use it to define the language embedding. The language embedding is a tensor filled with `0` since that is the language id for English. This tensor should be the same size as `input_ids`. + +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +Now you can pass the `input_ids` and language embedding to the model: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +The [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) script can generate text with language embeddings using the `xlm-clm` checkpoints. 
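+
+As a hedged follow-up to the model call above (this step is not part of the original guide and assumes a recent version of 🤗 Transformers in which the model returns an output object with `logits`), the language-modeling logits can be used to greedily pick the most likely next token:
+
+```py
+>>> # A minimal sketch: take the logits at the last position and decode the argmax token.
+>>> next_token_logits = outputs.logits[0, -1, :]
+>>> next_token_id = int(torch.argmax(next_token_logits))
+>>> next_token = tokenizer.decode([next_token_id])
+```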
+ +### XLM without language embeddings + +The following XLM models do not require language embeddings during inference: + +- `xlm-mlm-17-1280` (Masked language modeling, 17 languages) +- `xlm-mlm-100-1280` (Masked language modeling, 100 languages) + +These models are used for generic sentence representations, unlike the previous XLM checkpoints. + +## BERT + +The following BERT models can be used for multilingual tasks: + +- `bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) +- `bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) + +These models do not require language embeddings during inference. They should identify the language from the +context and infer accordingly. + +## XLM-RoBERTa + +The following XLM-RoBERTa models can be used for multilingual tasks: + +- `xlm-roberta-base` (Masked language modeling, 100 languages) +- `xlm-roberta-large` (Masked language modeling, 100 languages) + +XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. + +## M2M100 + +The following M2M100 models can be used for multilingual translation: + +- `facebook/m2m100_418M` (Translation) +- `facebook/m2m100_1.2B` (Translation) + +In this example, load the `facebook/m2m100_418M` checkpoint to translate from Chinese to English. You can set the source language in the tokenizer: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +Tokenize the text: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +The following MBart models can be used for multilingual translation: + +- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) +- `facebook/mbart-large-50` (Multilingual translation, 50 languages) +- `facebook/mbart-large-cc25` + +In this example, load the `facebook/mbart-large-50-many-to-many-mmt` checkpoint to translate Finnish to English. You can set the source language in the tokenizer: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." 
+ +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +Tokenize the text: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +MBart forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +If you are using the `facebook/mbart-large-50-many-to-one-mmt` checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. diff --git a/docs/source/en/multilingual.mdx b/docs/source/en/multilingual.mdx deleted file mode 100644 index 7c95de6ffc09..000000000000 --- a/docs/source/en/multilingual.mdx +++ /dev/null @@ -1,175 +0,0 @@ - - -# Multilingual models for inference - -[[open-in-colab]] - -There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. Some models, like [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. - -## XLM - -XLM has ten different checkpoints, only one of which is monolingual. The nine remaining model checkpoints can be split into two categories: the checkpoints that use language embeddings and those that don't. - -### XLM with language embeddings - -The following XLM models use language embeddings to specify the language used at inference: - -- `xlm-mlm-ende-1024` (Masked language modeling, English-German) -- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) -- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) -- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) -- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) -- `xlm-clm-enfr-1024` (Causal language modeling, English-French) -- `xlm-clm-ende-1024` (Causal language modeling, English-German) - -Language embeddings are represented as a tensor of the same shape as the `input_ids` passed to the model. The values in these tensors depend on the language used and are identified by the tokenizer's `lang2id` and `id2lang` attributes. - -In this example, load the `xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") -``` - -The `lang2id` attribute of the tokenizer displays this model's languages and their ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -Next, create an example input: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Set the language id as `"en"` and use it to define the language embedding. The language embedding is a tensor filled with `0` since that is the language id for English. 
This tensor should be the same size as `input_ids`. - -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Now you can pass the `input_ids` and language embedding to the model: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -The [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) script can generate text with language embeddings using the `xlm-clm` checkpoints. - -### XLM without language embeddings - -The following XLM models do not require language embeddings during inference: - -- `xlm-mlm-17-1280` (Masked language modeling, 17 languages) -- `xlm-mlm-100-1280` (Masked language modeling, 100 languages) - -These models are used for generic sentence representations, unlike the previous XLM checkpoints. - -## BERT - -The following BERT models can be used for multilingual tasks: - -- `bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) -- `bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) - -These models do not require language embeddings during inference. They should identify the language from the -context and infer accordingly. - -## XLM-RoBERTa - -The following XLM-RoBERTa models can be used for multilingual tasks: - -- `xlm-roberta-base` (Masked language modeling, 100 languages) -- `xlm-roberta-large` (Masked language modeling, 100 languages) - -XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. - -## M2M100 - -The following M2M100 models can be used for multilingual translation: - -- `facebook/m2m100_418M` (Translation) -- `facebook/m2m100_1.2B` (Translation) - -In this example, load the `facebook/m2m100_418M` checkpoint to translate from Chinese to English. You can set the source language in the tokenizer: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Tokenize the text: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' 
-``` - -## MBart - -The following MBart models can be used for multilingual translation: - -- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) -- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) -- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) -- `facebook/mbart-large-50` (Multilingual translation, 50 languages) -- `facebook/mbart-large-cc25` - -In this example, load the `facebook/mbart-large-50-many-to-many-mmt` checkpoint to translate Finnish to English. You can set the source language in the tokenizer: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." - ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Tokenize the text: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: - -```py ->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." -``` - -If you are using the `facebook/mbart-large-50-many-to-one-mmt` checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. \ No newline at end of file diff --git a/docs/source/en/pad_truncation.md b/docs/source/en/pad_truncation.md new file mode 100644 index 000000000000..8094dc1bc2aa --- /dev/null +++ b/docs/source/en/pad_truncation.md @@ -0,0 +1,71 @@ + + +# Padding and truncation + +Batched inputs are often different lengths, so they can't be converted to fixed-size tensors. Padding and truncation are strategies for dealing with this problem, to create rectangular tensors from batches of varying lengths. Padding adds a special **padding token** to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. Truncation works in the other direction by truncating long sequences. + +In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. However, the API supports more strategies if you need them. The three arguments you need to are: `padding`, `truncation` and `max_length`. + +The `padding` argument controls padding. It can be a boolean or a string: + + - `True` or `'longest'`: pad to the longest sequence in the batch (no padding is applied if you only provide + a single sequence). + - `'max_length'`: pad to a length specified by the `max_length` argument or the maximum length accepted + by the model if no `max_length` is provided (`max_length=None`). Padding will still be applied if you only provide a single sequence. + - `False` or `'do_not_pad'`: no padding is applied. This is the default behavior. + +The `truncation` argument controls truncation. 
It can be a boolean or a string: + + - `True` or `'longest_first'`: truncate to a maximum length specified by the `max_length` argument or + the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will + truncate token by token, removing a token from the longest sequence in the pair until the proper length is + reached. + - `'only_second'`: truncate to a maximum length specified by the `max_length` argument or the maximum + length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate + the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. + - `'only_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum + length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate + the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. + - `False` or `'do_not_truncate'`: no truncation is applied. This is the default behavior. + +The `max_length` argument controls the length of the padding and truncation. It can be an integer or `None`, in which case it will default to the maximum length the model can accept. If the model has no specific maximum input length, truncation or padding to `max_length` is deactivated. + +The following table summarizes the recommended way to setup padding and truncation. If you use pairs of input sequences in any of the following examples, you can replace `truncation=True` by a `STRATEGY` selected in +`['only_first', 'only_second', 'longest_first']`, i.e. `truncation='only_second'` or `truncation='longest_first'` to control how both sequences in the pair are truncated as detailed before. 
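As a quick illustration of how these three arguments combine in practice, here is a minimal sketch before the summary table below; the checkpoint name and sentences are placeholders rather than part of the guide:

```py
from transformers import AutoTokenizer

# Illustrative checkpoint and batch; any tokenizer behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch_sentences = [
    "A short sentence.",
    "A considerably longer sentence that will be padded or truncated as needed.",
]

# Pad to the longest sequence in the batch and truncate anything beyond 32 tokens.
encoded = tokenizer(batch_sentences, padding=True, truncation=True, max_length=32, return_tensors="pt")
print(encoded["input_ids"].shape)  # rectangular tensor of shape (batch_size, padded_length)
```

Swapping `truncation=True` for one of the strategies above only changes which sequence of a pair is truncated; the padding behavior stays the same.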
+ +| Truncation | Padding | Instruction | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| no truncation | no padding | `tokenizer(batch_sentences)` | +| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True)` or | +| | | `tokenizer(batch_sentences, padding='longest')` | +| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length')` | +| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | +| | padding to a multiple of a value | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8) | +| truncation to max model input length | no padding | `tokenizer(batch_sentences, truncation=True)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | +| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | +| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | +| | padding to specific length | Not possible | +| truncation to specific length | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | +| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | +| | padding to max model input length | Not possible | +| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | diff --git a/docs/source/en/pad_truncation.mdx b/docs/source/en/pad_truncation.mdx deleted file mode 100644 index f848e23bed50..000000000000 --- a/docs/source/en/pad_truncation.mdx +++ /dev/null @@ -1,66 +0,0 @@ - - -# Padding and truncation - -Batched inputs are often different lengths, so they can't be converted to fixed-size tensors. Padding and truncation are strategies for dealing with this problem, to create rectangular tensors from batches of varying lengths. Padding adds a special **padding token** to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. Truncation works in the other direction by truncating long sequences. - -In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. However, the API supports more strategies if you need them. The three arguments you need to are: `padding`, `truncation` and `max_length`. - -The `padding` argument controls padding. It can be a boolean or a string: - - - `True` or `'longest'`: pad to the longest sequence in the batch (no padding is applied if you only provide - a single sequence). - - `'max_length'`: pad to a length specified by the `max_length` argument or the maximum length accepted - by the model if no `max_length` is provided (`max_length=None`). Padding will still be applied if you only provide a single sequence. - - `False` or `'do_not_pad'`: no padding is applied. This is the default behavior. 
- -The `truncation` argument controls truncation. It can be a boolean or a string: - - - `True` or `'longest_first'`: truncate to a maximum length specified by the `max_length` argument or - the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will - truncate token by token, removing a token from the longest sequence in the pair until the proper length is - reached. - - `'only_second'`: truncate to a maximum length specified by the `max_length` argument or the maximum - length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate - the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. - - `'only_first'`: truncate to a maximum length specified by the `max_length` argument or the maximum - length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate - the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. - - `False` or `'do_not_truncate'`: no truncation is applied. This is the default behavior. - -The `max_length` argument controls the length of the padding and truncation. It can be an integer or `None`, in which case it will default to the maximum length the model can accept. If the model has no specific maximum input length, truncation or padding to `max_length` is deactivated. - -The following table summarizes the recommended way to setup padding and truncation. If you use pairs of input sequences in any of the following examples, you can replace `truncation=True` by a `STRATEGY` selected in -`['only_first', 'only_second', 'longest_first']`, i.e. `truncation='only_second'` or `truncation='longest_first'` to control how both sequences in the pair are truncated as detailed before. 
- -| Truncation | Padding | Instruction | -|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| -| no truncation | no padding | `tokenizer(batch_sentences)` | -| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True)` or | -| | | `tokenizer(batch_sentences, padding='longest')` | -| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length')` | -| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | -| truncation to max model input length | no padding | `tokenizer(batch_sentences, truncation=True)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | -| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | -| | padding to max model input length | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | -| | padding to specific length | Not possible | -| truncation to specific length | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | -| | padding to max sequence in batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | -| | padding to max model input length | Not possible | -| | padding to specific length | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | diff --git a/docs/source/en/peft.md b/docs/source/en/peft.md new file mode 100644 index 000000000000..302b614e5f7b --- /dev/null +++ b/docs/source/en/peft.md @@ -0,0 +1,216 @@ + + +# Load adapters with 🤗 PEFT + +[[open-in-colab]] + +[Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. The adapters are trained to learn task-specific information. This approach has been shown to be very memory-efficient with lower compute usage while producing results comparable to a fully fine-tuned model. + +Adapters trained with PEFT are also usually an order of magnitude smaller than the full model, making it convenient to share, store, and load them. + +
+ +
The adapter weights for an OPTForCausalLM model stored on the Hub are only ~6MB compared to the full size of the model weights, which can be ~700MB.
+
+ +If you're interested in learning more about the 🤗 PEFT library, check out the [documentation](https://huggingface.co/docs/peft/index). + +## Setup + +Get started by installing 🤗 PEFT: + +```bash +pip install peft +``` + +If you want to try out the brand new features, you might be interested in installing the library from source: + +```bash +pip install git+https://github.com/huggingface/peft.git +``` + +## Supported PEFT models + +🤗 Transformers natively supports some PEFT methods, meaning you can load adapter weights stored locally or on the Hub and easily run or train them with a few lines of code. The following methods are supported: + +- [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora) +- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3) +- [AdaLoRA](https://arxiv.org/abs/2303.10512) + +If you want to use other PEFT methods, such as prompt learning or prompt tuning, or about the 🤗 PEFT library in general, please refer to the [documentation](https://huggingface.co/docs/peft/index). + + +## Load a PEFT adapter + +To load and use a PEFT adapter model from 🤗 Transformers, make sure the Hub repository or local directory contains an `adapter_config.json` file and the adapter weights, as shown in the example image above. Then you can load the PEFT adapter model using the `AutoModelFor` class. For example, to load a PEFT adapter model for causal language modeling: + +1. specify the PEFT model id +2. pass it to the [`AutoModelForCausalLM`] class + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id) +``` + + + +You can load a PEFT adapter with either an `AutoModelFor` class or the base model class like `OPTForCausalLM` or `LlamaForCausalLM`. + + + +You can also load a PEFT adapter by calling the `load_adapter` method: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "facebook/opt-350m" +peft_model_id = "ybelkada/opt-350m-lora" + +model = AutoModelForCausalLM.from_pretrained(model_id) +model.load_adapter(peft_model_id) +``` + +## Load in 8bit or 4bit + +The `bitsandbytes` integration supports 8bit and 4bit precision data types, which are useful for loading large models because it saves memory (see the `bitsandbytes` integration [guide](./quantization#bitsandbytes-integration) to learn more). Add the `load_in_8bit` or `load_in_4bit` parameters to [`~PreTrainedModel.from_pretrained`] and set `device_map="auto"` to effectively distribute the model to your hardware: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +peft_model_id = "ybelkada/opt-350m-lora" +model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True) +``` + +## Add a new adapter + +You can use [`~peft.PeftModel.add_adapter`] to add a new adapter to a model with an existing adapter as long as the new adapter is the same type as the current one. 
For example, if you have an existing LoRA adapter attached to a model: + +```py +from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer +from peft import LoraConfig + +model_id = "facebook/opt-350m" +model = AutoModelForCausalLM.from_pretrained(model_id) + +lora_config = LoraConfig( + target_modules=["q_proj", "k_proj"], + init_lora_weights=False +) + +model.add_adapter(lora_config, adapter_name="adapter_1") +``` + +To add a new adapter: + +```py +# attach new adapter with same config +model.add_adapter(lora_config, adapter_name="adapter_2") +``` + +Now you can use [`~peft.PeftModel.set_adapter`] to set which adapter to use: + +```py +# use adapter_1 +model.set_adapter("adapter_1") +output = model.generate(**inputs) +print(tokenizer.decode(output[0], skip_special_tokens=True)) + +# use adapter_2 +model.set_adapter("adapter_2") +output = model.generate(**inputs) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Enable and disable adapters + +Once you've added an adapter to a model, you can enable or disable the adapter module. To enable the adapter module: + +```py +from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer +from peft import PeftConfig + +model_id = "facebook/opt-350m" +adapter_model_id = "ybelkada/opt-350m-lora" +tokenizer = AutoTokenizer.from_pretrained(model_id) +text = "Hello" +inputs = tokenizer(text, return_tensors="pt") + +model = AutoModelForCausalLM.from_pretrained(model_id) +peft_config = PeftConfig.from_pretrained(adapter_model_id) + +# to initialize with random weights +peft_config.init_lora_weights = False + +model.add_adapter(peft_config) +model.enable_adapters() +output = model.generate(**inputs) +``` + +To disable the adapter module: + +```py +model.disable_adapters() +output = model.generate(**inputs) +``` + +## Train a PEFT adapter + +PEFT adapters are supported by the [`Trainer`] class so that you can train an adapter for your specific use case. It only requires adding a few more lines of code. For example, to train a LoRA adapter: + + + +If you aren't familiar with fine-tuning a model with [`Trainer`], take a look at the [Fine-tune a pretrained model](training) tutorial. + + + +1. Define your adapter configuration with the task type and hyperparameters (see [`~peft.LoraConfig`] for more details about what the hyperparameters do). + +```py +from peft import LoraConfig + +peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + task_type="CAUSAL_LM", +) +``` + +2. Add the adapter to the model. + +```py +model.add_adapter(peft_config) +``` + +3. Now you can pass the model to [`Trainer`]! + +```py +trainer = Trainer(model=model, ...) +trainer.train() +``` + +To save your trained adapter and load it back: + +```py +model.save_pretrained(save_dir) +model = AutoModelForCausalLM.from_pretrained(save_dir) +``` + + diff --git a/docs/source/en/perf_hardware.md new file mode 100644 index 000000000000..a28824346e4b --- /dev/null +++ b/docs/source/en/perf_hardware.md @@ -0,0 +1,155 @@ + + + +# Custom hardware for training + +The hardware you use to run model training and inference can have a big effect on performance. For a deep dive into GPUs make sure to check out Tim Dettmer's excellent [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/). + +Let's have a look at some practical advice for GPU setups.
+ +## GPU +When you train bigger models you have essentially three options: + +- bigger GPUs +- more GPUs +- more CPU and NVMe (offloaded to by [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) + +Let's start at the case where you have a single GPU. + +### Power and Cooling + +If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling. + +**Power**: + +Some high end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. Do not use the 2 splits at one end of the same cable (also known as pigtail cable). That is if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end! You won't get the full performance out of your card otherwise. + +Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power. + +Some other cards may use a PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power. + +Low end cards may use 6-Pin connectors, which supply up to 75W of power. + +Additionally you want the high-end PSU that has stable voltage. Some lower quality ones may not give the card the stable voltage it needs to function at its peak. + +And of course the PSU needs to have enough unused Watts to power the card. + +**Cooling**: + +When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot. + +It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. The throttling down is likely to start at around 84-90C. But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. + +Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. + +### Multi-GPU Connectivity + +If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. If the GPUs are on the same physical node, you can run: + +``` +nvidia-smi topo -m +``` + +and it will tell you how the GPUs are inter-connected. On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: + +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X NV2 0-23 N/A +GPU1 NV2 X 0-23 N/A +``` + +on a different machine w/o NVLink we may see: +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A +``` + +The report includes this legend: + +``` + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +``` + +So the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and the second report `PHB` we have a typical consumer-level PCIe+Bridge setup. + +Check what type of connectivity you have on your setup. 
Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB). + +Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. + +#### NVlink + +[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia. + +Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): + +> Third-Generation NVLink® +> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, +> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four +> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth +> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. +> (Note that 3-Way and 4-Way SLI configurations are not supported.) + +So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. + +Let's compare the execution of a gpt2 language model training over a small sample of wikitext. + +The results are: + + +| NVlink | Time | +| ----- | ---: | +| Y | 101s | +| N | 131s | + + +You can see that NVLink completes the training ~23% faster. In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink. + +Here is the full benchmark code and outputs: + +```bash +# DDP w/ NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +# DDP w/o NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} +``` + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) +Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` diff --git a/docs/source/en/perf_hardware.mdx b/docs/source/en/perf_hardware.mdx deleted file mode 100644 index b28df49892b1..000000000000 --- a/docs/source/en/perf_hardware.mdx +++ /dev/null @@ -1,150 +0,0 @@ - - - -# Custom hardware for training - -The hardware you use to run model training and inference can have a big effect on performance. For a deep dive into GPUs make sure to check out Tim Dettmer's excellent [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/). - -Let's have a look at some practical advice for GPU setups. 
- -## GPU -When you train bigger models you have essentially three options: -- bigger GPUs -- more GPUs -- more CPU and NVMe (offloaded to by [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) - -Let's start at the case where you have a single GPU. - -### Power and Cooling - -If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling. - -**Power**: - -Some high end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. Do not use the 2 splits at one end of the same cable (also known as pigtail cable). That is if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end! You won't get the full performance out of your card otherwise. - -Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power. - -Some other cards may use a PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power. - -Low end cards may use 6-Pin connectors, which supply up to 75W of power. - -Additionally you want the high-end PSU that has stable voltage. Some lower quality ones may not give the card the stable voltage it needs to function at its peak. - -And of course the PSU needs to have enough unused Watts to power the card. - -**Cooling**: - -When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot. - -It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. The throttling down is likely to start at around 84-90C. But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. - -Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. - -### Multi-GPU Connectivity - -If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. If the GPUs are on the same physical node, you can run: - -``` -nvidia-smi topo -m -``` - -and it will tell you how the GPUs are inter-connected. On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: - -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X NV2 0-23 N/A -GPU1 NV2 X 0-23 N/A -``` - -on a different machine w/o NVLink we may see: -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X PHB 0-11 N/A -GPU1 PHB X 0-11 N/A -``` - -The report includes this legend: - -``` - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) - PIX = Connection traversing at most a single PCIe bridge - NV# = Connection traversing a bonded set of # NVLinks -``` - -So the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and the second report `PHB` we have a typical consumer-level PCIe+Bridge setup. - -Check what type of connectivity you have on your setup. 
Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB). - -Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. - -#### NVlink - -[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia. - -Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): - -> Third-Generation NVLink® -> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, -> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four -> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth -> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. -> (Note that 3-Way and 4-Way SLI configurations are not supported.) - -So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. - -Let's compare the execution of a gpt2 language model training over a small sample of wikitext. - -The results are: - - -| NVlink | Time | -| ----- | ---: | -| Y | 101s | -| N | 131s | - - -You can see that NVLink completes the training ~23% faster. In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink. - -Here is the full benchmark code and outputs: - -```bash -# DDP w/ NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md new file mode 100644 index 000000000000..a7a524ae1ef0 --- /dev/null +++ b/docs/source/en/perf_infer_cpu.md @@ -0,0 +1,75 @@ + + +# Efficient Inference on CPU + +This guide focuses on inferencing large models efficiently on CPU. + +## `BetterTransformer` for faster inference + +We have recently integrated `BetterTransformer` for faster inference on CPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. 
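As a minimal sketch of that integration (assuming the `optimum` package is installed; the sentiment-analysis checkpoint below is only illustrative), converting a model for faster CPU inference looks roughly like this:

```py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Illustrative checkpoint; the conversion below requires `pip install optimum`.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Swap supported modules for their BetterTransformer (fastpath) equivalents.
model = model.to_bettertransformer()

inputs = tokenizer("BetterTransformer speeds up inference on CPU.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits)
```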
+ +## PyTorch JIT-mode (TorchScript) +TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency. +Comparing to default eager mode, jit mode in PyTorch normally yields better performance for model inference from optimization methodologies like operator fusion. + +For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules). + +### IPEX Graph Optimization with JIT-mode +Intel® Extension for PyTorch provides further optimizations in jit mode for Transformers series models. It is highly recommended for users to take advantage of Intel® Extension for PyTorch with jit mode. Some frequently used operator patterns from Transformers models are already supported in Intel® Extension for PyTorch with jit mode fusions. Those fusion patterns like Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. are enabled and perform well. The benefit of the fusion is delivered to users in a transparent fashion. According to the analysis, ~70% of most popular NLP tasks in question-answering, text-classification, and token-classification can get performance benefits with these fusion patterns for both Float32 precision and BFloat16 Mixed precision. + +Check more detailed information for [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html). + +#### IPEX installation: + +IPEX release is following PyTorch, check the approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/). + +### Usage of JIT-mode +To enable JIT-mode in Trainer for evaluaion or prediction, users should add `jit_mode_eval` in Trainer command arguments. + + + +for PyTorch >= 1.14.0. JIT-mode could benefit any models for prediction and evaluaion since dict input is supported in jit.trace + +for PyTorch < 1.14.0. JIT-mode could benefit models whose forward parameter order matches the tuple input order in jit.trace, like question-answering model +In the case where the forward parameter order does not match the tuple input order in jit.trace, like text-classification models, jit.trace will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users. + + + +Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + + +- Inference using jit mode on CPU: +
python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--jit_mode_eval 
+ +- Inference with IPEX using jit mode on CPU: +
python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--use_ipex \
+--jit_mode_eval
diff --git a/docs/source/en/perf_infer_cpu.mdx b/docs/source/en/perf_infer_cpu.mdx deleted file mode 100644 index a3df21e93a57..000000000000 --- a/docs/source/en/perf_infer_cpu.mdx +++ /dev/null @@ -1,71 +0,0 @@ - - -# Efficient Inference on CPU - -This guide focuses on inferencing large models efficiently on CPU. - -## `BetterTransformer` for faster inference - -We have recently integrated `BetterTransformer` for faster inference on CPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. - -## PyTorch JIT-mode (TorchScript) -TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency. -Comparing to default eager mode, jit mode in PyTorch normally yields better performance for model inference from optimization methodologies like operator fusion. - -For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules). - -### IPEX Graph Optimization with JIT-mode -Intel® Extension for PyTorch provides further optimizations in jit mode for Transformers series models. It is highly recommended for users to take advantage of Intel® Extension for PyTorch with jit mode. Some frequently used operator patterns from Transformers models are already supported in Intel® Extension for PyTorch with jit mode fusions. Those fusion patterns like Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. are enabled and perform well. The benefit of the fusion is delivered to users in a transparent fashion. According to the analysis, ~70% of most popular NLP tasks in question-answering, text-classification, and token-classification can get performance benefits with these fusion patterns for both Float32 precision and BFloat16 Mixed precision. - -Check more detailed information for [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html). - -#### IPEX installation: - -IPEX release is following PyTorch, check the approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/). - -### Usage of JIT-mode -To enable JIT-mode in Trainer for evaluaion or prediction, users should add `jit_mode_eval` in Trainer command arguments. - - - -for PyTorch >= 1.14.0. JIT-mode could benefit any models for prediction and evaluaion since dict input is supported in jit.trace - -for PyTorch < 1.14.0. JIT-mode could benefit models whose forward parameter order matches the tuple input order in jit.trace, like question-answering model -In the case where the forward parameter order does not match the tuple input order in jit.trace, like text-classification models, jit.trace will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users. - - - -Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - - -- Inference using jit mode on CPU: -
python run_qa.py \
---model_name_or_path csarron/bert-base-uncased-squad-v1 \
---dataset_name squad \
---do_eval \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/ \
---no_cuda \
---jit_mode_eval 
- -- Inference with IPEX using jit mode on CPU: -
python run_qa.py \
---model_name_or_path csarron/bert-base-uncased-squad-v1 \
---dataset_name squad \
---do_eval \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/ \
---no_cuda \
---use_ipex \
---jit_mode_eval
diff --git a/docs/source/en/perf_infer_gpu_many.md b/docs/source/en/perf_infer_gpu_many.md new file mode 100644 index 000000000000..2118b5ddb404 --- /dev/null +++ b/docs/source/en/perf_infer_gpu_many.md @@ -0,0 +1,124 @@ + + +# Efficient Inference on a Multiple GPUs + +This document contains information on how to efficiently infer on a multiple GPUs. + + +Note: A multi GPU setup can use the majority of the strategies described in the [single GPU section](./perf_infer_gpu_one). You must be aware of simple techniques, though, that can be used for a better usage. + + + +## Flash Attention 2 + +Flash Attention 2 integration also works in a multi-GPU setup, check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2) + +## BetterTransformer + +[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood. + +BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models. + + + +Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer. + + + +### Decoder models + +For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards. + +To convert a model to BetterTransformer: + +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +# convert the model to BetterTransformer +model.to_bettertransformer() + +# Use it for training or inference +``` + +SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: + + +```diff +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to("cuda") +# convert the model to BetterTransformer +model.to_bettertransformer() + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + ++ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +If you see a bug with a traceback saying + +```bash +RuntimeError: No available kernel. Aborting execution. +``` + +try using the PyTorch nightly version, which may have a broader coverage for Flash Attention: + +```bash +pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +``` + +Have a look at this [blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about what is possible with the BetterTransformer + SDPA API. 
+ +### Encoder models + +For encoder models during inference, BetterTransformer dispatches the forward call of encoder layers to an equivalent of [`torch.nn.TransformerEncoderLayer`](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html) that will execute the fastpath implementation of the encoder layers. + +Because `torch.nn.TransformerEncoderLayer` fastpath does not support training, it is dispatched to `torch.nn.functional.scaled_dot_product_attention` instead, which does not leverage nested tensors but can use Flash Attention or Memory-Efficient Attention fused kernels. + +More details about BetterTransformer performance can be found in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), and you can learn more about BetterTransformer for encoder models in this [blog](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/). + + +## Advanced usage: mixing FP4 (or Int8) and BetterTransformer + +You can combine the different methods described above to get the best performance for your model. For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention: + +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` \ No newline at end of file diff --git a/docs/source/en/perf_infer_gpu_many.mdx b/docs/source/en/perf_infer_gpu_many.mdx deleted file mode 100644 index d8a24d6ab8ae..000000000000 --- a/docs/source/en/perf_infer_gpu_many.mdx +++ /dev/null @@ -1,23 +0,0 @@ - - -# Efficient Inference on a Multiple GPUs - -This document contains information on how to efficiently infer on a multiple GPUs. - - -Note: A multi GPU setup can use the majority of the strategies described in the [single GPU section](./perf_infer_gpu_one). You must be aware of simple techniques, though, that can be used for a better usage. - - - -## `BetterTransformer` for faster inference - -We have recently integrated `BetterTransformer` for faster inference on multi-GPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md new file mode 100644 index 000000000000..d24299012e9f --- /dev/null +++ b/docs/source/en/perf_infer_gpu_one.md @@ -0,0 +1,422 @@ + + +# Efficient Inference on a Single GPU + +In addition to this guide, relevant information can be found as well in [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu). + +## Flash Attention 2 + + + +Note that this feature is experimental and might considerably change in future versions. For instance, the Flash Attention 2 API might migrate to `BetterTransformer` API in the near future. 
+ + + +Flash Attention 2 can considerably speed up transformer-based models' training and inference speed. Flash Attention 2 has been introduced in the [official Flash Attention repository](https://github.com/Dao-AILab/flash-attention) by Tri Dao et al. The scientific paper on Flash Attention can be found [here](https://arxiv.org/abs/2205.14135). + +Make sure to follow the installation guide on the repository mentioned above to properly install Flash Attention 2. Once that package is installed, you can benefit from this feature. + +We natively support Flash Attention 2 for the following models: + +- Llama +- Mistral +- Falcon + +You can request to add Flash Attention 2 support for more models by opening an issue on GitHub, and even open a Pull Request to integrate the changes. The supported models can be used for inference and training, including training with padding tokens - *which is currently not supported for `BetterTransformer` API below.* + + + +Flash Attention 2 can only be used when the models' dtype is `fp16` or `bf16` and runs only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature. + + + +### Quick usage + +To enable Flash Attention 2 in your model, add `use_flash_attention_2` in the `from_pretrained` arguments: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + use_flash_attention_2=True, +) +``` + +And use it for generation or fine-tuning. + +### Expected speedups + +You can benefit from considerable speedups for fine-tuning and inference, especially for long sequences. However, since Flash Attention does not support computing attention scores with padding tokens under the hood, we must manually pad / unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens. + +To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length. An example is provided [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516). + +Below is the expected speedup you can get for a simple forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes, without padding tokens: + +
+ +
+ +Below is the expected speedup you can get for a simple forward pass on [`meta-llama/Llama-7b-hf`](https://hf.co/meta-llama/Llama-7b-hf) with a sequence length of 4096 and various batch sizes, without padding tokens: + +
+ +
+ +For sequences with padding tokens (training with padding tokens or generating with padding tokens), we need to unpad / pad the input sequences to compute the attention scores correctly. For relatively short sequence lengths and a pure forward pass, this creates an overhead that results in only a small speedup (here, below 30% of the input is filled with padding tokens). + +
+ +
+ +For large sequence lengths, however, you can benefit from an interesting speedup for pure inference (and training as well). + +Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues. It can lead to a memory reduction of up to 20x for large sequence lengths. Check out [the official flash attention repository](https://github.com/Dao-AILab/flash-attention) for more details. + +
+ +
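Before moving on to the advanced combinations below, here is a minimal end-to-end generation sketch that reuses the quick-usage setup from earlier in this section; the checkpoint, prompt, and generation length are illustrative, and a CUDA GPU with the `flash-attn` package (plus `accelerate` for `device_map="auto"`) is assumed:

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Same checkpoint as in the quick-usage example above; requires a supported NVIDIA GPU.
model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    use_flash_attention_2=True,
    device_map="auto",
)

# A single sequence, so no padding tokens are involved.
inputs = tokenizer("Flash Attention 2 speeds up inference because", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```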
+ + +### Advanced usage + +You can combine this feature with many exisiting feature for model optimization. Check out few examples below: + +### Combining Flash Attention 2 and 8-bit models + +You can combine this feature together with 8-bit quantization: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_8bit=True, + use_flash_attention_2=True, +) +``` + +### Combining Flash Attention 2 and 4-bit models + +You can combine this feature together with 4-bit quantization: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + use_flash_attention_2=True, +) +``` + +### Combining Flash Attention 2 and PEFT + +You can combine this feature together with PEFT for training adapters using Flash Attention 2 under the hood: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +from peft import LoraConfig + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + use_flash_attention_2=True, +) + +lora_config = LoraConfig( + r=8, + task_type="CAUSAL_LM" +) + +model.add_adapter(lora_config) + +... # train your model +``` + +## BetterTransformer + +[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood. + +BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models. + + + +Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer. + + + +### Encoder models + +PyTorch-native [`nn.MultiHeadAttention`](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) attention fastpath, called BetterTransformer, can be used with Transformers through the integration in the [🤗 Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). + +PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2). 
+ +After installing the [`optimum`](https://github.com/huggingface/optimum) package, to use Better Transformer during inference, the relevant internal modules are replaced by calling [`~PreTrainedModel.to_bettertransformer`]: + +```python +model = model.to_bettertransformer() +``` + +The method [`~PreTrainedModel.reverse_bettertransformer`] allows to go back to the original modeling, which should be used before saving the model in order to use the canonical transformers modeling: + +```python +model = model.reverse_bettertransformer() +model.save_pretrained("saved_model") +``` + +Have a look at this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) to learn more about what is possible to do with `BetterTransformer` API for encoder models. + +### Decoder models + +For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards. + +To convert a model to BetterTransformer: + +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +# convert the model to BetterTransformer +model.to_bettertransformer() + +# Use it for training or inference +``` + +SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: + + +```diff +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") +# convert the model to BetterTransformer +model.to_bettertransformer() + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + ++ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +If you see a bug with a traceback saying + +```bash +RuntimeError: No available kernel. Aborting execution. +``` + +try using the PyTorch nightly version, which may have a broader coverage for Flash Attention: + +```bash +pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +``` + +Or make sure your model is correctly casted in float16 or bfloat16 + + +Have a look at [this detailed blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to read more about what is possible to do with `BetterTransformer` + SDPA API. + +## `bitsandbytes` integration for FP4 mixed-precision inference + +You can install `bitsandbytes` and benefit from easy model compression on GPUs. Using FP4 quantization you can expect to reduce up to 8x the model size compared to its native full precision version. Check out below how to get started. + + + +Note that this feature can also be used in a multi GPU setup. 
+ + + +### Requirements [[requirements-for-fp4-mixedprecision-inference]] + +- Latest `bitsandbytes` library +`pip install bitsandbytes>=0.39.0` + +- Install latest `accelerate` from source +`pip install git+https://github.com/huggingface/accelerate.git` + +- Install latest `transformers` from source +`pip install git+https://github.com/huggingface/transformers.git` + +### Running FP4 models - single GPU setup - Quickstart + +You can quickly run a FP4 model on a single GPU by running the following code: + +```py +from transformers import AutoModelForCausalLM + +model_name = "bigscience/bloom-2b5" +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +``` +Note that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources. + +### Running FP4 models - multi GPU setup + +The way to load your mixed 4-bit model in multiple GPUs is as follows (same command as single GPU setup): +```py +model_name = "bigscience/bloom-2b5" +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +``` +But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows: + +```py +max_memory_mapping = {0: "600MB", 1: "1GB"} +model_name = "bigscience/bloom-3b" +model_4bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping +) +``` +In this example, the first GPU will use 600MB of memory and the second 1GB. + +### Advanced usage + +For more advanced usage of this method, please have a look at the [quantization](main_classes/quantization) documentation page. + +## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition + + + +Note that this feature can also be used in a multi GPU setup. + + + +From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support Hugging Face integration for all models in the Hub with a few lines of code. +The method reduces `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact to the quality by operating on the outliers in half-precision. + +![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) + +Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models. +For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration). + +![MixedInt8.gif](https://cdn-uploads.huggingface.co/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif) + +Note, that you would require a GPU to run mixed-8bit models as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store the quarter (or half if your model weights are in half precision) of the model before using this feature. +Below are some notes to help you use this module, or follow the demos on [Google colab](#colab-demos). 
+ +### Requirements [[requirements-for-int8-mixedprecision-matrix-decomposition]] + +- If you have `bitsandbytes<0.37.0`, make sure you run on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s, RTX30s, A40-A100). For `bitsandbytes>=0.37.0`, all GPUs should be supported. +- Install the correct version of `bitsandbytes` by running: +`pip install bitsandbytes>=0.31.5` +- Install `accelerate` +`pip install accelerate>=0.12.0` + +### Running mixed-Int8 models - single GPU setup + +After installing the required libraries, the way to load your mixed 8-bit model is as follows: + +```py +from transformers import AutoModelForCausalLM + +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` + +For text generation, we recommend: + +* using the model's `generate()` method instead of the `pipeline()` function. Although inference is possible with the `pipeline()` function, it is not optimized for mixed-8bit models and will be slower than using the `generate()` method. Moreover, some sampling strategies, like nucleus sampling, are not supported by the `pipeline()` function for mixed-8bit models. +* placing all inputs on the same device as the model. + +Here is a simple example: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "bigscience/bloom-2b5" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +prompt = "Hello, my llama is cute" +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") +generated_ids = model_8bit.generate(**inputs) +outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +``` + + +### Running mixed-int8 models - multi GPU setup + +The way to load your mixed 8-bit model on multiple GPUs is as follows (same command as the single GPU setup): +```py +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` +But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows: + +```py +max_memory_mapping = {0: "1GB", 1: "2GB"} +model_name = "bigscience/bloom-3b" +model_8bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping +) +``` +In this example, the first GPU will use 1GB of memory and the second 2GB. + +### Colab demos + +With this method you can run inference on models that were previously impossible to run on Google Colab. +Check out the demo for running T5-11b (42GB in fp32) with 8-bit quantization on Google Colab: + +[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) + +Or this demo for BLOOM-3B: + +[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) + +## Advanced usage: mixing FP4 (or Int8) and BetterTransformer + +You can combine the different methods described above to get the best performance for your model. 
For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention: + +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) + +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) + +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` diff --git a/docs/source/en/perf_infer_gpu_one.mdx b/docs/source/en/perf_infer_gpu_one.mdx deleted file mode 100644 index 086e2ff48709..000000000000 --- a/docs/source/en/perf_infer_gpu_one.mdx +++ /dev/null @@ -1,80 +0,0 @@ - - -# Efficient Inference on a Single GPU - -This document will be completed soon with information on how to infer on a single GPU. In the meantime you can check out [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu). - -## `BetterTransformer` for faster inference - -We have recently integrated `BetterTransformer` for faster inference on GPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details. - -## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition - -Note that this feature is also totally applicable in a multi GPU setup as well. - -From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support HuggingFace integration for all models in the Hub with a few lines of code. -The method reduce `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact to the quality by operating on the outliers in half-precision. - -![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) - -Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models. -For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration). - -![MixedInt8.gif](https://s3.amazonaws.com/moonup/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif) - -Note, that you would require a GPU to run mixed-8bit models as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store the quarter (or half if your model weights are in half precision) of the model before using this feature. -Below are some notes to help you use this module, or follow the demos on [Google colab](#colab-demos). - -### Requirements - -- Make sure you run that on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s RTX30s, A40-A100). 
-- Install the correct version of `bitsandbytes` by running: -`pip install bitsandbytes>=0.31.5` -- Install `accelerate` -`pip install accelerate>=0.12.0` - -### Running mixed-int8 models - single GPU setup - -After installing the required libraries, the way to load your mixed 8-bit model is as follows: -```py -model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) -``` - -### Running mixed-int8 models - multi GPU setup - -The way to load your mixed 8-bit model in multiple GPUs is as follows (same command as single GPU setup): -```py -model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) -``` -But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows: - -```py -max_memory_mapping = {0: "1GB", 1: "2GB"} -model_name = "bigscience/bloom-3b" -model_8bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping -) -``` -In this example, the first GPU will use 1GB of memory and the second 2GB. - -### Colab demos - -With this method you can infer on models that were not possible to infer on a Google Colab before. -Check out the demo for running T5-11b (42GB in fp32)! Using 8-bit quantization on Google Colab: - -[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) - -Or this demo for BLOOM-3B: - -[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) \ No newline at end of file diff --git a/docs/source/en/perf_infer_special.md b/docs/source/en/perf_infer_special.md new file mode 100644 index 000000000000..e5744754b88e --- /dev/null +++ b/docs/source/en/perf_infer_special.md @@ -0,0 +1,18 @@ + + +# Inference on Specialized Hardware + +This document will be completed soon with information on how to infer on specialized hardware. In the meantime you can check out [the guide for inference on CPUs](perf_infer_cpu). \ No newline at end of file diff --git a/docs/source/en/perf_infer_special.mdx b/docs/source/en/perf_infer_special.mdx deleted file mode 100644 index e18a9a104883..000000000000 --- a/docs/source/en/perf_infer_special.mdx +++ /dev/null @@ -1,14 +0,0 @@ - - -# Inference on Specialized Hardware - -This document will be completed soon with information on how to infer on specialized hardware. In the meantime you can check out [the guide for inference on CPUs](perf_infer_cpu). \ No newline at end of file diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md new file mode 100644 index 000000000000..a840e7d551ce --- /dev/null +++ b/docs/source/en/perf_torch_compile.md @@ -0,0 +1,359 @@ + + +# Optimize inference using torch.compile() + +This guide aims to provide a benchmark on the inference speed-ups introduced with [`torch.compile()`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for [computer vision models in 🤗 Transformers](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers&sort=trending). + +## Benefits of torch.compile + +Depending on the model and the GPU, `torch.compile()` yields up to 30% speed-up during inference. 
To use `torch.compile()`, simply install any version of `torch` above 2.0. + +Compiling a model takes time, so it is most useful when you compile the model only once instead of every time you infer. +To compile any computer vision model of your choice, call `torch.compile()` on the model as shown below: + +```diff +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to("cuda") ++ model = torch.compile(model) +``` + +`compile()` comes with multiple modes for compiling, which essentially differ in compilation time and inference overhead. `max-autotune` takes longer than `reduce-overhead` but results in faster inference. The default mode is fastest for compilation but is not as efficient as `reduce-overhead` for inference time. In this guide, we used the default mode. You can learn more about it [here](https://pytorch.org/get-started/pytorch-2.0/#user-experience). + +We benchmarked `torch.compile` with different computer vision models, tasks, types of hardware, and batch sizes on `torch` version 2.0.1. + +## Benchmarking code + +Below you can find the benchmarking code for each task. We warm up the GPU before inference and take the mean time of 300 inferences, using the same image each time. + +### Image Classification with ViT + +```python +import torch +from PIL import Image +import requests +import numpy as np +from transformers import AutoImageProcessor, AutoModelForImageClassification + +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) + +processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = torch.compile(model) + +processed_input = processor(image, return_tensors='pt').to(device="cuda") + +with torch.no_grad(): + _ = model(**processed_input) + +``` + +### Object Detection with DETR + +```python +from transformers import AutoImageProcessor, AutoModelForObjectDetection + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = torch.compile(model) + +inputs = processor(images=image, return_tensors="pt").to("cuda") + +with torch.no_grad(): + _ = model(**inputs) +``` + +### Image Segmentation with Segformer + +```python +from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation + +processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = torch.compile(model) +seg_inputs = processor(images=image, return_tensors="pt").to("cuda") + +with torch.no_grad(): + _ = model(**seg_inputs) +``` + +Below you can find the list of the models we benchmarked. 
+ +**Image Classification** +- [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) +- [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k) +- [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224) +- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) + +**Image Segmentation** +- [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) +- [facebook/mask2former-swin-tiny-coco-panoptic](https://huggingface.co/facebook/mask2former-swin-tiny-coco-panoptic) +- [facebook/maskformer-swin-base-ade](https://huggingface.co/facebook/maskformer-swin-base-ade) +- [google/deeplabv3_mobilenet_v2_1.0_513](https://huggingface.co/google/deeplabv3_mobilenet_v2_1.0_513) + +**Object Detection** +- [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) +- [facebook/detr-resnet-101](https://huggingface.co/facebook/detr-resnet-101) +- [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) + +Below you can find visualization of inference durations with and without `torch.compile()` and percentage improvements for each model in different hardware and batch sizes.
+ + +![Duration Comparison on V100 with Batch Size of 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/torch_compile/v100_1_duration.png) + +![Percentage Improvement on T4 with Batch Size of 4](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/torch_compile/T4_4_percentage.png) + +Below you can find inference durations in milliseconds for each model with and without `compile()`. Note that OwlViT results in OOM in larger batch sizes. + +### A100 (batch size: 1) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 9.325 | 7.584 | +| Image Segmentation/Segformer | 11.759 | 10.500 | +| Object Detection/OwlViT | 24.978 | 18.420 | +| Image Classification/BeiT | 11.282 | 8.448 | +| Object Detection/DETR | 34.619 | 19.040 | +| Image Classification/ConvNeXT | 10.410 | 10.208 | +| Image Classification/ResNet | 6.531 | 4.124 | +| Image Segmentation/Mask2former | 60.188 | 49.117 | +| Image Segmentation/Maskformer | 75.764 | 59.487 | +| Image Segmentation/MobileNet | 8.583 | 3.974 | +| Object Detection/Resnet-101 | 36.276 | 18.197 | +| Object Detection/Conditional-DETR | 31.219 | 17.993 | + + +### A100 (batch size: 4) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 14.832 | 14.499 | +| Image Segmentation/Segformer | 18.838 | 16.476 | +| Image Classification/BeiT | 13.205 | 13.048 | +| Object Detection/DETR | 48.657 | 32.418| +| Image Classification/ConvNeXT | 22.940 | 21.631 | +| Image Classification/ResNet | 6.657 | 4.268 | +| Image Segmentation/Mask2former | 74.277 | 61.781 | +| Image Segmentation/Maskformer | 180.700 | 159.116 | +| Image Segmentation/MobileNet | 14.174 | 8.515 | +| Object Detection/Resnet-101 | 68.101 | 44.998 | +| Object Detection/Conditional-DETR | 56.470 | 35.552 | + +### A100 (batch size: 16) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 40.944 | 40.010 | +| Image Segmentation/Segformer | 37.005 | 31.144 | +| Image Classification/BeiT | 41.854 | 41.048 | +| Object Detection/DETR | 164.382 | 161.902 | +| Image Classification/ConvNeXT | 82.258 | 75.561 | +| Image Classification/ResNet | 7.018 | 5.024 | +| Image Segmentation/Mask2former | 178.945 | 154.814 | +| Image Segmentation/Maskformer | 638.570 | 579.826 | +| Image Segmentation/MobileNet | 51.693 | 30.310 | +| Object Detection/Resnet-101 | 232.887 | 155.021 | +| Object Detection/Conditional-DETR | 180.491 | 124.032 | + +### V100 (batch size: 1) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 10.495 | 6.00 | +| Image Segmentation/Segformer | 13.321 | 5.862 | +| Object Detection/OwlViT | 25.769 | 22.395 | +| Image Classification/BeiT | 11.347 | 7.234 | +| Object Detection/DETR | 33.951 | 19.388 | +| Image Classification/ConvNeXT | 11.623 | 10.412 | +| Image Classification/ResNet | 6.484 | 3.820 | +| Image Segmentation/Mask2former | 64.640 | 49.873 | +| Image Segmentation/Maskformer | 95.532 | 72.207 | +| Image Segmentation/MobileNet | 9.217 | 4.753 | +| Object Detection/Resnet-101 | 52.818 | 28.367 | +| Object Detection/Conditional-DETR | 39.512 | 20.816 | + +### V100 (batch size: 4) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 15.181 | 14.501 | +| Image Segmentation/Segformer | 16.787 | 16.188 | +| Image Classification/BeiT | 15.171 | 14.753 | +| Object Detection/DETR | 88.529 | 64.195 | +| Image Classification/ConvNeXT | 29.574 | 27.085 | +| Image Classification/ResNet | 6.109 | 4.731 | +| Image Segmentation/Mask2former | 90.402 | 76.926 | +| Image Segmentation/Maskformer | 234.261 | 205.456 | +| Image Segmentation/MobileNet | 24.623 | 14.816 | +| Object Detection/Resnet-101 | 134.672 | 101.304 | +| Object Detection/Conditional-DETR | 97.464 | 69.739 | + +### V100 (batch size: 16) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 52.209 | 51.633 | +| Image Segmentation/Segformer | 61.013 | 55.499 | +| Image Classification/BeiT | 53.938 | 53.581 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 109.682 | 100.771 | +| Image Classification/ResNet | 14.857 | 12.089 | +| Image Segmentation/Mask2former | 249.605 | 222.801 | +| Image Segmentation/Maskformer | 831.142 | 743.645 | +| Image Segmentation/MobileNet | 93.129 | 55.365 | +| Object Detection/Resnet-101 | 482.425 | 361.843 | +| Object Detection/Conditional-DETR | 344.661 | 255.298 | + +### T4 (batch size: 1) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 16.520 | 15.786 | +| Image Segmentation/Segformer | 16.116 | 14.205 | +| Object Detection/OwlViT | 53.634 | 51.105 | +| Image Classification/BeiT | 16.464 | 15.710 | +| Object Detection/DETR | 73.100 | 53.99 | +| Image Classification/ConvNeXT | 32.932 | 30.845 | +| Image Classification/ResNet | 6.031 | 4.321 | +| Image Segmentation/Mask2former | 79.192 | 66.815 | +| Image Segmentation/Maskformer | 200.026 | 188.268 | +| Image Segmentation/MobileNet | 18.908 | 11.997 | +| Object Detection/Resnet-101 | 106.622 | 82.566 | +| Object Detection/Conditional-DETR | 77.594 | 56.984 | + +### T4 (batch size: 4) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 43.653 | 43.626 | +| Image Segmentation/Segformer | 45.327 | 42.445 | +| Image Classification/BeiT | 52.007 | 51.354 | +| Object Detection/DETR | 277.850 | 268.003 | +| Image Classification/ConvNeXT | 119.259 | 105.580 | +| Image Classification/ResNet | 13.039 | 11.388 | +| Image Segmentation/Mask2former | 201.540 | 184.670 | +| Image Segmentation/Maskformer | 764.052 | 711.280 | +| Image Segmentation/MobileNet | 74.289 | 48.677 | +| Object Detection/Resnet-101 | 421.859 | 357.614 | +| Object Detection/Conditional-DETR | 289.002 | 226.945 | + +### T4 (batch size: 16) + +| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:| +| Image Classification/ViT | 163.914 | 160.907 | +| Image Segmentation/Segformer | 192.412 | 163.620 | +| Image Classification/BeiT | 188.978 | 187.976 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 422.886 | 388.078 | +| Image Classification/ResNet | 44.114 | 37.604 | +| Image Segmentation/Mask2former | 756.337 | 695.291 | +| Image Segmentation/Maskformer | 2842.940 | 2656.88 | +| Image Segmentation/MobileNet | 299.003 | 201.942 | +| Object Detection/Resnet-101 | 1619.505 | 1262.758 | +| Object Detection/Conditional-DETR | 1137.513 | 897.390| + +## PyTorch Nightly +We also benchmarked on PyTorch nightly (2.1.0dev, find the wheel [here](https://download.pytorch.org/whl/nightly/cu118)) and observed improvement in latency both for uncompiled and compiled models. + +### A100 + +| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 12.462 | 6.954 | +| Image Classification/BeiT | 4 | 14.109 | 12.851 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 30.484 | 15.221 | +| Object Detection/DETR | 4 | 46.816 | 30.942 | +| Object Detection/DETR | 16 | 163.749 | 163.706 | + +### T4 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 14.408 | 14.052 | +| Image Classification/BeiT | 4 | 47.381 | 46.604 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 68.382 | 53.481 | +| Object Detection/DETR | 4 | 269.615 | 204.785 | +| Object Detection/DETR | 16 | OOM | OOM | + +### V100 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 13.477 | 7.926 | +| Image Classification/BeiT | 4 | 15.103 | 14.378 | +| Image Classification/BeiT | 16 | 52.517 | 51.691 | +| Object Detection/DETR | Unbatched | 28.706 | 19.077 | +| Object Detection/DETR | 4 | 88.402 | 62.949| +| Object Detection/DETR | 16 | OOM | OOM | + + +## Reduce Overhead +We benchmarked `reduce-overhead` compilation mode for A100 and T4 in Nightly. + +### A100 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 11.758 | 7.335 | +| Image Classification/ConvNeXT | 4 | 23.171 | 21.490 | +| Image Classification/ResNet | Unbatched | 7.435 | 3.801 | +| Image Classification/ResNet | 4 | 7.261 | 2.187 | +| Object Detection/Conditional-DETR | Unbatched | 32.823 | 11.627 | +| Object Detection/Conditional-DETR | 4 | 50.622 | 33.831 | +| Image Segmentation/MobileNet | Unbatched | 9.869 | 4.244 | +| Image Segmentation/MobileNet | 4 | 14.385 | 7.946 | + + +### T4 + +| **Task/Model** | **Batch Size** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 32.137 | 31.84 | +| Image Classification/ConvNeXT | 4 | 120.944 | 110.209 | +| Image Classification/ResNet | Unbatched | 9.761 | 7.698 | +| Image Classification/ResNet | 4 | 15.215 | 13.871 | +| Object Detection/Conditional-DETR | Unbatched | 72.150 | 57.660 | +| Object Detection/Conditional-DETR | 4 | 301.494 | 247.543 | +| Image Segmentation/MobileNet | Unbatched | 22.266 | 19.339 | +| Image Segmentation/MobileNet | 4 | 78.311 | 50.983 | + + diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md new file mode 100644 index 000000000000..9c81820ce7d5 --- /dev/null +++ b/docs/source/en/perf_train_cpu.md @@ -0,0 +1,67 @@ + + +# Efficient Training on CPU + +This guide focuses on training large models efficiently on CPU. + +## Mixed precision with IPEX + +IPEX is optimized for CPUs with AVX-512 or above, and functionally works for CPUs with only AVX2. So, it is expected to bring performance benefit for Intel CPU generations with AVX-512 or above while CPUs with only AVX2 (e.g., AMD CPUs or older Intel CPUs) might result in a better performance under IPEX, but not guaranteed. IPEX provides performance optimizations for CPU training with both Float32 and BFloat16. The usage of BFloat16 is the main focus of the following sections. + +Low precision data type BFloat16 has been natively supported on the 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) with AVX512 instruction set and will be supported on the next generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix Extensions (Intel® AMX) instruction set with further boosted performance. The Auto Mixed Precision for CPU backend has been enabled since PyTorch-1.10. At the same time, the support of Auto Mixed Precision with BFloat16 for CPU and BFloat16 optimization of operators has been massively enabled in Intel® Extension for PyTorch, and partially upstreamed to PyTorch master branch. Users can get better performance and user experience with IPEX Auto Mixed Precision. + +Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). + +### IPEX installation: + +IPEX release is following PyTorch, to install via pip: + +| PyTorch Version | IPEX version | +| :---------------: | :----------: | +| 1.13 | 1.13.0+cpu | +| 1.12 | 1.12.300+cpu | +| 1.11 | 1.11.200+cpu | +| 1.10 | 1.10.100+cpu | + +``` +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). + +### Usage in Trainer +To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments. + +Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + +- Training with IPEX using BF16 auto mixed precision on CPU: +
 python run_qa.py \
+--model_name_or_path bert-base-uncased \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 12 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/debug_squad/ \
+--use_ipex \
+--bf16 --no_cuda
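
If you configure the `Trainer` from Python rather than through the CLI, the same switches map onto `TrainingArguments`. A minimal sketch, assuming the rest of the training setup (model, datasets, etc.) is defined elsewhere:

```python
from transformers import TrainingArguments

# Programmatic equivalent of the --use_ipex / --bf16 / --no_cuda flags above
training_args = TrainingArguments(
    output_dir="/tmp/debug_squad/",
    use_ipex=True,  # enable Intel Extension for PyTorch optimizations
    bf16=True,      # BF16 auto mixed precision
    no_cuda=True,   # train on CPU
)
```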
+ +### Practice example + +Blog: [Accelerating PyTorch Transformers with Intel Sapphire Rapids](https://huggingface.co/blog/intel-sapphire-rapids) diff --git a/docs/source/en/perf_train_cpu.mdx b/docs/source/en/perf_train_cpu.mdx deleted file mode 100644 index c35a62fcdcef..000000000000 --- a/docs/source/en/perf_train_cpu.mdx +++ /dev/null @@ -1,60 +0,0 @@ - - -# Efficient Training on CPU - -This guide focuses on training large models efficiently on CPU. - -## Mixed precision with IPEX - -IPEX is optimized for CPUs with AVX-512 or above, and functionally works for CPUs with only AVX2. So, it is expected to bring performance benefit for Intel CPU generations with AVX-512 or above while CPUs with only AVX2 (e.g., AMD CPUs or older Intel CPUs) might result in a better performance under IPEX, but not guaranteed. IPEX provides performance optimizations for CPU training with both Float32 and BFloat16. The usage of BFloat16 is the main focus of the following sections. - -Low precision data type BFloat16 has been natively supported on the 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) with AVX512 instruction set and will be supported on the next generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix Extensions (Intel® AMX) instruction set with further boosted performance. The Auto Mixed Precision for CPU backend has been enabled since PyTorch-1.10. At the same time, the support of Auto Mixed Precision with BFloat16 for CPU and BFloat16 optimization of operators has been massively enabled in Intel® Extension for PyTorch, and partially upstreamed to PyTorch master branch. Users can get better performance and user experience with IPEX Auto Mixed Precision. - -Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). - -### IPEX installation: - -IPEX release is following PyTorch, to install via pip: - -| PyTorch Version | IPEX version | -| :---------------: | :----------: | -| 1.13 | 1.13.0+cpu | -| 1.12 | 1.12.300+cpu | -| 1.11 | 1.11.200+cpu | -| 1.10 | 1.10.100+cpu | - -``` -pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu -``` - -Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). - -### Usage in Trainer -To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments. - -Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - -- Training with IPEX using BF16 auto mixed precision on CPU: -
 python run_qa.py \
---model_name_or_path bert-base-uncased \
---dataset_name squad \
---do_train \
---do_eval \
---per_device_train_batch_size 12 \
---learning_rate 3e-5 \
---num_train_epochs 2 \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/debug_squad/ \
---use_ipex \
---bf16 --no_cuda
- diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md new file mode 100644 index 000000000000..4c131430babd --- /dev/null +++ b/docs/source/en/perf_train_cpu_many.md @@ -0,0 +1,134 @@ + + +# Efficient Training on Multiple CPUs + +When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling distributed CPU training efficiently. + +## Intel® oneCCL Bindings for PyTorch + +[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. For more information on oneCCL, please refer to the [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) and [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). + +Module `oneccl_bindings_for_pytorch` (`torch_ccl` before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now + +Check more detailed information for [oneccl_bind_pt](https://github.com/intel/torch-ccl). + +### Intel® oneCCL Bindings for PyTorch installation: + +Wheel files are available for the following Python versions: + +| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | +| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | +| 1.13.0 | | √ | √ | √ | √ | +| 1.12.100 | | √ | √ | √ | √ | +| 1.12.0 | | √ | √ | √ | √ | +| 1.11.0 | | √ | √ | √ | √ | +| 1.10.0 | √ | √ | √ | √ | | + +``` +pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu +``` +where `{pytorch_version}` should be your PyTorch version, for instance 1.13.0. +Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). +Versions of oneCCL and PyTorch must match. + + + +oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) +PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 + + + +## Intel® MPI library +Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit. + +oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it. + +for Intel® oneCCL >= 1.12.0 +``` +oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") +source $oneccl_bindings_for_pytorch_path/env/setvars.sh +``` + +for Intel® oneCCL whose version < 1.12.0 +``` +torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") +source $torch_ccl_path/env/setvars.sh +``` + +#### IPEX installation: + +IPEX provides performance optimizations for CPU training with both Float32 and BFloat16, you could refer [single CPU section](./perf_train_cpu). + + +The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example. + + +## Usage in Trainer +To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--ddp_backend ccl`** in the command arguments. 
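
Under the hood, `--ddp_backend ccl` makes the `Trainer` initialize its `torch.distributed` process group with the `ccl` backend provided by `oneccl_bindings_for_pytorch`. Roughly, this amounts to the following sketch (the process group environment variables such as `RANK`, `WORLD_SIZE`, `MASTER_ADDR` and `MASTER_PORT` are normally provided by the launcher, e.g. `mpirun`, or exported as in the commands below):

```python
import torch.distributed as dist

# Importing the bindings registers the "ccl" backend with torch.distributed
import oneccl_bindings_for_pytorch  # noqa: F401

dist.init_process_group(backend="ccl")
```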
+ +Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + + +The following command enables training with 2 processes on one Xeon node, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=127.0.0.1 + mpirun -n 2 -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex +``` +The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. + +In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. +```shell script + cat hostfile + xxx.xxx.xxx.xxx #node0 ip + xxx.xxx.xxx.xxx #node1 ip +``` +Now, run the following command in node0 and **4DDP** will be enabled in node0 and node1 with BF16 auto mixed precision: +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip + mpirun -f hostfile -n 4 -ppn 2 \ + -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex \ + --bf16 +``` diff --git a/docs/source/en/perf_train_cpu_many.mdx b/docs/source/en/perf_train_cpu_many.mdx deleted file mode 100644 index 1310e40d30e1..000000000000 --- a/docs/source/en/perf_train_cpu_many.mdx +++ /dev/null @@ -1,130 +0,0 @@ - - -# Efficient Training on Multiple CPUs - -When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling distributed CPU training efficiently. - -## Intel® oneCCL Bindings for PyTorch - -[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. For more information on oneCCL, please refer to the [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) and [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). - -Module `oneccl_bindings_for_pytorch` (`torch_ccl` before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now - -Check more detailed information for [oneccl_bind_pt](https://github.com/intel/torch-ccl). 
- -### Intel® oneCCL Bindings for PyTorch installation: - -Wheel files are available for the following Python versions: - -| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | -| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | -| 1.13.0 | | √ | √ | √ | √ | -| 1.12.100 | | √ | √ | √ | √ | -| 1.12.0 | | √ | √ | √ | √ | -| 1.11.0 | | √ | √ | √ | √ | -| 1.10.0 | √ | √ | √ | √ | | - -``` -pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu -``` -where `{pytorch_version}` should be your PyTorch version, for instance 1.13.0. -Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). -Versions of oneCCL and PyTorch must match. - - - -oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) -PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 - - - -## Intel® MPI library -Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit. - -oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it. - -for Intel® oneCCL >= 1.12.0 -``` -oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") -source $oneccl_bindings_for_pytorch_path/env/setvars.sh -``` - -for Intel® oneCCL whose version < 1.12.0 -``` -torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") -source $torch_ccl_path/env/setvars.sh -``` - -#### IPEX installation: - -IPEX provides performance optimizations for CPU training with both Float32 and BFloat16, you could refer [single CPU section](./perf_train_cpu). - - -The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example. - - -## Usage in Trainer -To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--xpu_backend ccl`** in the command arguments. - -Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - - -The following command enables training with 2 processes on one Xeon node, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. -```shell script - export CCL_WORKER_COUNT=1 - export MASTER_ADDR=127.0.0.1 - mpirun -n 2 -genv OMP_NUM_THREADS=23 \ - python3 run_qa.py \ - --model_name_or_path bert-large-uncased \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --no_cuda \ - --xpu_backend ccl \ - --use_ipex -``` -The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. - -In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. 
-```shell script - cat hostfile - xxx.xxx.xxx.xxx #node0 ip - xxx.xxx.xxx.xxx #node1 ip -``` -Now, run the following command in node0 and **4DDP** will be enabled in node0 and node1 with BF16 auto mixed precision: -```shell script - export CCL_WORKER_COUNT=1 - export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip - mpirun -f hostfile -n 4 -ppn 2 \ - -genv OMP_NUM_THREADS=23 \ - python3 run_qa.py \ - --model_name_or_path bert-large-uncased \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --no_cuda \ - --xpu_backend ccl \ - --use_ipex \ - --bf16 -``` diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md new file mode 100644 index 000000000000..fc93f763d815 --- /dev/null +++ b/docs/source/en/perf_train_gpu_many.md @@ -0,0 +1,530 @@ + + +# Efficient Training on Multiple GPUs + +When training on a single GPU is too slow or the model weights don't fit in a single GPUs memory we use a multi-GPU setup. Switching from a single GPU to multiple requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallism such as data, tensor, or pipeline parallism. However, there is no one solution to fit them all and which settings works best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations. + + + + Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training. + + + +We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented. + +## Concepts + +The following is the brief description of the main concepts that will be described later in depth in this document. + +1. **DataParallel (DP)** - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step. +2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level. +3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch. +4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. 
It also supports various offloading techniques to compensate for limited GPU memory. +5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO. + +Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure. + +## Scalability Strategy + +**⇨ Single Node / Multi-GPU** +* Model fits onto a single GPU: + + 1. DDP - Distributed DP + 2. ZeRO - may or may not be faster depending on the situation and configuration used + +* Model doesn't fit onto a single GPU: + + 1. PP + 2. ZeRO + 3. TP + + With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. + + TP is almost always used within a single node. That is TP size <= gpus per node. + +* Largest Layer not fitting into a single GPU: + + 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. + 2. With ZeRO see the same entry for "Single GPU" above + + +**⇨ Multi-Node / Multi-GPU** + +* When you have fast inter-node connectivity: + + 1. ZeRO - as it requires close to no modifications to the model + 2. PP+TP+DP - less communications, but requires massive changes to the model + +* when you have slow inter-node connectivity and still low on GPU memory: + + 1. DP+PP+TP+ZeRO-1 + + + +## Data Parallelism + +Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch. Note that in general it is advised to use DDP as it is better maintained and works for all models while DP might fail for some models. [PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) itself recommends the use of DDP. + +### DP vs DDP + +`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case: +* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL +* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP + +Here are the main differences in the inter-GPU communication overhead between the two modes: + +[DDP](https://pytorch.org/docs/master/notes/ddp.html): + +- At the start time the main process replicates the model once from gpu 0 to the rest of gpus +- Then for each batch: + 1. each gpu consumes each own mini-batch of data directly + 2. during `backward`, once the local gradients are ready, they are then averaged across all processes + +[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html): + +For each batch: + 1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu + 2. replicates the up-to-date model from gpu 0 to each gpu + 3. runs `forward` and sends output from each gpu to gpu 0, computes loss + 4. scatters loss from gpu 0 to all gpus, runs `backward` + 5. sends gradients from each gpu to gpu 0 and averages those + +The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch. + +DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html). 
+ +Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus. + +You can use DDP across multiple machines, but this is not the case with DP. + +There are other differences between DP and DDP but they aren't relevant to this discussion. + +If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know. + +Let's look at an actual benchmark: + +| Type | NVlink | Time | +| :----- | ----- | ---: | +| 2:DP | Y | 110s | +| 2:DDP | Y | 101s | +| 2:DDP | N | 131s | + + +Analysis: + +Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink + +The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime. + +Here is the full benchmark code and outputs: + +`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark. + +``` + +# DP +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ +python examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69} + +# DDP w/ NVlink +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ +python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +# DDP w/o NVlink +rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \ +python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} +``` + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) +Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` + +## ZeRO Data Parallelism + +ZeRO-powered data parallelism (ZeRO-DP) is described on the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) +![DeepSpeed-Image-1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png) + +It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual `DataParallel` (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of it. And then at run-time when the full layer params are needed just for the given layer, all GPUs synchronize to give each other parts that they miss - this is it. 
+ +Consider this simple model with 3 layers, where each layer has 3 params: +``` +La | Lb | Lc +---|----|--- +a0 | b0 | c0 +a1 | b1 | c1 +a2 | b2 | c2 +``` +Layer La has weights a0, a1 and a2. + +If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so: + +``` +GPU0: +La | Lb | Lc +---|----|--- +a0 | b0 | c0 + +GPU1: +La | Lb | Lc +---|----|--- +a1 | b1 | c1 + +GPU2: +La | Lb | Lc +---|----|--- +a2 | b2 | c2 +``` + +In a way this is the same horizontal slicing, as tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point. + +Now each of these GPUs will get the usual mini-batch as it works in DP: +``` +x0 => GPU0 +x1 => GPU1 +x2 => GPU2 +``` + +The inputs are unmodified - they think they are going to be processed by the normal model. + +First, the inputs hit the layer La. + +Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together. + +In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2. + +Same happens to GPU2 that gets input x2. It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor. + +All 3 GPUs get the full tensors reconstructed and a forward happens. + +As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch. + +And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. + +To me this sounds like an efficient group backpacking weight distribution strategy: + +1. person A carries the tent +2. person B carries the stove +3. person C carries the axe + +Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP. + +Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch. + +While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. + +If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next. + +Implementations: + +- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3 +- [`transformers` integration](main_classes/trainer#trainer-integrations) + +## Naive Model Parallelism (Vertical) and Pipeline Parallelism + +Naive Model Parallelism (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified. + +We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. 
For example, if the following diagram shows an 8-layer model: + +``` +=================== =================== +| 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | +=================== =================== + gpu0 gpu1 +``` +we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1. + +Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger. + +Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work. + +Problems: +- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) +- shared embeddings may need to get copied back and forth between GPUs. + +Pipeline Parallelism (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. + +The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom: + +![mp-pp](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png) + +It's easy to see from the bottom diagram how PP has less dead zones, where GPUs are idle. The idle parts are referred to as the "bubble". + +Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0. + +PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). + +Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS. + +Because of the chunks, PP introduces the concept of micro-batches (MBS). 
DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time.
+
+To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`).
+
+Let's go back to the diagram.
+
+With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes, which may not be very efficient either. So one has to experiment to find the value that leads to the most efficient utilization of the GPUs.
+
+While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs, which translates to minimizing the size of the bubble.
+
+There are 2 groups of solutions - the traditional Pipeline API and the more modern solutions that make things much easier for the end user.
+
+Traditional Pipeline API solutions:
+- PyTorch
+- DeepSpeed
+- Megatron-LM
+
+Modern solutions:
+- Varuna
+- SageMaker
+
+Problems with traditional Pipeline API solutions:
+- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into an `nn.Sequential` sequence of the same, which may require changes to the design of the model.
+- currently the Pipeline API is very restricted. If you have a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since the pipeline is going to chunk the mini-batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
+- conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage.
+- have to arrange each layer so that the output of one model becomes an input to the other model.
+
+We have yet to experiment with Varuna and SageMaker, but their papers report that they have overcome the list of problems mentioned above and that they require much smaller changes to the user's model.
+
+Implementations:
+- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
+- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
+- [Varuna](https://github.com/microsoft/varuna)
+- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
+- [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on Hugging Face Transformers.
+
+🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support.
The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs be Tensors. This is because currently the models include many features that make the conversion very complicated, and these will need to be removed to accomplish that.
+
+Other approaches:
+
+DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html).
+![interleaved-pipeline-execution](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png)
+
+Here the bubble (idle time) is further minimized by prioritizing backward passes.
+
+Varuna further tries to improve the schedule by using simulations to discover the most efficient scheduling.
+
+OSLO has a pipeline parallelism implementation based on Transformers, without `nn.Sequential` conversion.
+
+## Tensor Parallelism
+
+In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing.
+
+In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).
+
+The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`.
+
+Following the Megatron paper's notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix.
+
+If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs:
+![Parallel GEMM](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png)
+
+If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:
+![independent GeLU](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png)
+
+Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that:
+![parallel shard processing](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png)
+
+Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads!
+![parallel self-attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png)
+
+Special considerations: TP requires a very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs.
+
+This section is based on the original, much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530)
+by [@anton-l](https://github.com/anton-l).
+
+SageMaker combines TP with DP for more efficient processing.
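+
+To make the column-splitting idea concrete, here is a small numerical sketch (our own toy example, not Megatron-LM code); the shapes and the 2-way split are arbitrary choices for illustration:
+
+```python
+import torch
+
+torch.manual_seed(0)
+X = torch.randn(8, 16)   # a batch of 8 inputs with hidden size 16
+A = torch.randn(16, 32)  # the weight matrix of the first MLP projection
+
+# Split A column-wise into 2 shards, as if each lived on its own GPU.
+A_shards = torch.chunk(A, chunks=2, dim=1)
+
+# Each "GPU" computes GeLU(X @ A_i) independently - no communication is needed yet.
+Y_shards = [torch.nn.functional.gelu(X @ A_i) for A_i in A_shards]
+
+# Concatenating the shards reconstructs the full output.
+Y = torch.cat(Y_shards, dim=1)
+print(torch.allclose(Y, torch.nn.functional.gelu(X @ A), atol=1e-6))  # True
+```
+
+In a real implementation the second linear layer of the MLP is typically split row-wise instead, and its partial outputs are summed with an all-reduce - that is the synchronization point mentioned above.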
+
+Alternative names:
+- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
+
+Implementations:
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific
+- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)
+- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
+- [OSLO](https://github.com/tunib-ai/oslo) has a tensor parallelism implementation based on Transformers.
+
+🤗 Transformers status:
+- core: not yet implemented in the core
+- but if you want inference, [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
+- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode; see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)
+
+## DP+PP
+
+The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP.
+
+![dp-pp-2d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png)
+
+Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there are just GPUs 0 and 1, to which it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+- [Varuna](https://github.com/microsoft/varuna)
+- [SageMaker](https://arxiv.org/abs/2111.05972)
+- [OSLO](https://github.com/tunib-ai/oslo)
+
+🤗 Transformers status: not yet implemented
+
+## DP+PP+TP
+
+To get even more efficient training, 3D parallelism is used, where PP is combined with TP and DP. This can be seen in the following diagram.
+
+![dp-pp-tp-3d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png)
+
+This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+- [Varuna](https://github.com/microsoft/varuna)
+- [SageMaker](https://arxiv.org/abs/2111.05972)
+- [OSLO](https://github.com/tunib-ai/oslo)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+## ZeRO DP+PP+TP
+
+One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP.
+
+When ZeRO-DP is combined with PP (and optionally TP), it typically enables only ZeRO stage 1 (optimizer sharding).
+
+While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt.
+
+In addition, there are already fewer layers than normal due to PP, and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than in pure DP.
+
+ZeRO stage 3 is not a good choice either for the same reason - more inter-node communication is required.
+
+And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1, the optimizer states can be offloaded to CPU.
+
+Implementations:
+- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) and [Megatron-Deepspeed from BigScience](https://github.com/bigscience-workshop/Megatron-DeepSpeed), which is a fork of the former repo.
+- [OSLO](https://github.com/tunib-ai/oslo)
+
+Important papers:
+
+- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](
+https://arxiv.org/abs/2201.11990)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+## FlexFlow
+
+[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem with a slightly different approach.
+
+Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358)
+
+It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
+
+1. Sample = Data Parallelism (sample-wise parallel)
+2. Operator = Parallelize a single operation into several sub-operations
+3. Attribute = Data Parallelism (length-wise parallel)
+4. Parameter = Model Parallelism (regardless of dimension - horizontal or vertical)
+
+Examples:
+* Sample
+
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
+
+* Operator
+
+If we perform layer normalization, we compute std first and mean second, and then we can normalize data. Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std while cuda:1 computes mean at the same time (see the short sketch further below).
+
+* Attribute
+
+We have 10 batches of length 512. If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256.
+
+* Parameter
+
+It is similar to tensor model parallelism or naive layer-wise model parallelism.
+
+![flex-flow-soap](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg)
+
+The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these, algorithmically deciding which parallelization to use where.
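+
+As a toy illustration of the Operator example above (this is just hand-written PyTorch, not FlexFlow's API; it falls back to CPU when two GPUs are not available):
+
+```python
+import torch
+
+# Two "devices" for the two sub-operations of a layer normalization.
+d0 = "cuda:0" if torch.cuda.device_count() >= 2 else "cpu"
+d1 = "cuda:1" if torch.cuda.device_count() >= 2 else "cpu"
+
+x = torch.randn(10, 512)
+x0, x1 = x.to(d0), x.to(d1)           # copy the input to both devices
+
+mean = x0.mean(dim=-1, keepdim=True)  # computed on d0
+std = x1.std(dim=-1, keepdim=True)    # computed on d1, concurrently when both are GPUs
+
+normalized = (x0 - mean) / (std.to(d0) + 1e-5)
+```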
+ +One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. + +So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization. + +🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models. + + +## Which Strategy To Use When + +Here is a very rough outline at which parallelism strategy to use when. The first on each list is typically faster. + +**⇨ Single GPU** + +* Model fits onto a single GPU: + + 1. Normal use + +* Model doesn't fit onto a single GPU: + + 1. ZeRO + Offload CPU and optionally NVMe + 2. as above plus Memory Centric Tiling (see below for details) if the largest layer can't fit into a single GPU + +* Largest Layer not fitting into a single GPU: + +1. ZeRO - Enable [Memory Centric Tiling](https://deepspeed.readthedocs.io/en/latest/zero3.html#memory-centric-tiling) (MCT). It allows you to run arbitrarily large layers by automatically splitting them and executing them sequentially. MCT reduces the number of parameters that are live on a GPU, but it does not affect the activation memory. As this need is very rare as of this writing a manual override of `torch.nn.Linear` needs to be done by the user. + +**⇨ Single Node / Multi-GPU** + +* Model fits onto a single GPU: + + 1. DDP - Distributed DP + 2. ZeRO - may or may not be faster depending on the situation and configuration used + +* Model doesn't fit onto a single GPU: + + 1. PP + 2. ZeRO + 3. TP + + With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. + + TP is almost always used within a single node. That is TP size <= gpus per node. + +* Largest Layer not fitting into a single GPU: + + 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. + 2. With ZeRO see the same entry for "Single GPU" above + + +**⇨ Multi-Node / Multi-GPU** + +* When you have fast inter-node connectivity: + + 1. ZeRO - as it requires close to no modifications to the model + 2. PP+TP+DP - less communications, but requires massive changes to the model + +* when you have slow inter-node connectivity and still low on GPU memory: + + 1. DP+PP+TP+ZeRO-1 diff --git a/docs/source/en/perf_train_gpu_many.mdx b/docs/source/en/perf_train_gpu_many.mdx deleted file mode 100644 index 17eb7b739925..000000000000 --- a/docs/source/en/perf_train_gpu_many.mdx +++ /dev/null @@ -1,529 +0,0 @@ - - -# Efficient Training on Multiple GPUs - -When training on a single GPU is too slow or the model weights don't fit in a single GPUs memory we use a multi-GPU setup. Switching from a single GPU to multiple requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallism such as data, tensor, or pipeline parallism. 
However, there is no one solution to fit them all and which settings works best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations. - - - - Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training. - - - -We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented. - -## Concepts - -The following is the brief description of the main concepts that will be described later in depth in this document. - -1. **DataParallel (DP)** - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step. -2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level. -3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch. -4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory. -5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO. - -Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure. - -## Scalability Strategy - -**⇨ Single Node / Multi-GPU** -* Model fits onto a single GPU: - - 1. DDP - Distributed DP - 2. ZeRO - may or may not be faster depending on the situation and configuration used - -* Model doesn't fit onto a single GPU: - - 1. PP - 2. ZeRO - 3. TP - - With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. - - TP is almost always used within a single node. That is TP size <= gpus per node. - -* Largest Layer not fitting into a single GPU: - - 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. - 2. With ZeRO see the same entry for "Single GPU" above - - -**⇨ Multi-Node / Multi-GPU** - -* When you have fast inter-node connectivity: - - 1. ZeRO - as it requires close to no modifications to the model - 2. 
PP+TP+DP - less communications, but requires massive changes to the model - -* when you have slow inter-node connectivity and still low on GPU memory: - - 1. DP+PP+TP+ZeRO-1 - - - -## Data Parallelism - -Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch. Note that in general it is advised to use DDP as it is better maintained and works for all models while DP might fail for some models. [PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) itself recommends the use of DDP. - -### DP vs DDP - -`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case: -* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL -* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP - -Here are the main differences in the inter-GPU communication overhead between the two modes: - -[DDP](https://pytorch.org/docs/master/notes/ddp.html): - -- At the start time the main process replicates the model once from gpu 0 to the rest of gpus -- Then for each batch: - 1. each gpu consumes each own mini-batch of data directly - 2. during `backward`, once the local gradients are ready, they are then averaged across all processes - -[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html): - -For each batch: - 1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu - 2. replicates the up-to-date model from gpu 0 to each gpu - 3. runs `forward` and sends output from each gpu to gpu 0, computes loss - 4. scatters loss from gpu 0 to all gpus, runs `backward` - 5. sends gradients from each gpu to gpu 0 and averages those - -The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch. - -DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html). - -Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus. - -You can use DDP across multiple machines, but this is not the case with DP. - -There are other differences between DP and DDP but they aren't relevant to this discussion. - -If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know. - -Let's look at an actual benchmark: - -| Type | NVlink | Time | -| :----- | ----- | ---: | -| 2:DP | Y | 110s | -| 2:DDP | Y | 101s | -| 2:DDP | N | 131s | - - -Analysis: - -Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink - -The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime. - -Here is the full benchmark code and outputs: - -`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark. 
- -``` - -# DP -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ -python examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69} - -# DDP w/ NVlink -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ -python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVlink -rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \ -python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` - -## ZeRO Data Parallelism - -ZeRO-powered data parallelism (ZeRO-DP) is described on the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) -![DeepSpeed-Image-1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png) - -It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual `DataParallel` (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of it. And then at run-time when the full layer params are needed just for the given layer, all GPUs synchronize to give each other parts that they miss - this is it. - -Consider this simple model with 3 layers, where each layer has 3 params: -``` -La | Lb | Lc ----|----|--- -a0 | b0 | c0 -a1 | b1 | c1 -a2 | b2 | c2 -``` -Layer La has weights a0, a1 and a2. - -If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so: - -``` -GPU0: -La | Lb | Lc ----|----|--- -a0 | b0 | c0 - -GPU1: -La | Lb | Lc ----|----|--- -a1 | b1 | c1 - -GPU2: -La | Lb | Lc ----|----|--- -a2 | b2 | c2 -``` - -In a way this is the same horizontal slicing, as tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point. - -Now each of these GPUs will get the usual mini-batch as it works in DP: -``` -x0 => GPU0 -x1 => GPU1 -x2 => GPU2 -``` - -The inputs are unmodified - they think they are going to be processed by the normal model. - -First, the inputs hit the layer La. - -Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together. - -In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2. - -Same happens to GPU2 that gets input x2. 
It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor. - -All 3 GPUs get the full tensors reconstructed and a forward happens. - -As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch. - -And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. - -To me this sounds like an efficient group backpacking weight distribution strategy: - -1. person A carries the tent -2. person B carries the stove -3. person C carries the axe - -Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP. - -Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch. - -While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. - -If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next. - -Implementations: - -- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3 -- [Fairscale](https://github.com/facebookresearch/fairscale/#optimizer-state-sharding-zero) ZeRO-DP stages 1+2+3 -- [`transformers` integration](main_classes/trainer#trainer-integrations) - -## Naive Model Parallelism (Vertical) and Pipeline Parallelism - -Naive Model Parallelism (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified. - -We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. For example, if the following diagram shows an 8-layer model: - -``` -=================== =================== -| 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | -=================== =================== - gpu0 gpu1 -``` -we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1. - -Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger. - -Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work. - -Problems: -- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. 
Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) -- shared embeddings may need to get copied back and forth between GPUs. - -Pipeline Parallelism (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. - -The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom: - -![mp-pp](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png) - -It's easy to see from the bottom diagram how PP has less dead zones, where GPUs are idle. The idle parts are referred to as the "bubble". - -Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0. - -PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottomw diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). - -Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS. - -Because of the chunks, PP introduces the concept of micro-batches (MBS). DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time. - -To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`). - -Let's go back to the diagram. - -With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes which could be not every efficient either. So one has to experiment to find the value that leads to the highest efficient utilization of the gpus. - -While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble. - -There are 2 groups of solutions - the traditional Pipeline API and the more modern solutions that make things much easier for the end user. 
- -Traditional Pipeline API solutions: -- PyTorch -- FairScale -- DeepSpeed -- Megatron-LM - -Modern solutions: -- Varuna -- Sagemaker - -Problems with traditional Pipeline API solutions: -- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model. -- currently the Pipeline API is very restricted. If you had a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693 -- conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage. -- have to arrange each layer so that the output of one model becomes an input to the other model. - -We are yet to experiment with Varuna and SageMaker but their papers report that they have overcome the list of problems mentioned above and that they require much smaller changes to the user's model. - -Implementations: -- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py) -- [FairScale](https://fairscale.readthedocs.io/en/latest/tutorials/pipe.html) -- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/) -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API. -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS. -- [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on the Hugging Face Transformers. - -🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support. The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that. - -Other approaches: - -DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html) -![interleaved-pipeline-execution](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png) - -Here the bubble (idle time) is further minimized by prioritizing backward passes. - -Varuna further tries to improve the schedule by using simulations to discover the most efficient scheduling. - -OSLO has pipeline parallelism implementation based on the Transformers without `nn.Sequential` converting. - -## Tensor Parallelism - -In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing. 
- -In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473). - -The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`. - -Following the Megatron's paper notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix. - -If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs: -![Parallel GEMM](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png) - -If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently: -![independent GeLU](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png) - -Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that: -![parallel shard processing](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png) - -Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads! -![parallel self-attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png) - -Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs. - -This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530). -by [@anton-l](https://github.com/anton-l). - -SageMaker combines TP with DP for a more efficient processing. - -Alternative names: -- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism) - -Implementations: -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific -- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment) -- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS. -- [OSLO](https://github.com/tunib-ai/oslo) has the tensor parallelism implementation based on the Transformers. - -🤗 Transformers status: -- core: not yet implemented in the core -- but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too. 
-- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/) - -## DP+PP - -The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP. - -![dp-pp-2d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png) - -Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid. - -Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs. - -Implementations: -- [DeepSpeed](https://github.com/microsoft/DeepSpeed) -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) -- [OSLO](https://github.com/tunib-ai/oslo) - -🤗 Transformers status: not yet implemented - -## DP+PP+TP - -To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram. - -![dp-pp-tp-3d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png) - -This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well. - -Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs. - -Implementations: -- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP. -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) -- [OSLO](https://github.com/tunib-ai/oslo) - -🤗 Transformers status: not yet implemented, since we have no PP and TP. - -## ZeRO DP+PP+TP - -One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP. - -When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding). - -While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt. - -In addition, There are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP. 
- -ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required. - -And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1 optimizer states can be offloaded to CPU. - -Implementations: -- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) and [Megatron-Deepspeed from BigScience](https://github.com/bigscience-workshop/Megatron-DeepSpeed), which is the fork of the former repo. -- [OSLO](https://github.com/tunib-ai/oslo) - -Important papers: - -- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model]( -https://arxiv.org/abs/2201.11990) - -🤗 Transformers status: not yet implemented, since we have no PP and TP. - -## FlexFlow - -[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem in a slightly different approach. - -Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358) - -It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter. - -1. Sample = Data Parallelism (sample-wise parallel) -2. Operator = Parallelize a single operation into several sub-operations -3. Attribute = Data Parallelism (length-wise parallel) -4. Parameter = Model Parallelism (regardless of dimension - horizontal or vertical) - -Examples: -* Sample - -Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512. - -* Operator - -If we perform layer normalization, we compute std first and mean second, and then we can normalize data. Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time. - -* Attribute - -We have 10 batches of 512 length. If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256. - -* Parameter - -It is similar with tensor model parallelism or naive layer-wise model parallelism. - -![flex-flow-soap](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg) - -The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these algorithmically deciding which parallelisation to use where. - -One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. - -So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization. - -🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models. 
- - -## Which Strategy To Use When - -Here is a very rough outline at which parallelism strategy to use when. The first on each list is typically faster. - -**⇨ Single GPU** - -* Model fits onto a single GPU: - - 1. Normal use - -* Model doesn't fit onto a single GPU: - - 1. ZeRO + Offload CPU and optionally NVMe - 2. as above plus Memory Centric Tiling (see below for details) if the largest layer can't fit into a single GPU - -* Largest Layer not fitting into a single GPU: - -1. ZeRO - Enable [Memory Centric Tiling](https://deepspeed.readthedocs.io/en/latest/zero3.html#memory-centric-tiling) (MCT). It allows you to run arbitrarily large layers by automatically splitting them and executing them sequentially. MCT reduces the number of parameters that are live on a GPU, but it does not affect the activation memory. As this need is very rare as of this writing a manual override of `torch.nn.Linear` needs to be done by the user. - -**⇨ Single Node / Multi-GPU** - -* Model fits onto a single GPU: - - 1. DDP - Distributed DP - 2. ZeRO - may or may not be faster depending on the situation and configuration used - -* Model doesn't fit onto a single GPU: - - 1. PP - 2. ZeRO - 3. TP - - With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup. - - TP is almost always used within a single node. That is TP size <= gpus per node. - -* Largest Layer not fitting into a single GPU: - - 1. If not using ZeRO - must use TP, as PP alone won't be able to fit. - 2. With ZeRO see the same entry for "Single GPU" above - - -**⇨ Multi-Node / Multi-GPU** - -* When you have fast inter-node connectivity: - - 1. ZeRO - as it requires close to no modifications to the model - 2. PP+TP+DP - less communications, but requires massive changes to the model - -* when you have slow inter-node connectivity and still low on GPU memory: - - 1. DP+PP+TP+ZeRO-1 diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md new file mode 100644 index 000000000000..17b62c3a1379 --- /dev/null +++ b/docs/source/en/perf_train_gpu_one.md @@ -0,0 +1,532 @@ + + +# Methods and tools for efficient training on a single GPU + +This guide demonstrates practical techniques that you can use to increase the efficiency of your model's training by +optimizing memory utilization, speeding up the training, or both. If you'd like to understand how GPU is utilized during +training, please refer to the [Model training anatomy](model_memory_anatomy) conceptual guide first. This guide +focuses on practical techniques. + + + +If you have access to a machine with multiple GPUs, these approaches are still valid, plus you can leverage additional methods outlined in the [multi-GPU section](perf_train_gpu_many). + + + +When training large models, there are two aspects that should be considered at the same time: + +* Data throughput/training time +* Model performance + +Maximizing the throughput (samples/second) leads to lower training cost. This is generally achieved by utilizing the GPU +as much as possible and thus filling GPU memory to its limit. If the desired batch size exceeds the limits of the GPU memory, +the memory optimization techniques, such as gradient accumulation, can help. + +However, if the preferred batch size fits into memory, there's no reason to apply memory-optimizing techniques because they can +slow down the training. 
Just because one can use a large batch size does not necessarily mean one should. As part of
+hyperparameter tuning, you should determine which batch size yields the best results and then optimize resources accordingly.
+
+The methods and tools covered in this guide can be classified based on the effect they have on the training process:
+
+| Method/tool | Improves training speed | Optimizes memory utilization |
+|:-----------------------------------------------------------|:------------------------|:-----------------------------|
+| [Batch size choice](#batch-size-choice) | Yes | Yes |
+| [Gradient accumulation](#gradient-accumulation) | No | Yes |
+| [Gradient checkpointing](#gradient-checkpointing) | No | Yes |
+| [Mixed precision training](#mixed-precision-training) | Yes | (No) |
+| [Optimizer choice](#optimizer-choice) | Yes | Yes |
+| [Data preloading](#data-preloading) | Yes | No |
+| [DeepSpeed Zero](#deepspeed-zero) | No | Yes |
+| [torch.compile](#using-torchcompile) | Yes | No |
+
+
+
+Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a
+large model and a small batch size, the memory use will be larger.
+
+
+
+You can combine the above methods to get a cumulative effect. These techniques are available to you whether you are
+training your model with [`Trainer`] or writing a pure PyTorch loop, in which case you can [configure these optimizations
+with 🤗 Accelerate](#using-accelerate).
+
+If these methods do not result in sufficient gains, you can explore the following options:
+* [Look into building your own custom Docker container with efficient software prebuilds](#efficient-software-prebuilds)
+* [Consider a model that uses Mixture of Experts (MoE)](#mixture-of-experts)
+* [Convert your model to BetterTransformer to leverage PyTorch native attention](#using-pytorch-native-attention)
+
+Finally, if all of the above is still not enough, even after switching to a server-grade GPU like A100, consider moving
+to a multi-GPU setup. All these approaches are still valid in a multi-GPU setup, plus you can leverage additional parallelism
+techniques outlined in the [multi-GPU section](perf_train_gpu_many).
+
+## Batch size choice
+
+To achieve optimal performance, start by identifying the appropriate batch size. It is recommended to use batch sizes and
+input/output neuron counts that are of size 2^N. Often it's a multiple of 8, but it can be
+higher depending on the hardware being used and the model's dtype.
+
+For reference, check out NVIDIA's recommendation for [input/output neuron counts](
+https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and
+[batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size) for
+fully connected layers (which are involved in GEMMs (General Matrix Multiplications)).
+
+[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc)
+define the multiplier based on the dtype and the hardware. For instance, for the fp16 data type a multiple of 8 is recommended, unless
+it's an A100 GPU, in which case use multiples of 64.
+
+For parameters that are small, consider also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization).
+This is where tiling happens and the right multiplier can have a significant speedup.
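+
+As a minimal illustration of the rule above, you can round a dimension (for example a padded vocabulary or projection size) up to the recommended multiple; the helper below is our own, not part of 🤗 Transformers:
+
+```py
+def pad_to_multiple(size: int, multiple: int = 8) -> int:
+    """Round `size` up to the nearest multiple (e.g. 8 for fp16, 64 for fp16 on A100)."""
+    return ((size + multiple - 1) // multiple) * multiple
+
+
+pad_to_multiple(50_257)   # 50264 - a GPT-2-sized vocabulary padded for Tensor Cores
+pad_to_multiple(300, 64)  # 320
+```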
+
+## Gradient Accumulation
+
+The **gradient accumulation** method aims to calculate gradients in smaller increments instead of computing them for the
+entire batch at once. This approach involves iteratively calculating gradients in smaller batches by performing forward
+and backward passes through the model and accumulating the gradients during the process. Once a sufficient number of
+gradients have been accumulated, the model's optimization step is executed. By employing gradient accumulation, it
+becomes possible to increase the **effective batch size** beyond the limitations imposed by the GPU's memory capacity.
+However, it is important to note that the additional forward and backward passes introduced by gradient accumulation can
+slow down the training process.
+
+You can enable gradient accumulation by adding the `gradient_accumulation_steps` argument to [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)
+```
+
+In the above example, your effective batch size becomes 4.
+
+Alternatively, use 🤗 Accelerate to gain full control over the training loop. Find the 🤗 Accelerate example
+[further down in this guide](#using-accelerate).
+
+While it is advised to max out GPU usage as much as possible, a high number of gradient accumulation steps can
+result in a more pronounced training slowdown. Consider the following example. Let's say the `per_device_train_batch_size=4`
+without gradient accumulation hits the GPU's limit. If you would like to train with batches of size 64, do not set the
+`per_device_train_batch_size` to 1 and `gradient_accumulation_steps` to 64. Instead, keep `per_device_train_batch_size=4`
+and set `gradient_accumulation_steps=16`. This results in the same effective batch size while making better use of
+the available GPU resources.
+
+For additional information, please refer to batch size and gradient accumulation benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537)
+and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957).
+
+## Gradient Checkpointing
+
+Some large models may still face memory issues even when the batch size is set to 1 and gradient accumulation is used.
+This is because there are other components that also require memory storage.
+
+Saving all activations from the forward pass in order to compute the gradients during the backward pass can result in
+significant memory overhead. The alternative approach of discarding the activations and recalculating them when needed
+during the backward pass would introduce a considerable computational overhead and slow down the training process.
+
+**Gradient checkpointing** offers a compromise between these two approaches and saves strategically selected activations
+throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. For
+an in-depth explanation of gradient checkpointing, refer to [this great article](https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9).
+
+To enable gradient checkpointing in the [`Trainer`], pass the corresponding flag to [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(
+    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
+)
+```
+
+Alternatively, use 🤗 Accelerate - find the 🤗 Accelerate example [further in this guide](#using-accelerate).
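+
+If you are writing your own training loop instead, the same feature can be enabled directly on any 🤗 Transformers model (a minimal sketch; `gpt2` is only used as an example checkpoint):
+
+```py
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+model.gradient_checkpointing_enable()  # activations are recomputed during the backward pass
+```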
+
+While gradient checkpointing may improve memory efficiency, it slows training by approximately 20%.
+
+## Mixed precision training
+
+**Mixed precision training** is a technique that aims to optimize the computational efficiency of training models by
+utilizing lower-precision numerical formats for certain variables. Traditionally, most models use 32-bit floating point
+precision (fp32 or float32) to represent and process variables. However, not all variables require this high precision
+level to achieve accurate results. By reducing the precision of certain variables to lower numerical formats like 16-bit
+floating point (fp16 or float16), we can speed up the computations. Because in this approach some computations are performed
+in half-precision, while some are still in full precision, the approach is called mixed precision training.
+
+Most commonly mixed precision training is achieved by using fp16 (float16) data types; however, some GPU architectures
+(such as the Ampere architecture) offer bf16 and tf32 (CUDA internal data type) data types. Check
+out the [NVIDIA Blog](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) to learn more about
+the differences between these data types.
+
+### fp16
+
+The main advantage of mixed precision training comes from saving the activations in half precision (fp16).
+Although the gradients are also computed in half precision they are converted back to full precision for the optimization
+step so no memory is saved here.
+While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes.
+This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU).
+
+To enable mixed precision training, set the `fp16` flag to `True`:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)
+```
+
+If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example [further in this guide](#using-accelerate).
+
+### BF16
+
+If you have access to Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. While
+bf16 has lower precision than fp16, it has a much bigger dynamic range. In fp16 the biggest number you can have
+is `65504` and any number above that will result in an overflow. A bf16 number can be as large as `3.39e+38` (!) which
+is about the same as fp32 - because both have 8 bits used for the numerical range.
+
+You can enable BF16 in the 🤗 Trainer with:
+
+```python
+training_args = TrainingArguments(bf16=True, **default_args)
+```
+
+### TF32
+
+The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (an 8-bit exponent), but instead
+of 23 bits of precision it has only 10 bits (same as fp16) and uses only 19 bits in total. It's "magical" in the sense that
+you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput
+improvement. All you need to do is to add the following to your code:
+
+```python
+import torch
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+```
+
+CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series.
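+
+If you are unsure whether your GPU can take advantage of bf16 and tf32, a quick check along the following lines may help (this snippet is only illustrative and not part of 🤗 Transformers):
+
+```python
+import torch
+
+if torch.cuda.is_available():
+    major, _ = torch.cuda.get_device_capability()
+    print(f"Ampere or newer (bf16/tf32 capable): {major >= 8}")
+    print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")
+    print(f"tf32 matmuls currently allowed: {torch.backends.cuda.matmul.allow_tf32}")
+```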
+
+According to [NVIDIA research](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/), the
+majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32.
+If you're already using fp16 or bf16 mixed precision it may help with the throughput as well.
+
+You can enable this mode in the 🤗 Trainer:
+
+```python
+TrainingArguments(tf32=True, **default_args)
+```
+
+tf32 can't be accessed directly via `tensor.to(dtype=torch.tf32)` because it is an internal CUDA data type. You need `torch>=1.7` to use tf32 data types.
+
+For additional information on tf32 vs other precisions, please refer to the following benchmarks:
+[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and
+[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189).
+
+## Flash Attention 2
+
+You can speed up the training throughput by using the Flash Attention 2 integration in transformers. Check out the
+[Flash Attention 2 section](./perf_infer_gpu_one#Flash-Attention-2) of the single GPU guide to learn more about how to load a model with Flash Attention 2 modules.
+
+## Optimizer choice
+
+The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves
+good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory
+footprint of the order of the number of model parameters. To remedy this, you can use an alternative optimizer.
+For example, if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed, `adamw_apex_fused` will give you the
+fastest training experience among all supported AdamW optimizers.
+
+[`Trainer`] integrates a variety of optimizers that can be used out of the box: `adamw_hf`, `adamw_torch`, `adamw_torch_fused`,
+`adamw_apex_fused`, `adamw_anyprecision`, `adafactor`, or `adamw_bnb_8bit`. More optimizers can be plugged in via a third-party implementation.
+
+Let's take a closer look at two alternatives to the AdamW optimizer:
+1. `adafactor`, which is available in [`Trainer`]
+2. `adamw_bnb_8bit`, which is also available in [`Trainer`], but a third-party integration is provided below for demonstration.
+
+For comparison, for a 3B-parameter model, like `t5-3b`:
+* A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB)
+* Adafactor optimizer will need more than 12GB. It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra.
+* 8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized.
+
+### Adafactor
+
+Adafactor doesn't store rolling averages for each element in weight matrices. Instead, it keeps aggregated information
+(row- and column-wise sums of the rolling averages), significantly reducing its footprint. However, compared to Adam,
+Adafactor may have slower convergence in certain cases.
+
+You can switch to Adafactor by setting `optim="adafactor"` in [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)
+```
+
+Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training)
+you can see up to a 3x improvement in memory usage while maintaining the throughput! However, as mentioned before, the convergence of
+Adafactor can be worse than Adam's.
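+
+If you need more control over Adafactor's hyperparameters than the `optim="adafactor"` flag offers, you can also instantiate the optimizer yourself and pass it to [`Trainer`]. A sketch along these lines should work (it assumes the `model`, `training_args`, and dataset `ds` from the earlier examples):
+
+```py
+from transformers.optimization import Adafactor, AdafactorSchedule
+
+# replicate the settings recommended in the Adafactor paper: no external learning rate,
+# relative step sizes and parameter scaling handled internally by the optimizer
+optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+lr_scheduler = AdafactorSchedule(optimizer)
+
+trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(optimizer, lr_scheduler))
+```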
+
+### 8-bit Adam
+
+Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization
+means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the
+idea behind mixed precision training.
+
+To use `adamw_bnb_8bit`, you simply need to set `optim="adamw_bnb_8bit"` in [`TrainingArguments`]:
+
+```py
+training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bnb_8bit", **default_args)
+```
+
+However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated.
+
+First, follow the installation guide in the GitHub [repo](https://github.com/TimDettmers/bitsandbytes) to install the `bitsandbytes` library
+that implements the 8-bit Adam optimizer.
+
+Next, you need to initialize the optimizer. This involves two steps:
+* First, group the model's parameters into two groups - one where weight decay should be applied, and the other one where it should not. Usually, biases and layer norm parameters are not weight decayed.
+* Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer.
+
+```py
+import bitsandbytes as bnb
+from torch import nn
+from transformers.trainer_pt_utils import get_parameter_names
+
+training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
+
+decay_parameters = get_parameter_names(model, [nn.LayerNorm])
+decay_parameters = [name for name in decay_parameters if "bias" not in name]
+optimizer_grouped_parameters = [
+    {
+        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
+        "weight_decay": training_args.weight_decay,
+    },
+    {
+        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
+        "weight_decay": 0.0,
+    },
+]
+
+optimizer_kwargs = {
+    "betas": (training_args.adam_beta1, training_args.adam_beta2),
+    "eps": training_args.adam_epsilon,
+}
+optimizer_kwargs["lr"] = training_args.learning_rate
+adam_bnb_optim = bnb.optim.Adam8bit(optimizer_grouped_parameters, **optimizer_kwargs)
+```
+
+Finally, pass the custom optimizer as an argument to the `Trainer`:
+
+```py
+trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None))
+```
+
+Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training),
+you can expect to get about a 3x memory improvement and even slightly higher throughput than when using Adafactor.
+
+### multi_tensor
+
+PyTorch nightly builds introduced `torch.optim._multi_tensor`, which should significantly speed up the optimizers for situations
+with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub [issue](https://github.com/huggingface/transformers/issues/9965).
+
+## Data preloading
+
+One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it
+can handle. By default, everything happens in the main process, and it might not be able to read the data from disk fast
+enough, creating a bottleneck and leading to GPU under-utilization.
Configure the following arguments to reduce the bottleneck:
+
+- `DataLoader(pin_memory=True, ...)` - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.
+- `DataLoader(num_workers=4, ...)` - spawns several workers to preload data faster. During training, watch the GPU utilization stats; if it's far from 100%, experiment with increasing the number of workers. Of course, the problem could be elsewhere, so many workers won't necessarily lead to better performance.
+
+When using [`Trainer`], the corresponding [`TrainingArguments`] are: `dataloader_pin_memory` (`True` by default) and `dataloader_num_workers` (defaults to `0`).
+
+## DeepSpeed ZeRO
+
+DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate.
+It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale
+deep learning training.
+
+If your model fits onto a single GPU and you have enough space to fit a small batch size, you don't need to use DeepSpeed
+as it'll only slow things down. However, if the model doesn't fit onto a single GPU or you can't fit a small batch, you can
+leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. In this case, you need to separately
+[install the library](main_classes/deepspeed#installation), then follow one of the guides to create a configuration file
+and launch DeepSpeed:
+
+* For an in-depth guide on DeepSpeed integration with [`Trainer`], review [the corresponding documentation](main_classes/deepspeed), specifically the
+[section for a single GPU](main_classes/deepspeed#deployment-with-one-gpu). Some adjustments are required to use DeepSpeed in a notebook; please take a look at the [corresponding guide](main_classes/deepspeed#deployment-in-notebooks).
+* If you prefer to use 🤗 Accelerate, refer to the [🤗 Accelerate DeepSpeed guide](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed).
+
+## Using torch.compile
+
+PyTorch 2.0 introduced a new compile function that can optimize existing PyTorch code without requiring any modification
+other than adding a single line of code: `model = torch.compile(model)`.
+
+If using [`Trainer`], you only need to pass the `torch_compile` option in the [`TrainingArguments`]:
+
+```python
+training_args = TrainingArguments(torch_compile=True, **default_args)
+```
+
+`torch.compile` uses Python's frame evaluation API to automatically create a graph from existing PyTorch programs. After
+capturing the graph, different backends can be deployed to lower the graph to an optimized engine.
+You can find more details and benchmarks in the [PyTorch documentation](https://pytorch.org/get-started/pytorch-2.0/).
+
+`torch.compile` has a growing list of backends, which can be found by calling `torchdynamo.list_backends()`, each with its own optional dependencies.
+
+Choose which backend to use by specifying it via `torch_compile_backend` in the [`TrainingArguments`]. Some of the most commonly used backends are:
+
+**Debugging backends**:
+* `dynamo.optimize("eager")` - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues.
+* `dynamo.optimize("aot_eager")` - Uses AotAutograd with no compiler, i.e., just using PyTorch eager for the AotAutograd's extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups.
+
+**Training & inference backends**:
+* `dynamo.optimize("inductor")` - Uses the TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels. [Read more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747)
+* `dynamo.optimize("nvfuser")` - nvFuser with TorchScript. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593)
+* `dynamo.optimize("aot_nvfuser")` - nvFuser with AotAutograd. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593)
+* `dynamo.optimize("aot_cudagraphs")` - cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757)
+
+**Inference-only backends**:
+* `dynamo.optimize("ofi")` - Uses Torchscript optimize_for_inference. [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html)
+* `dynamo.optimize("fx2trt")` - Uses Nvidia TensorRT for inference optimizations. [Read more](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst)
+* `dynamo.optimize("onnxrt")` - Uses ONNXRT for inference on CPU/GPU. [Read more](https://onnxruntime.ai/)
+* `dynamo.optimize("ipex")` - Uses IPEX for inference on CPU. [Read more](https://github.com/intel/intel-extension-for-pytorch)
+
+For an example of using `torch.compile` with 🤗 Transformers, check out this [blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features](https://www.philschmid.de/getting-started-pytorch-2-0-transformers).
+
+## Using 🤗 Accelerate
+
+With [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) you can use the above methods while gaining full
+control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications.
+
+Suppose you have combined the methods in the [`TrainingArguments`] like so:
+
+```py
+training_args = TrainingArguments(
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,
+    gradient_checkpointing=True,
+    fp16=True,
+    **default_args,
+)
+```
+
+The full example training loop with 🤗 Accelerate is only a handful of lines of code long:
+
+```py
+from accelerate import Accelerator
+from torch.utils.data.dataloader import DataLoader
+
+dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size)
+
+if training_args.gradient_checkpointing:
+    model.gradient_checkpointing_enable()
+
+accelerator = Accelerator(fp16=training_args.fp16)
+model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)
+
+model.train()
+for step, batch in enumerate(dataloader, start=1):
+    loss = model(**batch).loss
+    loss = loss / training_args.gradient_accumulation_steps
+    accelerator.backward(loss)
+    if step % training_args.gradient_accumulation_steps == 0:
+        optimizer.step()
+        optimizer.zero_grad()
+```
+
+First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader).
+Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method.
+When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator)
+we can specify if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call.
+
+During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare)
+call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same [8-bit optimizer](#8-bit-adam) from the earlier example.
+
+Finally, we can add the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see
+how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation, and once we have
+accumulated enough steps we run the optimization.
+
+Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the
+benefit of more flexibility in the training loop. For full documentation of all features, have a look at the
+[Accelerate documentation](https://huggingface.co/docs/accelerate/index).
+
+## Efficient Software Prebuilds
+
+PyTorch's [pip and conda builds](https://pytorch.org/get-started/locally/#start-locally) come prebuilt with the CUDA toolkit,
+which is enough to run PyTorch, but is insufficient if you need to build CUDA extensions.
+
+At times, additional effort may be required to pre-build some components, for instance, if you're using libraries like `apex` that
+don't come pre-compiled. In other situations figuring out how to install the right CUDA toolkit system-wide can be complicated.
+To address these scenarios, PyTorch and NVIDIA release versions of the NGC docker container that already come with
+everything prebuilt. You just need to install your programs on it, and it will run out of the box.
+
+This approach is also useful if you want to tweak the PyTorch source and/or make a new customized build.
+To find the docker image version you want, start with the [PyTorch release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/)
+and choose one of the latest monthly releases. Go into the release's notes for the desired release, check that the environment's
+components match your needs (including NVIDIA Driver requirements!) and then at the very top of that document go
+to the corresponding NGC page. If for some reason you get lost, here is [the index of all PyTorch NGC images](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch).
+
+Next, follow the instructions to download and deploy the docker image.
+
+## Mixture of Experts
+
+Some recent papers reported a 4-5x training speedup and a faster inference by integrating
+Mixture of Experts (MoE) into Transformer models.
+
+Since it has been discovered that more parameters lead to better performance, this technique makes it possible to increase the
+number of parameters by an order of magnitude without increasing training costs.
+
+In this approach every other FFN layer is replaced with an MoE layer which consists of many experts, with a gated function
+that trains each expert in a balanced way depending on the input token's position in a sequence.
+
+![MoE Transformer 2x block](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png)
+
+(source: [GLAM](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html))
+
+You can find exhaustive details and comparison tables in the papers listed at the end of this section.
+
+The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude
+larger than its dense equivalent.
Various distillation approaches have been proposed to overcome the much higher memory requirements.
+
+There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or
+hundreds of experts, leading to a 5x smaller model and thus a moderate increase in training speed with only a moderate
+increase in memory requirements.
+
+Most related papers and implementations are built around TensorFlow/TPUs:
+
+- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668)
+- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
+- [GLaM: Generalist Language Model (GLaM)](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html)
+
+For PyTorch, DeepSpeed has built one as well: [DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale](https://arxiv.org/abs/2201.05596), [Mixture of Experts](https://www.deepspeed.ai/tutorials/mixture-of-experts/) - blog posts: [1](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/), [2](https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/) and specific deployment with large transformer-based natural language generation models: [blog post](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html), [Megatron-Deepspeed branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training).
+
+## Using PyTorch native attention and Flash Attention
+
+PyTorch 2.0 released a native [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA),
+which allows using fused GPU kernels such as [memory-efficient attention](https://arxiv.org/abs/2112.05682) and [flash attention](https://arxiv.org/abs/2205.14135).
+
+After installing the [`optimum`](https://github.com/huggingface/optimum) package, the relevant internal modules can be
+replaced to use PyTorch's native attention with:
+
+```python
+model = model.to_bettertransformer()
+```
+
+Once converted, train the model as usual.
+
+The PyTorch-native `scaled_dot_product_attention` operator can only dispatch to Flash Attention if no `attention_mask` is provided.
+
+By default, in training mode, the BetterTransformer integration **drops the mask support and can only be used for training that does not require a padding mask for batched training**. This is the case, for example, during masked language modeling or causal language modeling. BetterTransformer is not suited for fine-tuning models on tasks that require a padding mask.
+
+Check out this [blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about acceleration and memory-savings with SDPA.
\ No newline at end of file
diff --git a/docs/source/en/perf_train_gpu_one.mdx b/docs/source/en/perf_train_gpu_one.mdx deleted file mode 100644 index 07299b016f59..000000000000 --- a/docs/source/en/perf_train_gpu_one.mdx +++ /dev/null @@ -1,744 +0,0 @@ - - -# Efficient Training on a Single GPU - -This guide focuses on training large models efficiently on a single GPU. These approaches are still valid if you have access to a machine with multiple GPUs but you will also have access to additional methods outlined in the [multi-GPU section](perf_train_gpu_many).
- -In this section we have a look at a few tricks to reduce the memory footprint and speed up training for large models and how they are integrated in the [`Trainer`] and [🤗 Accelerate](https://huggingface.co/docs/accelerate/). Each method can improve speed or memory usage which is summarized in the table below: - -|Method|Speed|Memory| -|:-----|:----|:-----| -| Gradient accumulation | No | Yes | -| Gradient checkpointing | No| Yes | -| Mixed precision training | Yes | (No) | -| Batch size | Yes | Yes | -| Optimizer choice | Yes | Yes | -| DataLoader | Yes | No | -| DeepSpeed Zero | No | Yes | - -A bracket means that it might not be strictly the case but is usually either not a main concern or negligible. Before we start make sure you have installed the following libraries: - -```bash -pip install transformers datasets accelerate nvidia-ml-py3 -``` - -The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar with the `nvidia-smi` command in the terminal - this library allows to access the same information in Python directly. - -Then we create some dummy data. We create random token IDs between 100 and 30000 and binary labels for a classifier. In total we get 512 sequences each with length 512 and store them in a [`~datasets.Dataset`] with PyTorch format. - - -```py -import numpy as np -from datasets import Dataset - - -seq_len, dataset_size = 512, 512 -dummy_data = { - "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), - "labels": np.random.randint(0, 1, (dataset_size)), -} -ds = Dataset.from_dict(dummy_data) -ds.set_format("pt") -``` - -We want to print some summary statistics for the GPU utilization and the training run with the [`Trainer`]. We setup a two helper functions to do just that: - -```py -from pynvml import * - - -def print_gpu_utilization(): - nvmlInit() - handle = nvmlDeviceGetHandleByIndex(0) - info = nvmlDeviceGetMemoryInfo(handle) - print(f"GPU memory occupied: {info.used//1024**2} MB.") - - -def print_summary(result): - print(f"Time: {result.metrics['train_runtime']:.2f}") - print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") - print_gpu_utilization() -``` - -Let's verify that we start with a free GPU memory: - -```py ->>> print_gpu_utilization() -GPU memory occupied: 0 MB. -``` - -That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by the user. When a model is loaded to the GPU also the kernels are loaded which can take up 1-2GB of memory. To see how much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well. - -```py ->>> import torch - - ->>> torch.ones((1, 1)).to("cuda") ->>> print_gpu_utilization() -GPU memory occupied: 1343 MB. -``` - -We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how much space the model uses. - -## Load Model - -First, we load the `bert-large-uncased` model. We load the model weights directly to the GPU so that we can check how much space just weights use. - - -```py ->>> from transformers import AutoModelForSequenceClassification - - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda") ->>> print_gpu_utilization() -GPU memory occupied: 2631 MB. -``` - -We can see that the model weights alone take up 1.3 GB of the GPU memory. 
The exact number depends on the specific GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result as with `nvidia-smi` CLI: - - -```bash -nvidia-smi -``` - -```bash -Tue Jan 11 08:58:05 2022 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|===============================+======================+======================| -| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | -| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | -| | | N/A | -+-------------------------------+----------------------+----------------------+ - -+-----------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=============================================================================| -| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB | -+-----------------------------------------------------------------------------+ -``` - -We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. So now we can start training the model and see how the GPU memory consumption changes. First, we set up a few standard training arguments that we will use across all our experiments: - -```py -default_args = { - "output_dir": "tmp", - "evaluation_strategy": "steps", - "num_train_epochs": 1, - "log_level": "error", - "report_to": "none", -} -``` - - - - Note: In order to properly clear the memory after experiments we need restart the Python kernel between experiments. Run all steps above and then just one of the experiments below. - - - -## Vanilla Training - -As a first experiment we will use the [`Trainer`] and train the model without any further modifications and a batch size of 4: - -```py -from transformers import TrainingArguments, Trainer, logging - -logging.set_verbosity_error() - - -training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 57.82 -Samples/second: 8.86 -GPU memory occupied: 14949 MB. -``` - -We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size can often result in faster model convergence or better end performance. So ideally we want to tune the batch size to our model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. To understand a bit better why this is the case let's have look at a model's operations and memory needs. - -## Anatomy of Model's Operations - -Transformers architecture includes 3 main groups of operations grouped below by compute-intensity. - -1. **Tensor Contractions** - - Linear layers and components of Multi-Head Attention all do batched **matrix-matrix multiplications**. These operations are the most compute-intensive part of training a transformer. - -2. 
**Statistical Normalizations** - - Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more **reduction operations**, the result of which is then applied via a map. - -3. **Element-wise Operators** - - These are the remaining operators: **biases, dropout, activations, and residual connections**. These are the least compute-intensive operations. - -This knowledge can be helpful to know when analyzing performance bottlenecks. - -This summary is derived from [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://arxiv.org/abs/2007.00072) - - -## Anatomy of Model's Memory -We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there are many components during training that use GPU memory. The components on GPU memory are the following: -1. model weights -2. optimizer states -3. gradients -4. forward activations saved for gradient computation -5. temporary buffers -6. functionality-specific memory - -A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per model parameter for mixed precision inference, plus activation memory. - -Let's look at the details. - -**Model Weights:** - -- 4 bytes * number of parameters for fp32 training -- 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory) - -**Optimizer States:** - -- 8 bytes * number of parameters for normal AdamW (maintains 2 states) -- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) -- 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state) - -**Gradients** - -- 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32) - -**Forward Activations** - -- size depends on many factors, the key ones being sequence length, hidden size and batch size. - -There are the input and output that are being passed and returned by the forward and the backward functions and the forward activations saved for gradient computation. - -**Temporary Memory** - -Additionally there are all kinds of temporary variables which get released once the calculation is done, but in the moment these could require additional memory and could push to OOM. Therefore when coding it's crucial to think strategically about such temporary variables and sometimes to explicitly free those as soon as they are no longer needed. - -**Functionality-specific memory** - -Then your software could have special memory needs. For example, when generating text using beam search, the software needs to maintain multiple copies of inputs and outputs. - -**`forward` vs `backward` Execution Speed** - -For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). Activations are usually bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward (e.g. activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, and writes once, gradInput). 
- -So there are potentially a few places where we could save GPU memory or speed up operations. Let's start with a simple optimization: choosing the right batch size. - -## Batch sizes - -One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model. - -For example for fully connected layers (which correspond to GEMMs), NVIDIA provides recommendations for [input/output neuron counts]( -https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and [batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size). - -[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) define the multiplier based on the dtype and the hardware. For example, for fp16 a multiple of 8 is recommended, but on A100 it's 64! - -For parameters that are small, there is also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) to consider, this is where tiling happens and the right multiplier can have a significant speedup. - -## Gradient Accumulation - -The idea behind gradient accumulation is to instead of calculating the gradients for the whole batch at once to do it in smaller steps. The way we do that is to calculate the gradients iteratively in smaller batches by doing a forward and backward pass through the model and accumulating the gradients in the process. When enough gradients are accumulated we run the model's optimization step. This way we can easily increase the overall batch size to numbers that would never fit into the GPU's memory. In turn, however, the added forward and backward passes can slow down the training a bit. - -We can use gradient accumulation in the [`Trainer`] by simply adding the `gradient_accumulation_steps` argument to [`TrainingArguments`]. Let's see how it impacts the models memory footprint: - -```py -training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 66.03 -Samples/second: 7.75 -GPU memory occupied: 8681 MB. -``` - -We can see that the memory footprint was dramatically reduced at the cost of being only slightly slower than the vanilla run. Of course, this would change as you increase the number of accumulation steps. In general you would want to max out the GPU usage as much as possible. So in our case, the batch_size of 4 was already pretty close to the GPU's limit. If we wanted to train with a batch size of 64 we should not use `per_device_train_batch_size=1` and `gradient_accumulation_steps=64` but instead `per_device_train_batch_size=4` and `gradient_accumulation_steps=16` which has the same effective batch size while making better use of the available GPU resources. - -For more details see the benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537) -and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957). 
- -Next we have a look at another trick to save a little bit more GPU memory called gradient checkpointing. - -## Gradient Checkpointing - -Even when we set the batch size to 1 and use gradient accumulation we can still run out of memory when working with large models. In order to compute the gradients during the backward pass all activations from the forward pass are normally saved. This can create a big memory overhead. Alternatively, one could forget all activations during the forward pass and recompute them on demand during the backward pass. This would however add a significant computational overhead and slow down training. - -Gradient checkpointing strikes a compromise between the two approaches and saves strategically selected activations throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. See [this great article](https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9) explaining the ideas behind gradient checkpointing. - -To enable gradient checkpointing in the [`Trainer`] we only need to pass it as a flag to the [`TrainingArguments`]. Everything else is handled under the hood: - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 85.47 -Samples/second: 5.99 -GPU memory occupied: 6775 MB. -``` - -We can see that this saved some more memory but at the same time training became a bit slower. A general rule of thumb is that gradient checkpointing slows down training by about 20%. Let's have a look at another method with which we can regain some speed: mixed precision training. - - -## Floating Data Types - -The idea of mixed precision training is that not all variables need to be stored in full (32-bit) floating point precision. If we can reduce the precision the variables and their computations are faster. Here are the commonly used floating point data types choice of which impacts both memory usage and throughput: - -- fp32 (`float32`) -- fp16 (`float16`) -- bf16 (`bfloat16`) -- tf32 (CUDA internal data type) - -Here is a diagram that shows how these data types correlate to each other. - -![data types](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tf32-bf16-fp16-fp32.png) -(source: [NVIDIA Blog](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/)) - -While fp16 and fp32 have been around for quite some time, bf16 and tf32 are only available on the Ampere architecture GPUS and TPUs support bf16 as well. Let's start with the most commonly used method which is FP16 training/ - - -### FP16 Training - -The idea of mixed precision training is that not all variables need to be stored in full (32-bit) floating point precision. If we can reduce the precision the variales and their computations are faster. The main advantage comes from saving the activations in half (16-bit) precision. Although the gradients are also computed in half precision they are converted back to full precision for the optimization step so no memory is saved here. Since the model is present on the GPU in both 16-bit and 32-bit precision this can use more GPU memory (1.5x the original model is on the GPU), especially for small batch sizes. 
Since some computations are performed in full and some in half precision this approach is also called mixed precision training. Enabling mixed precision training is also just a matter of setting the `fp16` flag to `True`: - -```py -training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 27.46 -Samples/second: 18.64 -GPU memory occupied: 13939 MB. -``` - -We can see that this is almost twice as fast as the vanilla training. Let's add it to the mix of the previous methods: - - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - **default_args, -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 50.76 -Samples/second: 10.09 -GPU memory occupied: 7275 MB. -``` - -We can see that with these tweaks we use about half the GPU memory as at the beginning while also being slightly faster. - -### BF16 -If you have access to a Ampere or newer hardware you can use bf16 for your training and evaluation. While bf16 has a worse precision than fp16, it has a much much bigger dynamic range. Therefore, if in the past you were experiencing overflow issues while training the model, bf16 will prevent this from happening most of the time. Remember that in fp16 the biggest number you can have is `65535` and any number above that will overflow. A bf16 number can be as large as `3.39e+38` (!) which is about the same as fp32 - because both have 8-bits used for the numerical range. - -You can enable BF16 in the 🤗 Trainer with: - -```python -TrainingArguments(bf16=True) -``` - -### TF32 -The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (8-bits), but instead of 23 bits precision it has only 10 bits (same as fp16) and uses only 19 bits in total. - -It's magical in the sense that you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput improvement. All you need to do is to add this to your code: - -``` -import torch -torch.backends.cuda.matmul.allow_tf32 = True -``` - -When this is done CUDA will automatically switch to using tf32 instead of fp32 where it's possible. This, of course, assumes that the used GPU is from the Ampere series. - -Like all cases with reduced precision this may or may not be satisfactory for your needs, so you have to experiment and see. According to [NVIDIA research](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) the majority of machine learning training shouldn't be impacted and showed the same perplexity and convergence as the fp32 training. - -If you're already using fp16 or bf16 mixed precision it may help with the throughput as well. - -You can enable this mode in the 🤗 Trainer with: -```python -TrainingArguments(tf32=True) -``` -By default the PyTorch default is used. - -Note: tf32 mode is internal to CUDA and can't be accessed directly via `tensor.to(dtype=torch.tf32)` as `torch.tf32` doesn't exist. - -Note: you need `torch>=1.7` to enjoy this feature. 
- -You can also see a variety of benchmarks on tf32 vs other precisions: -[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and -[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189). - -We've now seen how we can change the floating types to increase throughput, but we are not done, yet! There is another area where we can save GPU memory: the optimizer. - -## Optimizer - -The most common optimizer used to train transformer model is Adam or AdamW (Adam with weight decay). Adam achieves good convergence by storing the rolling average of the previous gradients which, however, adds an additional memory footprint of the order of the number of model parameters. One remedy to this is to use an alternative optimizer such as Adafactor, which works well for some models but often it has instability issues. - -HF Trainer integrates a variety of optimisers that can be used out of box. To activate the desired optimizer simply pass the `--optim` flag to the command line. - -To see which optimizers are currently supported: - -```bash -$ python examples/pytorch/translation/run_translation.py -h | grep "\-optim" - [--optim {adamw_hf,adamw_torch,adamw_torch_xla,adamw_apex_fused,adafactor}] -``` - -For example, if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed `--optim adamw_apex_fused` will give you the fastest training experience among all supported AdamW optimizers. - -On the other hand [8bit BNB optimizer](https://github.com/TimDettmers/bitsandbytes) can save 3/4 of memory normally used by a typical AdamW optimizer if it is configured to quantize all optimizer states, but in some situations only some optimizer states are quintized and then more memory is used. - -Let's get a feel for the numbers and use for example use a 3B-parameter model, like `t5-3b`. Note that since a Gigabyte correpsonds to a billion bytes we can simply multiply the parameters (in billions) with the number of necessary bytes per parameter to get Gigabytes of GPU memory usage: - -- A standard AdamW uses 8 bytes for each parameter, here the optimizer will need (`8*3`) 24GB of GPU memory. -- Adafactor uses slightly more than 4 bytes, so (`4*3`) 12GB and then some extra. -- 8bit BNB quantized optimizer will use only (`2*3`) 6GB if all optimizer states are quantized. - -Let's have a look at Adafactor first. - -### Adafactor - -Instead of keeping the rolling average for each element in the weight matrices Adafactor only stores aggregated information (row- and column-wise sums of the rolling averages) which reduces the footprint considerably. One downside of Adafactor is that in some instances convergence can be slower than Adam's so some experimentation is advised here. We can use Adafactor simply by setting `optim="adafactor"`: - - -```py -training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 64.31 -Samples/second: 7.96 -GPU memory occupied: 12295 MB. -``` - -We can see that this saves a few more GB on the GPU. 
Let's see how it looks when we add it to the other methods we introduced earlier: - - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - optim="adafactor", - **default_args, -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 56.54 -Samples/second: 9.06 -GPU memory occupied: 4847 MB. -``` - -We went from 15 GB memory usage to 5 GB - a 3x improvement while maintaining the throughput! However, as mentioned before, the convergence of Adafactor can be worse than Adam. There is an alternative to Adafactor called 8-bit Adam that takes a slightly different approach. - -### 8-bit Adam - -Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the idea behind FP16 training where using variables with lower precision saves memory. - -In contrast to the previous approaches is this one not integrated into the [`Trainer`] as a simple flag. We need to install the 8-bit optimizer and then pass it as a custom optimizer to the [`Trainer`]. Follow the installation guide in the Github [repo](https://github.com/TimDettmers/bitsandbytes) to install the `bitsandbytes` library that implements the 8-bit Adam optimizer. - -Once installed, we just need to initialize the the optimizer. Although this looks like a considerable amount of work it actually just involves two steps: first we need to group the model's parameters into two groups where to one group we apply weight decay and to the other we don't. Usually, biases and layer norm parameters are not weight decayed. Then in a second step we just do some argument housekeeping to use the same parameters as the previously used AdamW optimizer. - - -Note that in order to use the 8-bit optimizer with an existing pretrained model a change to the embedding layer is needed. -Read [this issue](https://github.com/huggingface/transformers/issues/14819) for more information. - - -```py -import bitsandbytes as bnb -from torch import nn -from transformers.trainer_pt_utils import get_parameter_names - -training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) - -decay_parameters = get_parameter_names(model, [nn.LayerNorm]) -decay_parameters = [name for name in decay_parameters if "bias" not in name] -optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if n in decay_parameters], - "weight_decay": training_args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if n not in decay_parameters], - "weight_decay": 0.0, - }, -] - -optimizer_kwargs = { - "betas": (training_args.adam_beta1, training_args.adam_beta2), - "eps": training_args.adam_epsilon, -} -optimizer_kwargs["lr"] = training_args.learning_rate -adam_bnb_optim = bnb.optim.Adam8bit( - optimizer_grouped_parameters, - betas=(training_args.adam_beta1, training_args.adam_beta2), - eps=training_args.adam_epsilon, - lr=training_args.learning_rate, -) -``` - -We can now pass the custom optimizer as an argument to the `Trainer`: -```py -trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 55.95 -Samples/second: 9.15 -GPU memory occupied: 13085 MB. 
-``` - -We can see that we get a similar memory improvement as with Adafactor while keeping the full rolling average of the gradients. Let's repeat the experiment with the full settings: - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - **default_args, -) - -trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) -result = trainer.train() -print_summary(result) -``` - -``` -Time: 49.46 -Samples/second: 10.35 -GPU memory occupied: 5363 MB. -``` - -Again, we get about a 3x memory improvement and even slightly higher throughput as using Adafactor. So we have seen how we can optimize the memory footprint of large models. The following plot summarizes all our experiments: - -![png](https://huggingface.co/datasets/lvwerra/repo-images/raw/main/gpu-memory-savings.png) - -### `_multi_tensor` -pytorch-nightly introduced `torch.optim._multi_tensor` which should significantly speed up the optimizers for situations with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner and don't mind using the bleed-edge, see: https://github.com/huggingface/transformers/issues/9965 - - -## Using 🤗 Accelerate - -So far we have used the [`Trainer`] to run the experiments but a more flexible alternative to that approach is to use 🤗 Accelerate. With 🤗 Accelerate you have full control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications. In turn it allows you to easily scale across different infrastructures such as CPUs, GPUs, TPUs, or distributed multi-GPU setups without changing any code. Let's see what it takes to implement all of the above tweaks in 🤗 Accelerate. We can still use the [`TrainingArguments`] to wrap the training settings: - - -```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, - gradient_checkpointing=True, - fp16=True, - **default_args, -) -``` - -The full example training loop with 🤗 Accelerate is only a handful of lines of code long: - - -```py -from accelerate import Accelerator -from torch.utils.data.dataloader import DataLoader - -dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) - -if training_args.gradient_checkpointing: - model.gradient_checkpointing_enable() - -accelerator = Accelerator(fp16=training_args.fp16) -model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) - -model.train() -for step, batch in enumerate(dataloader, start=1): - loss = model(**batch).loss - loss = loss / training_args.gradient_accumulation_steps - accelerator.backward(loss) - if step % training_args.gradient_accumulation_steps == 0: - optimizer.step() - optimizer.zero_grad() -``` - -First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) we can specify if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. 
During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments. - -Finally, we can write the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation and once we have enough steps we run the optimization. Now the question is: does this use the same amount of memory as the previous steps? Let's check: - - -```py ->>> print_gpu_utilization() -GPU memory occupied: 5363 MB. -``` - -Indeed it does. Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the benefit of more flexiblity in the training loop. For a full documentation of all features have a look at the [Accelerate documentation](https://huggingface.co/docs/accelerate/index). - -## DataLoader - -One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it can handle. By default everything happens in the main process and it might not be able to read the data from disk fast enough, and thus create a bottleneck, leading to GPU under-utilization. - -- `DataLoader(pin_memory=True, ...)` which ensures that the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory. -- `DataLoader(num_workers=4, ...)` - spawn several workers to pre-load data faster - during training watch the GPU utilization stats and if it's far from 100% experiment with raising the number of workers. Of course, the problem could be elsewhere so a very big number of workers won't necessarily lead to a better performance. - -## DeepSpeed ZeRO - -The in-depth details on how to use Deepspeed can be found [here](main_classes/deepspeed). - -First, a quick decision tree: - -1. Model fits onto a single GPU and you have enough space to fit a small batch size - you don't need to use Deepspeed as it'll only slow things down in this use case. -2. Model doesn't fit onto a single GPU or you can't fit a small batch - use DeepSpeed ZeRO + CPU Offload and for much larger models NVMe Offload. - -Now if the decision tree suggested you use DeepSpeed first you need to [install it](main_classes/deepspeed#installation), then follow one of the following guides to create a configuration file and launch DeepSpeed. - -Activation: - -- HF Trainer-based examples: see this [guide](main_classes/deepspeed#deployment-with-one-gpu). -- Custom HF Trainer-based program: Same as above, but pass: - - ```python - TrainingArguments(deepspeed="/path/to/ds_config.json") - ``` -- Deployment in Notebooks: see this [guide](main_classes/deepspeed#deployment-in-notebooks). - -- Custom training loop: This is somewhat complex but you can study how this is implemented in [HF Trainer]( -https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) - simply search for `deepspeed` in the code. - - -## Choice of GPU -Sometimes, even when applying all the above tweaks the throughput on a given GPU might still not be good enough. One easy solution is to change the type of GPU. For example switching from let's say a K80 (which you typically get on Google Colab) to a fancier GPU such as the V100 or A100. 
Although they are more expensive, they are usually more cost-effective than cheaper GPUs due to their larger memory and faster architecture. - -Now, let's take a step back and discuss what we should optimize for when scaling the training of large models. - -## How to scale - -When we train models there are two aspects we want to optimize at the same time: - -- Data throughput/training time -- Model performance - -We have seen that each method changes the memory usage and throughput. In general, we want to maximize the throughput (samples/second) to minimize the training cost. This is generally achieved by utilizing the GPU as much as possible and thus filling GPU memory to its limit. For example, as mentioned earlier, we only employ gradient accumulation when we want to use a batch size beyond the size of the GPU memory. If the desired batch size fits into memory then there is no reason to apply gradient accumulation, which will only slow down training. - -The second objective is model performance. Just because we can use a large batch size does not mean we should. As part of hyperparameter tuning you should determine which batch size yields the best result and then optimize the throughput accordingly. - - -## Efficient Software Prebuilds - -PyTorch's [pip and conda builds](https://pytorch.org/get-started/locally/#start-locally) come prebuilt with the CUDA toolkit, which is enough to run PyTorch, but it is insufficient if you need to build CUDA extensions. - -At times it may take an additional effort to pre-build some components, e.g., if you're using libraries like `apex` that don't come pre-compiled. In other situations figuring out how to install the right CUDA toolkit system-wide can be complicated. To address these users' needs, PyTorch and NVIDIA release new versions of the NGC docker container which already comes with everything prebuilt: you just need to install your programs on it and it will run out of the box. - -This approach is also useful if you want to tweak the PyTorch source and/or make a new customized build. - -To find the docker image version you want, start [here](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/) and choose one of the latest monthly releases. Go into the release notes for the desired release, check that the environment's components match your needs (including NVIDIA Driver requirements!) and then at the very top of that document go to the corresponding NGC page. If for some reason you get lost, here is [the index of all PyTorch NGC images](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch). - -Next, follow the instructions to download and deploy the docker image. - -## Sparsity - -### Mixture of Experts - -Quite a few of the recent papers reported a 4-5x training speedup and faster inference by integrating -Mixture of Experts (MoE) into Transformer models. - -Since it has been discovered that more parameters lead to better performance, this technique allows increasing the number of parameters by an order of magnitude without increasing training costs. - -In this approach every other FFN layer is replaced with an MoE layer, which consists of many experts with a gated function that trains each expert in a balanced way depending on the input token's position in a sequence. A toy sketch of such a layer is shown below.
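To make the gating idea concrete, here is a toy sketch of an MoE layer with top-1 routing standing in for a dense FFN block. This is only an illustration, not the implementation used in the papers below; the sizes, the `ToyMoELayer` name and the routing details are assumptions:

```py
import torch
import torch.nn as nn


class ToyMoELayer(nn.Module):
    # A deliberately simple MoE layer: one gate, several expert FFNs, top-1 routing.
    def __init__(self, hidden_size=512, ffn_size=2048, num_experts=8):
        super().__init__()
        self.gate = nn.Linear(hidden_size, num_experts)  # scores each token for each expert
        self.experts = nn.ModuleList(
            [
                nn.Sequential(nn.Linear(hidden_size, ffn_size), nn.GELU(), nn.Linear(ffn_size, hidden_size))
                for _ in range(num_experts)
            ]
        )

    def forward(self, hidden_states):  # (batch, seq_len, hidden_size)
        gate_probs = self.gate(hidden_states).softmax(dim=-1)
        top1 = gate_probs.argmax(dim=-1)  # chosen expert per token
        output = torch.zeros_like(hidden_states)
        for i, expert in enumerate(self.experts):
            mask = top1 == i
            if mask.any():
                # Weight by the gate probability so the gate is trained (argmax itself is not differentiable).
                output[mask] = expert(hidden_states[mask]) * gate_probs[..., i][mask].unsqueeze(-1)
        return output
```

Real implementations (GShard, Switch Transformers, DeepSpeed-MoE) vectorize the routing, cap each expert's capacity and add an auxiliary load-balancing loss, but the overall structure - a gate plus a set of expert FFNs - is the same.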
- -![MoE Transformer 2x block](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png) - -(source: [GLAM](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html)) - -You can find exhaustive details and comparison tables in the papers listed at the end of this section. - -The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude larger than its dense equivalent. Various distillation and other approaches have been proposed to overcome the much higher memory requirements. - -There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or hundreds of experts, leading to a 5x smaller model, and thus increase the training speed moderately while increasing the memory requirements moderately as well. - -Most related papers and implementations are built around TensorFlow/TPUs: - -- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668) -- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) -- [GLaM: Generalist Language Model (GLaM)](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html) - -And for PyTorch, DeepSpeed has built one as well: [DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale](https://arxiv.org/abs/2201.05596), [Mixture of Experts](https://www.deepspeed.ai/tutorials/mixture-of-experts/) - blog posts: [1](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/), [2](https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/) and specific deployment with large transformer-based natural language generation models: [blog post](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html), [Megatron-Deepspeed branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training). - - -## Scaling beyond a single GPU - -For some applications, such as pretraining large language models, applying all the approaches above might still not be fast enough. In this case you want to scale your experiment to several GPUs. - -Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply, although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed, which implements some of these parallelism strategies along with some more optimizations to reduce the memory footprint, such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many). - -## Using torch.compile - -PyTorch 2.0 introduces a new compile function; you can learn more about it [in their documentation](https://pytorch.org/get-started/pytorch-2.0/). It uses Python’s frame evaluation API to automatically create a graph from existing PyTorch programs. After capturing the graph, different backends can be deployed to lower the graph to an optimized engine. You can choose one of the options below for a performance boost.
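Before looking at the individual backends, here is a minimal sketch of what using it looks like (assuming PyTorch 2.0 or later is installed; the checkpoint name below is just an example). Wrapping a model is a one-liner, and the default TorchInductor backend is used unless you pick another one:

```py
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# The first forward pass triggers graph capture and compilation; later calls reuse the optimized graph.
compiled_model = torch.compile(model)  # equivalent to choosing the "inductor" backend
```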
- -`torch.compile` has a growing list of backends, which can be found in [backends.py](https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py) -or `torchdynamo.list_backends()` each of which with its optional dependencies. - -Some of the most commonly used backends are - -**Debugging backends**: -* `dynamo.optimize("eager")` - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues. -* `dynamo.optimize("aot_eager")` - Uses AotAutograd with no compiler, i.e, just using PyTorch eager for the AotAutograd's extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups. - -**Training & inference backends**: -* `dynamo.optimize("inductor")` - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels [Read more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) -* `dynamo.optimize("nvfuser")` - nvFuser with TorchScript. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) -* `dynamo.optimize("aot_nvfuser")` - nvFuser with AotAutograd. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) -* `dynamo.optimize("aot_cudagraphs")` - cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757) - -**Inference-only backend**s: -* `dynamo.optimize("ofi")` - Uses Torchscript optimize_for_inference. [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html) -* `dynamo.optimize("fx2trt")` - Uses Nvidia TensorRT for inference optimizations. [Read more](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst) -* `dynamo.optimize("onnxrt")` - Uses ONNXRT for inference on CPU/GPU. [Read more](https://onnxruntime.ai/) -* `dynamo.optimize("ipex")` - Uses IPEX for inference on CPU. [Read more](https://github.com/intel/intel-extension-for-pytorch) diff --git a/docs/source/en/perf_train_special.md b/docs/source/en/perf_train_special.md new file mode 100644 index 000000000000..48727b24fef3 --- /dev/null +++ b/docs/source/en/perf_train_special.md @@ -0,0 +1,24 @@ + + +# Training on Specialized Hardware + + + + Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. + + + +This document will be completed soon with information on how to train on specialized hardware. diff --git a/docs/source/en/perf_train_special.mdx b/docs/source/en/perf_train_special.mdx deleted file mode 100644 index cb6b8d4090e2..000000000000 --- a/docs/source/en/perf_train_special.mdx +++ /dev/null @@ -1,20 +0,0 @@ - - -# Training on Specialized Hardware - - - - Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. - - - -This document will be completed soon with information on how to train on specialized hardware. 
diff --git a/docs/source/en/perf_train_tpu.md b/docs/source/en/perf_train_tpu.md new file mode 100644 index 000000000000..c7b344ad81e7 --- /dev/null +++ b/docs/source/en/perf_train_tpu.md @@ -0,0 +1,24 @@ + + +# Training on TPUs + + + + Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. + + + +This document will be completed soon with information on how to train on TPUs. diff --git a/docs/source/en/perf_train_tpu.mdx b/docs/source/en/perf_train_tpu.mdx deleted file mode 100644 index bc37e00877c2..000000000000 --- a/docs/source/en/perf_train_tpu.mdx +++ /dev/null @@ -1,20 +0,0 @@ - - -# Training on TPUs - - - - Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section. - - - -This document will be completed soon with information on how to train on TPUs. diff --git a/docs/source/en/perf_train_tpu_tf.md b/docs/source/en/perf_train_tpu_tf.md new file mode 100644 index 000000000000..011421b629c0 --- /dev/null +++ b/docs/source/en/perf_train_tpu_tf.md @@ -0,0 +1,162 @@ + + +# Training on TPU with TensorFlow + + + +If you don't need long explanations and just want TPU code samples to get started with, check out [our TPU example notebook!](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) + + + +### What is a TPU? + +A TPU is a **Tensor Processing Unit.** They are hardware designed by Google, which are used to greatly speed up the tensor computations within neural networks, much like GPUs. They can be used for both network training and inference. They are generally accessed through Google’s cloud services, but small TPUs can also be accessed directly for free through Google Colab and Kaggle Kernels. + +Because [all TensorFlow models in 🤗 Transformers are Keras models](https://huggingface.co/blog/tensorflow-philosophy), most of the methods in this document are generally applicable to TPU training for any Keras model! However, there are a few points that are specific to the HuggingFace ecosystem (hug-o-system?) of Transformers and Datasets, and we’ll make sure to flag them up when we get to them. + +### What kinds of TPU are available? + +New users are often very confused by the range of TPUs, and the different ways to access them. The first key distinction to understand is the difference between **TPU Nodes** and **TPU VMs.** + +When you use a **TPU Node**, you are effectively indirectly accessing a remote TPU. You will need a separate VM, which will initialize your network and data pipeline and then forward them to the remote node. When you use a TPU on Google Colab, you are accessing it in the **TPU Node** style. + +Using TPU Nodes can have some quite unexpected behaviour for people who aren’t used to them! In particular, because the TPU is located on a physically different system to the machine you’re running your Python code on, your data cannot be local to your machine - any data pipeline that loads from your machine’s internal storage will totally fail! 
Instead, data must be stored in Google Cloud Storage where your data pipeline can still access it, even when the pipeline is running on the remote TPU node. + + + +If you can fit all your data in memory as `np.ndarray` or `tf.Tensor`, then you can `fit()` on that data even when using Colab or a TPU Node, without needing to upload it to Google Cloud Storage. + + + + + +**🤗Specific Hugging Face Tip🤗:** The methods `Dataset.to_tf_dataset()` and its higher-level wrapper `model.prepare_tf_dataset()` , which you will see throughout our TF code examples, will both fail on a TPU Node. The reason for this is that even though they create a `tf.data.Dataset` it is not a “pure” `tf.data` pipeline and uses `tf.numpy_function` or `Dataset.from_generator()` to stream data from the underlying HuggingFace `Dataset`. This HuggingFace `Dataset` is backed by data that is on a local disc and which the remote TPU Node will not be able to read. + + + +The second way to access a TPU is via a **TPU VM.** When using a TPU VM, you connect directly to the machine that the TPU is attached to, much like training on a GPU VM. TPU VMs are generally easier to work with, particularly when it comes to your data pipeline. All of the above warnings do not apply to TPU VMs! + +This is an opinionated document, so here’s our opinion: **Avoid using TPU Node if possible.** It is more confusing and more difficult to debug than TPU VMs. It is also likely to be unsupported in future - Google’s latest TPU, TPUv4, can only be accessed as a TPU VM, which suggests that TPU Nodes are increasingly going to become a “legacy” access method. However, we understand that the only free TPU access is on Colab and Kaggle Kernels, which uses TPU Node - so we’ll try to explain how to handle it if you have to! Check the [TPU example notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) for code samples that explain this in more detail. + +### What sizes of TPU are available? + +A single TPU (a v2-8/v3-8/v4-8) runs 8 replicas. TPUs exist in **pods** that can run hundreds or thousands of replicas simultaneously. When you use more than a single TPU but less than a whole pod (for example, a v3-32), your TPU fleet is referred to as a **pod slice.** + +When you access a free TPU via Colab, you generally get a single v2-8 TPU. + +### I keep hearing about this XLA thing. What’s XLA, and how does it relate to TPUs? + +XLA is an optimizing compiler, used by both TensorFlow and JAX. In JAX it is the only compiler, whereas in TensorFlow it is optional (but mandatory on TPU!). The easiest way to enable it when training a Keras model is to pass the argument `jit_compile=True` to `model.compile()`. If you don’t get any errors and performance is good, that’s a great sign that you’re ready to move to TPU! + +Debugging on TPU is generally a bit harder than on CPU/GPU, so we recommend getting your code running on CPU/GPU with XLA first before trying it on TPU. You don’t have to train for long, of course - just for a few steps to make sure that your model and data pipeline are working like you expect them to. + + + +XLA compiled code is usually faster - so even if you’re not planning to run on TPU, adding `jit_compile=True` can improve your performance. Be sure to note the caveats below about XLA compatibility, though! 
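For reference, enabling XLA for a Keras model is a single argument to `compile()`. A minimal sketch (the checkpoint and learning rate below are placeholder assumptions, not recommendations):

```py
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

# jit_compile=True asks Keras to compile the training and prediction functions with XLA.
model.compile(optimizer=optimizer, jit_compile=True)
```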
+ + + + + +**Tip born of painful experience:** Although using `jit_compile=True` is a good way to get a speed boost and test if your CPU/GPU code is XLA-compatible, it can actually cause a lot of problems if you leave it in when actually training on TPU. XLA compilation will happen implicitly on TPU, so remember to remove that line before actually running your code on a TPU! + + + +### How do I make my model XLA compatible? + +In many cases, your code is probably XLA-compatible already! However, there are a few things that work in normal TensorFlow that don’t work in XLA. We’ve distilled them into three core rules below: + + + +**🤗Specific HuggingFace Tip🤗:** We’ve put a lot of effort into rewriting our TensorFlow models and loss functions to be XLA-compatible. Our models and loss functions generally obey rule #1 and #2 by default, so you can skip over them if you’re using `transformers` models. Don’t forget about these rules when writing your own models and loss functions, though! + + + +#### XLA Rule #1: Your code cannot have “data-dependent conditionals” + +What that means is that any `if` statement cannot depend on values inside a `tf.Tensor`. For example, this code block cannot be compiled with XLA! + +```python +if tf.reduce_sum(tensor) > 10: + tensor = tensor / 2.0 +``` + +This might seem very restrictive at first, but most neural net code doesn’t need to do this. You can often get around this restriction by using `tf.cond` (see the documentation [here](https://www.tensorflow.org/api_docs/python/tf/cond)) or by removing the conditional and finding a clever math trick with indicator variables instead, like so: + +```python +sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32) +tensor = tensor / (1.0 + sum_over_10) +``` + +This code has exactly the same effect as the code above, but by avoiding a conditional, we ensure it will compile with XLA without problems! + +#### XLA Rule #2: Your code cannot have “data-dependent shapes” + +What this means is that the shape of all of the `tf.Tensor` objects in your code cannot depend on their values. For example, the function `tf.unique` cannot be compiled with XLA, because it returns a `tensor` containing one instance of each unique value in the input. The shape of this output will obviously be different depending on how repetitive the input `Tensor` was, and so XLA refuses to handle it! + +In general, most neural network code obeys rule #2 by default. However, there are a few common cases where it becomes a problem. One very common one is when you use **label masking**, setting your labels to a negative value to indicate that those positions should be ignored when computing the loss. If you look at NumPy or PyTorch loss functions that support label masking, you will often see code like this that uses [boolean indexing](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing): + +```python +label_mask = labels >= 0 +masked_outputs = outputs[label_mask] +masked_labels = labels[label_mask] +loss = compute_loss(masked_outputs, masked_labels) +mean_loss = torch.mean(loss) +``` + +This code is totally fine in NumPy or PyTorch, but it breaks in XLA! Why? Because the shape of `masked_outputs` and `masked_labels` depends on how many positions are masked - that makes it a **data-dependent shape.** However, just like for rule #1, we can often rewrite this code to yield exactly the same output without any data-dependent shapes. 
+ +```python +label_mask = tf.cast(labels >= 0, tf.float32) +loss = compute_loss(outputs, labels) +loss = loss * label_mask # Set negative label positions to 0 +mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask) +``` + +Here, we avoid data-dependent shapes by computing the loss for every position, but zeroing out the masked positions in both the numerator and denominator when we calculate the mean, which yields exactly the same result as the first block while maintaining XLA compatibility. Note that we use the same trick as in rule #1 - converting a `tf.bool` to `tf.float32` and using it as an indicator variable. This is a really useful trick, so remember it if you need to convert your own code to XLA! + +#### XLA Rule #3: XLA will need to recompile your model for every different input shape it sees + +This is the big one. What this means is that if your input shapes are very variable, XLA will have to recompile your model over and over, which will create huge performance problems. This commonly arises in NLP models, where input texts have variable lengths after tokenization. In other modalities, static shapes are more common and this rule is much less of a problem. + +How can you get around rule #3? The key is **padding** - if you pad all your inputs to the same length, and then use an `attention_mask`, you can get the same results as you’d get from variable shapes, but without any XLA issues. However, excessive padding can cause severe slowdown too - if you pad all your samples to the maximum length in the whole dataset, you might end up with batches consisting endless padding tokens, which will waste a lot of compute and memory! + +There isn’t a perfect solution to this problem. However, you can try some tricks. One very useful trick is to **pad batches of samples up to a multiple of a number like 32 or 64 tokens.** This often only increases the number of tokens by a small amount, but it hugely reduces the number of unique input shapes, because every input shape now has to be a multiple of 32 or 64. Fewer unique input shapes means fewer XLA compilations! + + + +**🤗Specific HuggingFace Tip🤗:** Our tokenizers and data collators have methods that can help you here. You can use `padding="max_length"` or `padding="longest"` when calling tokenizers to get them to output padded data. Our tokenizers and data collators also have a `pad_to_multiple_of` argument that you can use to reduce the number of unique input shapes you see! + + + +### How do I actually train my model on TPU? + +Once your training is XLA-compatible and (if you’re using TPU Node / Colab) your dataset has been prepared appropriately, running on TPU is surprisingly easy! All you really need to change in your code is to add a few lines to initialize your TPU, and to ensure that your model and dataset are created inside a `TPUStrategy` scope. Take a look at [our TPU example notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) to see this in action! 
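As a rough sketch of those few lines (the checkpoint name is just an example and `make_tpu_dataset()` is a hypothetical helper that returns a TPU-compatible, fixed-shape `tf.data.Dataset`):

```py
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Locate and initialize the TPU. On a TPU VM you may need TPUClusterResolver(tpu="local").
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# Model (and optimizer) creation must happen inside the strategy scope.
with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model.compile(optimizer="adam")

train_dataset = make_tpu_dataset()  # hypothetical helper - remember the padding rules above
model.fit(train_dataset, epochs=3)
```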
+ +### Summary + +There was a lot in here, so let’s summarize with a quick checklist you can follow when you want to get your model ready for TPU training: + +- Make sure your code follows the three rules of XLA +- Compile your model with `jit_compile=True` on CPU/GPU and confirm that you can train it with XLA +- Either load your dataset into memory or use a TPU-compatible dataset loading approach (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) +- Migrate your code either to Colab (with accelerator set to “TPU”) or a TPU VM on Google Cloud +- Add TPU initializer code (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) +- Create your `TPUStrategy` and make sure dataset loading and model creation are inside the `strategy.scope()` (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) +- Don’t forget to take `jit_compile=True` out again when you move to TPU! +- 🙏🙏🙏🥺🥺🥺 +- Call model.fit() +- You did it! \ No newline at end of file diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md new file mode 100644 index 000000000000..a1661a6ba5a8 --- /dev/null +++ b/docs/source/en/performance.md @@ -0,0 +1,73 @@ + + +# Performance and Scalability + +Training large transformer models and deploying them to production present various challenges. +During training, the model may require more GPU memory than available or exhibit slow training speed. In the deployment +phase, the model can struggle to handle the required throughput in a production environment. + +This documentation aims to assist you in overcoming these challenges and finding the optimal setting for your use-case. +The guides are divided into training and inference sections, as each comes with different challenges and solutions. +Within each section you'll find separate guides for different hardware configurations, such as single GPU vs. multi-GPU +for training or CPU vs. GPU for inference. + +Use this document as your starting point to navigate further to the methods that match your scenario. + +## Training + +Training large transformer models efficiently requires an accelerator such as a GPU or TPU. The most common case is where +you have a single GPU. The methods that you can apply to improve training efficiency on a single GPU extend to other setups +such as multiple GPU. However, there are also techniques that are specific to multi-GPU or CPU training. We cover them in +separate sections. + +* [Methods and tools for efficient training on a single GPU](perf_train_gpu_one): start here to learn common approaches that can help optimize GPU memory utilization, speed up the training, or both. +* [Multi-GPU training section](perf_train_gpu_many): explore this section to learn about further optimization methods that apply to a multi-GPU settings, such as data, tensor, and pipeline parallelism. +* [CPU training section](perf_train_cpu): learn about mixed precision training on CPU. +* [Efficient Training on Multiple CPUs](perf_train_cpu_many): learn about distributed CPU training. +* [Training on TPU with TensorFlow](perf_train_tpu_tf): if you are new to TPUs, refer to this section for an opinionated introduction to training on TPUs and using XLA. +* [Custom hardware for training](perf_hardware): find tips and tricks when building your own deep learning rig. 
+* [Hyperparameter Search using Trainer API](hpo_train) + +## Inference + +Efficient inference with large models in a production environment can be as challenging as training them. In the following +sections we go through the steps to run inference on CPU and single/multi-GPU setups. + +* [Inference on a single CPU](perf_infer_cpu) +* [Inference on a single GPU](perf_infer_gpu_one) +* [Multi-GPU inference](perf_infer_gpu_many) +* [XLA Integration for TensorFlow Models](tf_xla) + + +## Training and inference + +Here you'll find techniques, tips and tricks that apply whether you are training a model, or running inference with it. + +* [Instantiating a big model](big_models) +* [Troubleshooting performance issues](debugging) + +## Contribute + +This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to +make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. + +When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the +source of that information (unless it comes directly from you). diff --git a/docs/source/en/performance.mdx b/docs/source/en/performance.mdx deleted file mode 100644 index 6c68e9b2acce..000000000000 --- a/docs/source/en/performance.mdx +++ /dev/null @@ -1,92 +0,0 @@ - - -# Performance and Scalability - -Training larger and larger transformer models and deploying them to production comes with a range of challenges. During training your model can require more GPU memory than is available or be very slow to train and when you deploy it for inference it can be overwhelmed with the throughput that is required in the production environment. This documentation is designed to help you navigate these challenges and find the best setting for your use-case. We split the guides into training and inference as they come with different challenges and solutions. Then within each of them we have separate guides for different kinds of hardware setting (e.g. single vs. multi-GPU for training or CPU vs. GPU for infrence). - -![perf_overview](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf_overview.png) - -This document serves as an overview and entry point for the methods that could be useful for your scenario. - -## Training - -Training transformer models efficiently requires an accelerator such as a GPU or TPU. The most common case is where you only have a single GPU, but there is also a section about multi-GPU and CPU training (with more coming soon). - - - - Note: Most of the strategies introduced in the single GPU sections (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training. - - - -### Single GPU - -Training large models on a single GPU can be challenging but there are a number of tools and methods that make it feasible. In this section methods such as mixed precision training, gradient accumulation and checkpointing, efficient optimizers, as well as strategies to determine the best batch size are discussed. - -[Go to single GPU training section](perf_train_gpu_one) - -### Multi-GPU - -In some cases training on a single GPU is still too slow or won't fit the large model. 
Moving to a multi-GPU setup is the logical step, but training on multiple GPUs at once comes with new decisions: does each GPU have a full copy of the model or is the model itself also distributed? In this section we look at data, tensor, and pipeline parallism. - -[Go to multi-GPU training section](perf_train_gpu_many) - -### CPU - - -[Go to CPU training section](perf_train_cpu) - - -### TPU - -[_Coming soon_](perf_train_tpu) - -### Specialized Hardware - -[_Coming soon_](perf_train_special) - -## Inference - -Efficient inference with large models in a production environment can be as challenging as training them. In the following sections we go through the steps to run inference on CPU and single/multi-GPU setups. - -### CPU - -[Go to CPU inference section](perf_infer_cpu) - -### Single GPU - -[Go to single GPU inference section](perf_infer_gpu_one) - -### Multi-GPU - -[Go to multi-GPU inference section](perf_infer_gpu_many) - -### Specialized Hardware - -[_Coming soon_](perf_infer_special) - -## Hardware - -In the hardware section you can find tips and tricks when building your own deep learning rig. - -[Go to hardware section](perf_hardware) - - -## Contribute - -This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. - -When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the source of that information (unless it comes directly from you). diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md new file mode 100644 index 000000000000..18abc0305b0e --- /dev/null +++ b/docs/source/en/perplexity.md @@ -0,0 +1,143 @@ + + +# Perplexity of fixed-length models + +[[open-in-colab]] + +Perplexity (PPL) is one of the most common metrics for evaluating language models. Before diving in, we should note +that the metric applies specifically to classical language models (sometimes called autoregressive or causal language +models) and is not well defined for masked language models like BERT (see [summary of the models](model_summary)). + +Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized +sequence \\(X = (x_0, x_1, \dots, x_t)\\), then the perplexity of \\(X\\) is, + +$$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{ + +When working with approximate models, however, we typically have a constraint on the number of tokens the model can +process. The largest version of [GPT-2](model_doc/gpt2), for example, has a fixed length of 1024 tokens, so we +cannot calculate \\(p_\theta(x_t|x_{ + +This is quick to compute since the perplexity of each segment can be computed in one forward pass, but serves as a poor +approximation of the fully-factorized perplexity and will typically yield a higher (worse) PPL because the model will +have less context at most of the prediction steps. + +Instead, the PPL of fixed-length models should be evaluated with a sliding-window strategy. This involves repeatedly +sliding the context window so that the model has more context when making each prediction. + +Sliding window PPL taking advantage of all available context + +This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more +favorable score. The downside is that it requires a separate forward pass for each token in the corpus. 
A good +practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by +1 token a time. This allows computation to proceed much faster while still giving the model a large context to make +predictions at each step. + +## Example: Calculating perplexity with GPT-2 in 🤗 Transformers + +Let's demonstrate this process with GPT-2. + +```python +from transformers import GPT2LMHeadModel, GPT2TokenizerFast + +device = "cuda" +model_id = "gpt2-large" +model = GPT2LMHeadModel.from_pretrained(model_id).to(device) +tokenizer = GPT2TokenizerFast.from_pretrained(model_id) +``` + +We'll load in the WikiText-2 dataset and evaluate the perplexity using a few different sliding-window strategies. Since +this dataset is small and we're just doing one forward pass over the set, we can just load and encode the entire +dataset in memory. + +```python +from datasets import load_dataset + +test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") +encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt") +``` + +With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative +log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in +the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating +as context to be included in our loss, so we can set these targets to `-100` so that they are ignored. The following +is an example of how we could do this with a stride of `512`. This means that the model will have at least 512 tokens +for context when calculating the conditional likelihood of any one token (provided there are 512 preceding tokens +available to condition on). + +```python +import torch +from tqdm import tqdm + +max_length = model.config.n_positions +stride = 512 +seq_len = encodings.input_ids.size(1) + +nlls = [] +prev_end_loc = 0 +for begin_loc in tqdm(range(0, seq_len, stride)): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device) + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model(input_ids, labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + +ppl = torch.exp(torch.stack(nlls).mean()) +``` + +Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window +strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction, +and the better the reported perplexity will typically be. + +When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same +as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window +strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is +closer to the true autoregressive decomposition of a sequence likelihood. 
diff --git a/docs/source/en/perplexity.mdx b/docs/source/en/perplexity.mdx deleted file mode 100644 index 01f861c99c5e..000000000000 --- a/docs/source/en/perplexity.mdx +++ /dev/null @@ -1,140 +0,0 @@ - - -# Perplexity of fixed-length models - -[[open-in-colab]] - -Perplexity (PPL) is one of the most common metrics for evaluating language models. Before diving in, we should note -that the metric applies specifically to classical language models (sometimes called autoregressive or causal language -models) and is not well defined for masked language models like BERT (see [summary of the models](model_summary)). - -Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized -sequence \\(X = (x_0, x_1, \dots, x_t)\\), then the perplexity of \\(X\\) is, - -$$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{ - -When working with approximate models, however, we typically have a constraint on the number of tokens the model can -process. The largest version of [GPT-2](model_doc/gpt2), for example, has a fixed length of 1024 tokens, so we -cannot calculate \\(p_\theta(x_t|x_{ - -This is quick to compute since the perplexity of each segment can be computed in one forward pass, but serves as a poor -approximation of the fully-factorized perplexity and will typically yield a higher (worse) PPL because the model will -have less context at most of the prediction steps. - -Instead, the PPL of fixed-length models should be evaluated with a sliding-window strategy. This involves repeatedly -sliding the context window so that the model has more context when making each prediction. - -Sliding window PPL taking advantage of all available context - -This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more -favorable score. The downside is that it requires a separate forward pass for each token in the corpus. A good -practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by -1 token a time. This allows computation to proceed much faster while still giving the model a large context to make -predictions at each step. - -## Example: Calculating perplexity with GPT-2 in 🤗 Transformers - -Let's demonstrate this process with GPT-2. - -```python -from transformers import GPT2LMHeadModel, GPT2TokenizerFast - -device = "cuda" -model_id = "gpt2-large" -model = GPT2LMHeadModel.from_pretrained(model_id).to(device) -tokenizer = GPT2TokenizerFast.from_pretrained(model_id) -``` - -We'll load in the WikiText-2 dataset and evaluate the perplexity using a few different sliding-window strategies. Since -this dataset is small and we're just doing one forward pass over the set, we can just load and encode the entire -dataset in memory. - -```python -from datasets import load_dataset - -test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") -encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt") -``` - -With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative -log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in -the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating -as context to be included in our loss, so we can set these targets to `-100` so that they are ignored. 
The following -is an example of how we could do this with a stride of `512`. This means that the model will have at least 512 tokens -for context when calculating the conditional likelihood of any one token (provided there are 512 preceding tokens -available to condition on). - -```python -import torch -from tqdm import tqdm - -max_length = model.config.n_positions -stride = 512 -seq_len = encodings.input_ids.size(1) - -nlls = [] -prev_end_loc = 0 -for begin_loc in tqdm(range(0, seq_len, stride)): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device) - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids, labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over input tokens. - # Multiply it with trg_len to get the summation instead of average. - # We will take average over all the tokens to get the true average - # in the last step of this example. - neg_log_likelihood = outputs.loss * trg_len - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - -ppl = torch.exp(torch.stack(nlls).sum() / end_loc) -``` - -Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window -strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction, -and the better the reported perplexity will typically be. - -When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.64`, which is about the same -as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window -strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is -closer to the true autoregressive decomposition of a sequence likelihood. diff --git a/docs/source/en/philosophy.md b/docs/source/en/philosophy.md new file mode 100644 index 000000000000..cad1e2ccdc8c --- /dev/null +++ b/docs/source/en/philosophy.md @@ -0,0 +1,79 @@ + + +# Philosophy + +🤗 Transformers is an opinionated library built for: + +- machine learning researchers and educators seeking to use, study or extend large-scale Transformers models. +- hands-on practitioners who want to fine-tune those models or serve them in production, or both. +- engineers who just want to download a pretrained model and use it to solve a given machine learning task. + +The library was designed with two strong goals in mind: + +1. Be as easy and fast to use as possible: + + - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, + just three standard classes required to use each model: [configuration](main_classes/configuration), + [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [image processor](main_classes/image_processor) for vision, [feature extractor](main_classes/feature_extractor) for audio, and [processor](main_classes/processors) for multimodal inputs). 
+ - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common + `from_pretrained()` method which downloads (if needed), caches and + loads the related class instance and associated data (configurations' hyperparameters, tokenizers' vocabulary, + and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint. + - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly + using a model for inference on a given task and [`Trainer`] to quickly train or fine-tune a PyTorch model (all TensorFlow models are compatible with `Keras.fit`). + - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to + extend or build upon the library, just use regular Python, PyTorch, TensorFlow, Keras modules and inherit from the base + classes of the library to reuse functionalities like model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post. + +2. Provide state-of-the-art models with performances as close as possible to the original models: + + - We provide at least one example for each architecture which reproduces a result provided by the official authors + of said architecture. + - The code is usually as close to the original code base as possible which means some PyTorch code may be not as + *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa. + +A few other goals: + +- Expose the models' internals as consistently as possible: + + - We give access, using a single API, to the full hidden-states and attention weights. + - The preprocessing classes and base model APIs are standardized to easily switch between models. + +- Incorporate a subjective selection of promising tools for fine-tuning and investigating these models: + + - A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. + - Simple ways to mask and prune Transformer heads. + +- Easily switch between PyTorch, TensorFlow 2.0 and Flax, allowing training with one framework and inference with another. + +## Main concepts + +The library is built around three types of classes for each model: + +- **Model classes** can be PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)), Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) or JAX/Flax models ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen.html)) that work with the pretrained weights provided in the library. +- **Configuration classes** store the hyperparameters required to build a model (such as the number of layers and hidden size). You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model). +- **Preprocessing classes** convert the raw data into a format accepted by the model. A [tokenizer](main_classes/tokenizer) stores the vocabulary for each model and provide methods for encoding and decoding strings in a list of token embedding indices to be fed to a model. 
[Image processors](main_classes/image_processor) preprocess vision inputs, [feature extractors](main_classes/feature_extractor) preprocess audio inputs, and a [processor](main_classes/processors) handles multimodal inputs. + +All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods: + +- `from_pretrained()` lets you instantiate a model, configuration, and preprocessing class from a pretrained version either + provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or + stored locally (or on a server) by the user. +- `save_pretrained()` lets you save a model, configuration, and preprocessing class locally so that it can be reloaded using + `from_pretrained()`. +- `push_to_hub()` lets you share a model, configuration, and a preprocessing class to the Hub, so it is easily accessible to everyone. + diff --git a/docs/source/en/philosophy.mdx b/docs/source/en/philosophy.mdx deleted file mode 100644 index 7788d7836236..000000000000 --- a/docs/source/en/philosophy.mdx +++ /dev/null @@ -1,75 +0,0 @@ - - -# Philosophy - -🤗 Transformers is an opinionated library built for: - -- machine learning researchers and educators seeking to use, study or extend large-scale Transformers models. -- hands-on practitioners who want to fine-tune those models or serve them in production, or both. -- engineers who just want to download a pretrained model and use it to solve a given machine learning task. - -The library was designed with two strong goals in mind: - -1. Be as easy and fast to use as possible: - - - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, - just three standard classes required to use each model: [configuration](main_classes/configuration), - [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [image processor](main_classes/image_processor) for vision, [feature extractor](main_classes/feature_extractor) for audio, and [processor](main_classes/processors) for multimodal inputs). - - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common - `from_pretrained()` method which downloads (if needed), caches and - loads the related class instance and associated data (configurations' hyperparameters, tokenizers' vocabulary, - and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint. - - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly - using a model for inference on a given task and [`Trainer`] to quickly train or fine-tune a PyTorch model (all TensorFlow models are compatible with `Keras.fit`). - - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to - extend or build upon the library, just use regular Python, PyTorch, TensorFlow, Keras modules and inherit from the base - classes of the library to reuse functionalities like model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post. - -2. 
Provide state-of-the-art models with performances as close as possible to the original models: - - - We provide at least one example for each architecture which reproduces a result provided by the official authors - of said architecture. - - The code is usually as close to the original code base as possible which means some PyTorch code may be not as - *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa. - -A few other goals: - -- Expose the models' internals as consistently as possible: - - - We give access, using a single API, to the full hidden-states and attention weights. - - The preprocessing classes and base model APIs are standardized to easily switch between models. - -- Incorporate a subjective selection of promising tools for fine-tuning and investigating these models: - - - A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. - - Simple ways to mask and prune Transformer heads. - -- Easily switch between PyTorch, TensorFlow 2.0 and Flax, allowing training with one framework and inference with another. - -## Main concepts - -The library is built around three types of classes for each model: - -- **Model classes** can be PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)), Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) or JAX/Flax models ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen.html)) that work with the pretrained weights provided in the library. -- **Configuration classes** store the hyperparameters required to build a model (such as the number of layers and hidden size). You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model). -- **Preprocessing classes** convert the raw data into a format accepted by the model. A [tokenizer](main_classes/tokenizer) stores the vocabulary for each model and provide methods for encoding and decoding strings in a list of token embedding indices to be fed to a model. [Image processors](main_classes/image_processor) preprocess vision inputs, [feature extractors](main_classes/feature_extractor) preprocess audio inputs, and a [processor](main_classes/processors) handles multimodal inputs. - -All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods: - -- `from_pretrained()` lets you instantiate a model, configuration, and preprocessing class from a pretrained version either - provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or - stored locally (or on a server) by the user. -- `save_pretrained()` lets you save a model, configuration, and preprocessing class locally so that it can be reloaded using - `from_pretrained()`. -- `push_to_hub()` lets you share a model, configuration, and a preprocessing class to the Hub, so it is easily accessible to everyone. 
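As a short sketch of these three methods in practice (the checkpoint and repository names are placeholder assumptions):

```py
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# from_pretrained(): download (or load from the local cache) a pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# save_pretrained(): write the configuration, weights and tokenizer files to a local folder
model.save_pretrained("./my-finetuned-model")
tokenizer.save_pretrained("./my-finetuned-model")
model = AutoModelForSequenceClassification.from_pretrained("./my-finetuned-model")  # reload later

# push_to_hub(): share them on the Hub (requires being logged in, e.g. via `huggingface-cli login`)
model.push_to_hub("my-username/my-finetuned-model")
tokenizer.push_to_hub("my-username/my-finetuned-model")
```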
- diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md new file mode 100644 index 000000000000..460fc17274a8 --- /dev/null +++ b/docs/source/en/pipeline_tutorial.md @@ -0,0 +1,317 @@ + + +# Pipelines for inference + +The [`pipeline`] makes it simple to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to: + +* Use a [`pipeline`] for inference. +* Use a specific tokenizer or model. +* Use a [`pipeline`] for audio, vision, and multimodal tasks. + + + +Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters. + + + +## Pipeline usage + +While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains +all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable +of inference for your task. Let's take the example of using the [`pipeline`] for automatic speech recognition (ASR), or +speech-to-text. + + +1. Start by creating a [`pipeline`] and specify the inference task: + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline(task="automatic-speech-recognition") +``` + +2. Pass your input to the [`pipeline`]. In the case of speech recognition, this is an audio input file: + +```py +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'} +``` + +Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) +on the Hub to see if you can get a better transcription. + +Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large) model from OpenAI. Whisper was released +2 years later than Wav2Vec2, and was trained on close to 10x more data. As such, it beats Wav2Vec2 on most downstream +benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with +Wav2Vec2. + +Let's give it a try here to see how it performs: + +```py +>>> transcriber = pipeline(model="openai/whisper-large-v2") +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} +``` + +Now this result looks more accurate! For a deep-dive comparison on Wav2Vec2 vs Whisper, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/asr_models). +We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more. +You can check out and compare model results directly from your browser on the Hub to see if it fits or +handles corner cases better than other ones. +And if you don't find a model for your use case, you can always start [training](training) your own! 
+ +If you have several inputs, you can pass your input as a list: + +```py +transcriber( + [ + "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", + "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", + ] +) +``` + +Pipelines are great for experimentation as switching from one model to another is trivial; however, there are some ways to optimize them for larger workloads than experimentation. See the following guides that dive into iterating over whole datasets or using pipelines in a webserver: +* [Using pipelines on a dataset](#using-pipelines-on-a-dataset) +* [Using pipelines for a webserver](./pipeline_webserver) + +## Parameters + +[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines. +In general, you can specify parameters anywhere you want: + +```py +transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1) + +out = transcriber(...) # This will use `my_parameter=1`. +out = transcriber(..., my_parameter=2) # This will override and use `my_parameter=2`. +out = transcriber(...) # This will go back to using `my_parameter=1`. +``` + +Let's check out 3 important ones: + +### Device + +If you use `device=n`, the pipeline automatically puts the model on the specified device. +This will work regardless of whether you are using PyTorch or TensorFlow. + +```py +transcriber = pipeline(model="openai/whisper-large-v2", device=0) +``` + +If the model is too large for a single GPU and you are using PyTorch, you can set `device_map="auto"` to automatically +determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate) +package: + +```bash +pip install --upgrade accelerate +``` + +The following code automatically loads and stores model weights across devices: + +```py +transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto") +``` + +Note that if `device_map="auto"` is passed, there is no need to add a `device` argument when instantiating your `pipeline`; otherwise you may encounter unexpected behavior! + +### Batch size + +By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite a bit slower in some cases. + +But if it works in your use case, you can use: + +```py +transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2) +audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)] +texts = transcriber(audio_filenames) +``` + +This runs the pipeline on the 4 provided audio files, but it will pass them in batches of 2 +to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you. +The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline. + +Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. The pipeline performs this [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) for you.
+ +### Task specific parameters + +All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done. +For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter which sounds promising for subtitling videos: + + +```py +>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True) +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]} +``` + +As you can see, the model inferred the text and also outputted **when** the various sentences were pronounced. + +There are many parameters available for each task, so check out each task's API reference to see what you can tinker with! +For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful +for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically +cannot handle on its own: + +```python +>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True) +>>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav") +{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening... +``` + +If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)! + + +## Using pipelines on a dataset + +The pipeline can also run inference on a large dataset. The easiest way we recommend doing this is by using an iterator: + +```py +def data(): + for i in range(1000): + yield f"My example {i}" + + +pipe = pipeline(model="gpt2", device=0) +generated_characters = 0 +for out in pipe(data()): + generated_characters += len(out[0]["generated_text"]) +``` + +The iterator `data()` yields each result, and the pipeline automatically +recognizes the input is iterable and will start fetching the data while +it continues to process it on the GPU (this uses [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) under the hood). +This is important because you don't have to allocate memory for the whole dataset +and you can feed the GPU as fast as possible. + +Since batching could speed things up, it may be useful to try tuning the `batch_size` parameter here. + +The simplest way to iterate over a dataset is to just load one from 🤗 [Datasets](https://github.com/huggingface/datasets/): + +```py +# KeyDataset is a util that will just output the item we're interested in. 
+from transformers.pipelines.pt_utils import KeyDataset +from datasets import load_dataset + +pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) +dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") + +for out in pipe(KeyDataset(dataset, "audio")): + print(out) +``` + + +## Using pipelines for a webserver + + +Creating an inference engine is a complex topic which deserves it's own +page. + + +[Link](./pipeline_webserver) + +## Vision pipeline + +Using a [`pipeline`] for vision tasks is practically identical. + +Specify your task and pass your image to the classifier. The image can be a link, a local path or a base64-encoded image. For example, what species of cat is shown below? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(model="google/vit-base-patch16-224") +>>> preds = vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +``` + +## Text pipeline + +Using a [`pipeline`] for NLP tasks is practically identical. + +```py +>>> from transformers import pipeline + +>>> # This model is a `zero-shot-classification` model. +>>> # It will classify text, except you are free to choose any label you might imagine +>>> classifier = pipeline(model="facebook/bart-large-mnli") +>>> classifier( +... "I have a problem with my iphone that needs to be resolved asap!!", +... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], +... ) +{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} +``` + +## Multimodal pipeline + +The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image. + +For example, if you use this [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png): + +```py +>>> from transformers import pipeline + +>>> vqa = pipeline(model="impira/layoutlm-document-qa") +>>> vqa( +... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", +... question="What is the invoice number?", +... ) +[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}] +``` + + + +To run the example above you need to have [`pytesseract`](https://pypi.org/project/pytesseract/) installed in addition to 🤗 Transformers: + +```bash +sudo apt install -y tesseract-ocr +pip install pytesseract +``` + + + +## Using `pipeline` on large models with 🤗 `accelerate`: + +You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. 
+ +First load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example. + +```py +# pip install accelerate +import torch +from transformers import pipeline + +pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto") +output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +``` + +You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True` + +```py +# pip install accelerate bitsandbytes +import torch +from transformers import pipeline + +pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True}) +output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +``` + +Note that you can replace the checkpoint with any of the Hugging Face model that supports large model loading such as BLOOM! diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx deleted file mode 100644 index 4be43484e02a..000000000000 --- a/docs/source/en/pipeline_tutorial.mdx +++ /dev/null @@ -1,248 +0,0 @@ - - -# Pipelines for inference - -The [`pipeline`] makes it simple to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to: - -* Use a [`pipeline`] for inference. -* Use a specific tokenizer or model. -* Use a [`pipeline`] for audio, vision, and multimodal tasks. - - - -Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters. - - - -## Pipeline usage - -While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable of inference for your task. - -1. Start by creating a [`pipeline`] and specify an inference task: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="automatic-speech-recognition") -``` - -2. Pass your input text to the [`pipeline`]: - -```py ->>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'} -``` - -Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) on the Hub to see if you can get a better transcription. -Let's try [openai/whisper-large](https://huggingface.co/openai/whisper-large): - -```py ->>> generator = pipeline(model="openai/whisper-large") ->>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} -``` - -Now this result looks more accurate! -We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more. -You can check out and compare model results directly from your browser on the Hub to see if it fits or -handles corner cases better than other ones. -And if you don't find a model for your use case, you can always start [training](training) your own! 
- -If you have several inputs, you can pass your input as a list: - -```py -generator( - [ - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", - ] -) -``` - -If you want to iterate over a whole dataset, or want to use it for inference in a webserver, check out dedicated parts - -[Using pipelines on a dataset](#using-pipelines-on-a-dataset) - -[Using pipelines for a webserver](./pipeline_webserver) - -## Parameters - -[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines. -In general you can specify parameters anywhere you want: - -```py -generator(model="openai/whisper-large", my_parameter=1) -out = generate(...) # This will use `my_parameter=1`. -out = generate(..., my_parameter=2) # This will override and use `my_parameter=2`. -out = generate(...) # This will go back to using `my_parameter=1`. -``` - -Let's check out 3 important ones: - -### Device - -If you use `device=n`, the pipeline automatically puts the model on the specified device. -This will work regardless of whether you are using PyTorch or Tensorflow. - -```py -generator(model="openai/whisper-large", device=0) -``` - -If the model is too large for a single GPU, you can set `device_map="auto"` to allow 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to automatically determine how to load and store the model weights. - -```py -#!pip install accelerate -generator(model="openai/whisper-large", device_map="auto") -``` - -### Batch size - -By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases. - -But if it works in your use case, you can use: - -```py -generator(model="openai/whisper-large", device=0, batch_size=2) -audio_filenames = [f"audio_{i}.flac" for i in range(10)] -texts = generator(audio_filenames) -``` - -This runs the pipeline on the 10 provided audio files, but it will pass them in batches of 2 -to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you. -The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline. - -Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. The pipeline performs this [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) for you. - -### Task specific parameters - -All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done. -For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter which sounds promising for subtitling videos: - - -```py ->>> # Not using whisper, as it cannot provide timestamps. 
->>> generator = pipeline(model="facebook/wav2vec2-large-960h-lv60-self", return_timestamps="word") ->>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP AND LIVE OUT THE TRUE MEANING OF ITS CREED', 'chunks': [{'text': 'I', 'timestamp': (1.22, 1.24)}, {'text': 'HAVE', 'timestamp': (1.42, 1.58)}, {'text': 'A', 'timestamp': (1.66, 1.68)}, {'text': 'DREAM', 'timestamp': (1.76, 2.14)}, {'text': 'BUT', 'timestamp': (3.68, 3.8)}, {'text': 'ONE', 'timestamp': (3.94, 4.06)}, {'text': 'DAY', 'timestamp': (4.16, 4.3)}, {'text': 'THIS', 'timestamp': (6.36, 6.54)}, {'text': 'NATION', 'timestamp': (6.68, 7.1)}, {'text': 'WILL', 'timestamp': (7.32, 7.56)}, {'text': 'RISE', 'timestamp': (7.8, 8.26)}, {'text': 'UP', 'timestamp': (8.38, 8.48)}, {'text': 'AND', 'timestamp': (10.08, 10.18)}, {'text': 'LIVE', 'timestamp': (10.26, 10.48)}, {'text': 'OUT', 'timestamp': (10.58, 10.7)}, {'text': 'THE', 'timestamp': (10.82, 10.9)}, {'text': 'TRUE', 'timestamp': (10.98, 11.18)}, {'text': 'MEANING', 'timestamp': (11.26, 11.58)}, {'text': 'OF', 'timestamp': (11.66, 11.7)}, {'text': 'ITS', 'timestamp': (11.76, 11.88)}, {'text': 'CREED', 'timestamp': (12.0, 12.38)}]} -``` - -As you can see, the model inferred the text and also outputted **when** the various words were pronounced -in the sentence. - -There are many parameters available for each task, so check out each task's API reference to see what you can tinker with! -For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically cannot handle on its own. - - -If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)! - - -## Using pipelines on a dataset - -The pipeline can also run inference on a large dataset. The easiest way we recommend doing this is by using an iterator: - -```py -def data(): - for i in range(1000): - yield f"My example {i}" - - -pipe = pipe(model="gpt2", device=0) -generated_characters = 0 -for out in pipe(data()): - generated_characters += len(out["generated_text"]) -``` - -The iterator `data()` yields each result, and the pipeline automatically -recognizes the input is iterable and will start fetching the data while -it continues to process it on the GPU (this uses [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) under the hood). -This is important because you don't have to allocate memory for the whole dataset -and you can feed the GPU as fast as possible. - -Since batching could speed things up, it may be useful to try tuning the `batch_size` parameter here. - -The simplest way to iterate over a dataset is to just load one from 🤗 [Datasets](https://github.com/huggingface/datasets/): - -```py -# KeyDataset is a util that will just output the item we're interested in. -from transformers.pipelines.pt_utils import KeyDataset - -pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) -dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") - -for out in pipe(KeyDataset(dataset["audio"])): - print(out) -``` - - -## Using pipelines for a webserver - - -Creating an inference engine is a complex topic which deserves it's own -page. 
- - -[Link](./pipeline_webserver) - -## Vision pipeline - -Using a [`pipeline`] for vision tasks is practically identical. - -Specify your task and pass your image to the classifier. The image can be a link or a local path to the image. For example, what species of cat is shown below? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(model="google/vit-base-patch16-224") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] -``` - -## Text pipeline - -Using a [`pipeline`] for NLP tasks is practically identical. - -```py ->>> from transformers import pipeline - ->>> # This model is a `zero-shot-classification` model. ->>> # It will classify text, except you are free to choose any label you might imagine ->>> classifier = pipeline(model="facebook/bart-large-mnli") ->>> classifier( -... "I have a problem with my iphone that needs to be resolved asap!!", -... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], -... ) -{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} -``` - -## Multimodal pipeline - -The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image. - -For example, if you use this [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png): - -```py ->>> from transformers import pipeline - ->>> vqa = pipeline(model="impira/layoutlm-document-qa") ->>> vqa( -... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", -... question="What is the invoice number?", -... ) -[{'score': 0.42514941096305847, 'answer': 'us-001', 'start': 16, 'end': 16}] -``` diff --git a/docs/source/en/pipeline_webserver.md b/docs/source/en/pipeline_webserver.md new file mode 100644 index 000000000000..38ef28d498c6 --- /dev/null +++ b/docs/source/en/pipeline_webserver.md @@ -0,0 +1,168 @@ + + +# Using pipelines for a webserver + + +Creating an inference engine is a complex topic, and the "best" solution +will most likely depend on your problem space. Are you on CPU or GPU? Do +you want the lowest latency, the highest throughput, support for +many models, or just highly optimize 1 specific model? +There are many ways to tackle this topic, so what we are going to present is a good default +to get started which may not necessarily be the most optimal solution for you. 
+ + + +The key thing to understand is that we can use an iterator, just like you would [on a +dataset](pipeline_tutorial#using-pipelines-on-a-dataset), since a webserver is basically a system that waits for requests and +treats them as they come in. + +Usually webservers are multiplexed (multithreaded, async, etc..) to handle various +requests concurrently. Pipelines on the other hand (and mostly the underlying models) +are not really great for parallelism; they take up a lot of RAM, so it's best to give them all the available resources when they are running or it's a compute-intensive job. + +We are going to solve that by having the webserver handle the light load of receiving +and sending requests, and having a single thread handling the actual work. +This example is going to use `starlette`. The actual framework is not really +important, but you might have to tune or change the code if you are using another +one to achieve the same effect. + +Create `server.py`: + +```py +from starlette.applications import Starlette +from starlette.responses import JSONResponse +from starlette.routing import Route +from transformers import pipeline +import asyncio + + +async def homepage(request): + payload = await request.body() + string = payload.decode("utf-8") + response_q = asyncio.Queue() + await request.app.model_queue.put((string, response_q)) + output = await response_q.get() + return JSONResponse(output) + + +async def server_loop(q): + pipe = pipeline(model="bert-base-uncased") + while True: + (string, response_q) = await q.get() + out = pipe(string) + await response_q.put(out) + + +app = Starlette( + routes=[ + Route("/", homepage, methods=["POST"]), + ], +) + + +@app.on_event("startup") +async def startup_event(): + q = asyncio.Queue() + app.model_queue = q + asyncio.create_task(server_loop(q)) +``` + +Now you can start it with: +```bash +uvicorn server:app +``` + +And you can query it: +```bash +curl -X POST -d "test [MASK]" http://localhost:8000/ +#[{"score":0.7742936015129089,"token":1012,"token_str":".","sequence":"test."},...] +``` + +And there you go, now you have a good idea of how to create a webserver! + +What is really important is that we load the model only **once**, so there are no copies +of the model on the webserver. This way, no unnecessary RAM is being used. +Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few +items before inferring to use dynamic batching: + + + +The code sample below is intentionally written like pseudo-code for readability. +Do not run this without checking if it makes sense for your system resources! + + + +```py +(string, rq) = await q.get() +strings = [] +queues = [] +while True: + try: + (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms + except asyncio.exceptions.TimeoutError: + break + strings.append(string) + queues.append(rq) +strings +outs = pipe(strings, batch_size=len(strings)) +for rq, out in zip(queues, outs): + await rq.put(out) +``` + +Again, the proposed code is optimized for readability, not for being the best code. +First of all, there's no batch size limit which is usually not a +great idea. Next, the timeout is reset on every queue fetch, meaning you could +wait much more than 1ms before running the inference (delaying the first request +by that much). + +It would be better to have a single 1ms deadline. + +This will always wait for 1ms even if the queue is empty, which might not be the +best since you probably want to start doing inference if there's nothing in the queue. 
+But maybe it does make sense if batching is really crucial for your use case. +Again, there's really no one best solution. + + +## Few things you might want to consider + +### Error checking + +There's a lot that can go wrong in production: out of memory, out of space, +loading the model might fail, the query might be wrong, the query might be +correct but still fail to run because of a model misconfiguration, and so on. + +Generally, it's good if the server outputs the errors to the user, so +adding a lot of `try..except` statements to show those errors is a good +idea. But keep in mind it may also be a security risk to reveal all those errors depending +on your security context. + +### Circuit breaking + +Webservers usually look better when they do circuit breaking. It means they +return proper errors when they're overloaded instead of just waiting for the query indefinitely. Return a 503 error instead of waiting for a super long time or a 504 after a long time. + +This is relatively easy to implement in the proposed code since there is a single queue. +Looking at the queue size is a basic way to start returning errors before your +webserver fails under load. + +### Blocking the main thread + +Currently PyTorch is not async aware, and computation will block the main +thread while running. That means it would be better if PyTorch was forced to run +on its own thread/process. This wasn't done here because the code is a lot more +complex (mostly because threads and async and queues don't play nice together). +But ultimately it does the same thing. + +This would be important if the inference of single items were long (> 1s) because +in this case, it means every query during inference would have to wait for 1s before +even receiving an error. + +### Dynamic batching + +In general, batching is not necessarily an improvement over passing 1 item at +a time (see [batching details](./main_classes/pipelines#pipeline-batching) for more information). But it can be very effective +when used in the correct setting. In the API, there is no dynamic +batching by default (too much opportunity for a slowdown). But for BLOOM inference - +which is a very large model - dynamic batching is **essential** to provide a decent experience for everyone. diff --git a/docs/source/en/pipeline_webserver.mdx b/docs/source/en/pipeline_webserver.mdx deleted file mode 100644 index d9f12fa2b3a0..000000000000 --- a/docs/source/en/pipeline_webserver.mdx +++ /dev/null @@ -1,161 +0,0 @@ -# Using pipelines for a webserver - - -Creating an inference engine is a complex topic, and the "best" solution -will most likely depend on your problem space. Are you on CPU or GPU? Do -you want the lowest latency, the highest throughput, support for -many models, or just highly optimize 1 specific model? -There are many ways to tackle this topic, so what we are going to present is a good default -to get started which may not necessarily be the most optimal solution for you. - - - -The key thing to understand is that we can use an iterator, just like you would [on a -dataset](pipeline_tutorial#using-pipelines-on-a-dataset), since a webserver is basically a system that waits for requests and -treats them as they come in. - -Usually webservers are multiplexed (multithreaded, async, etc..) to handle various -requests concurrently. 
Pipelines on the other hand (and mostly the underlying models) -are not really great for parallelism; they take up a lot of RAM, so it's best to give them all the available resources when they are running or it's a compute-intensive job. - -We are going to solve that by having the webserver handle the light load of receiving -and sending requests, and having a single thread handling the actual work. -This example is going to use `starlette`. The actual framework is not really -important, but you might have to tune or change the code if you are using another -one to achieve the same effect. - -Create `server.py`: - -```py -from starlette.applications import Starlette -from starlette.responses import JSONResponse -from starlette.routing import Route -from transformers import pipeline -import asyncio - - -async def homepage(request): - payload = await request.body() - string = payload.decode("utf-8") - response_q = asyncio.Queue() - await request.app.model_queue.put((string, response_q)) - output = await response_q.get() - return JSONResponse(output) - - -async def server_loop(q): - pipe = pipeline(model="bert-base-uncased") - while True: - (string, response_q) = await q.get() - out = pipe(string) - await response_q.put(out) - - -app = Starlette( - routes=[ - Route("/", homepage, methods=["POST"]), - ], -) - - -@app.on_event("startup") -async def startup_event(): - q = asyncio.Queue() - app.model_queue = q - asyncio.create_task(server_loop(q)) -``` - -Now you can start it with: -```bash -uvicorn server:app -``` - -And you can query it: -```bash -curl -X POST -d "test [MASK]" http://localhost:8000/ -#[{"score":0.7742936015129089,"token":1012,"token_str":".","sequence":"test."},...] -``` - -And there you go, now you have a good idea of how to create a webserver! - -What is really important is that we load the model only **once**, so there are no copies -of the model on the webserver. This way, no unnecessary RAM is being used. -Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few -items before inferring to use dynamic batching: - -```py -(string, rq) = await q.get() -strings = [] -queues = [] -while True: - try: - (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms - except asyncio.exceptions.TimeoutError: - break - strings.append(string) - queues.append(rq) -strings -outs = pipe(strings, batch_size=len(strings)) -for (rq, out) in zip(queues, outs): - await rq.put(out) -``` - - -Do not activate this without checking it makes sense for your load! - - -The proposed code is optimized for readability, not for being the best code. -First of all, there's no batch size limit which is usually not a -great idea. Next, the timeout is reset on every queue fetch, meaning you could -wait much more than 1ms before running the inference (delaying the first request -by that much). - -It would be better to have a single 1ms deadline. - -This will always wait for 1ms even if the queue is empty, which might not be the -best since you probably want to start doing inference if there's nothing in the queue. -But maybe it does make sense if batching is really crucial for your use case. -Again, there's really no one best solution. - - -## Few things you might want to consider - -### Error checking - -There's a lot that can go wrong in production: out of memory, out of space, -loading the model might fail, the query might be wrong, the query might be -correct but still fail to run because of a model misconfiguration, and so on. 
- -Generally, it's good if the server outputs the errors to the user, so -adding a lot of `try..except` statements to show those errors is a good -idea. But keep in mind it may also be a security risk to reveal all those errors depending -on your security context. - -### Circuit breaking - -Webservers usually look better when they do circuit breaking. It means they -return proper errors when they're overloaded instead of just waiting for the query indefinitely. Return a 503 error instead of waiting for a super long time or a 504 after a long time. - -This is relatively easy to implement in the proposed code since there is a single queue. -Looking at the queue size is a basic way to start returning errors before your -webserver fails under load. - -### Blocking the main thread - -Currently PyTorch is not async aware, and computation will block the main -thread while running. That means it would be better if PyTorch was forced to run -on its own thread/process. This wasn't done here because the code is a lot more -complex (mostly because threads and async and queues don't play nice together). -But ultimately it does the same thing. - -This would be important if the inference of single items were long (> 1s) because -in this case, it means every query during inference would have to wait for 1s before -even receiving an error. - -### Dynamic batching - -In general, batching is not necessarily an improvement over passing 1 item at -a time (see [batching details](./main_classes/pipelines#pipeline-batching) for more information). But it can be very effective -when used in the correct setting. In the API, there is no dynamic -batching by default (too much opportunity for a slowdown). But for BLOOM inference - -which is a very large model - dynamic batching is **essential** to provide a decent experience for everyone. diff --git a/docs/source/en/pr_checks.md b/docs/source/en/pr_checks.md new file mode 100644 index 000000000000..f50cede3264f --- /dev/null +++ b/docs/source/en/pr_checks.md @@ -0,0 +1,200 @@ + + +# Checks on a Pull Request + +When you open a pull request on 🤗 Transformers, a fair number of checks will be run to make sure the patch you are adding is not breaking anything existing. Those checks are of four types: +- regular tests +- documentation build +- code and documentation style +- general repository consistency + +In this document, we will take a stab at explaining what those various checks are and the reason behind them, as well as how to debug them locally if one of them fails on your PR. + +Note that, ideally, they require you to have a dev install: + +```bash +pip install transformers[dev] +``` + +or for an editable install: + +```bash +pip install -e .[dev] +``` + +inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install the Deep Learning framework you are working with (PyTorch, TensorFlow and/or Flax) then do + +```bash +pip install transformers[quality] +``` + +or for an editable install: + +```bash +pip install -e .[quality] +``` + + +## Tests + +All the jobs that begin with `ci/circleci: run_tests_` run parts of the Transformers testing suite. Each of those jobs focuses on a part of the library in a certain environment: for instance `ci/circleci: run_tests_pipelines_tf` runs the pipelines test in an environment where TensorFlow only is installed. 
+ +Note that to avoid running tests when there is no real change in the modules they are testing, only part of the test suite is run each time: a utility is run to determine the differences in the library between before and after the PR (what GitHub shows you in the "Files changes" tab) and picks the tests impacted by that diff. That utility can be run locally with: + +```bash +python utils/tests_fetcher.py +``` + +from the root of the Transformers repo. It will: + +1. Check for each file in the diff if the changes are in the code or only in comments or docstrings. Only the files with real code changes are kept. +2. Build an internal map that gives for each file of the source code of the library all the files it recursively impacts. Module A is said to impact module B if module B imports module A. For the recursive impact, we need a chain of modules going from module A to module B in which each module imports the previous one. +3. Apply this map on the files gathered in step 1, which gives us the list of model files impacted by the PR. +4. Map each of those files to their corresponding test file(s) and get the list of tests to run. + +When executing the script locally, you should get the results of step 1, 3 and 4 printed and thus know which tests are run. The script will also create a file named `test_list.txt` which contains the list of tests to run, and you can run them locally with the following command: + +```bash +python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) +``` + +Just in case anything slipped through the cracks, the full test suite is also run daily. + +## Documentation build + +The `build_pr_documentation` job builds and generates a preview of the documentation to make sure everything looks okay once your PR is merged. A bot will add a link to preview the documentation in your PR. Any changes you make to the PR are automatically updated in the preview. If the documentation fails to build, click on **Details** next to the failed job to see where things went wrong. Often, the error is as simple as a missing file in the `toctree`. + +If you're interested in building or previewing the documentation locally, take a look at the [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) in the docs folder. + +## Code and documentation style + +Code formatting is applied to all the source files, the examples and the tests using `black` and `ruff`. We also have a custom tool taking care of the formatting of docstrings and `rst` files (`utils/style_doc.py`), as well as the order of the lazy imports performed in the Transformers `__init__.py` files (`utils/custom_init_isort.py`). All of this can be launched by executing + +```bash +make style +``` + +The CI checks those have been applied inside the `ci/circleci: check_code_quality` check. It also runs `ruff`, that will have a basic look at your code and will complain if it finds an undefined variable, or one that is not used. To run that check locally, use + +```bash +make quality +``` + +This can take a lot of time, so to run the same thing on only the files you modified in the current branch, run + +```bash +make fixup +``` + +This last command will also run all the additional checks for the repository consistency. Let's have a look at them. + +## Repository consistency + +This regroups all the tests to make sure your PR leaves the repository in a good state, and is performed by the `ci/circleci: check_repository_consistency` check. 
You can locally run that check by executing the following: + +```bash +make repo-consistency +``` + +This checks that: + +- All objects added to the init are documented (performed by `utils/check_repo.py`) +- All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`) +- All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`) +- All configuration classes have at least one valid checkpoint mentioned in their docstrings (performed by `utils/check_config_docstrings.py`) +- All configuration classes only contain attributes that are used in corresponding modeling files (performed by `utils/check_config_attributes.py`) +- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`) +- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`) +- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`) +- All docstrings properly document the arguments in the signature of the object (performed by `utils/check_docstrings.py`) + +Should this check fail, the first two items require manual fixing, the last four can be fixed automatically for you by running the command + +```bash +make fix-copies +``` + +Additional checks concern PRs that add new models, mainly that: + +- All models added are in an Auto-mapping (performed by `utils/check_repo.py`) + +- All models are properly tested (performed by `utils/check_repo.py`) + + + +### Check copies + +Since the Transformers library is very opinionated with respect to model code, and each model should fully be implemented in a single file without relying on other models, we have added a mechanism that checks whether a copy of the code of a layer of a given model stays consistent with the original. This way, when there is a bug fix, we can see all other impacted models and choose to trickle down the modification or break the copy. + + + +If a file is a full copy of another file, you should register it in the constant `FULL_COPIES` of `utils/check_copies.py`. + + + +This mechanism relies on comments of the form `# Copied from xxx`. The `xxx` should contain the whole path to the class of function which is being copied below. For instance, `RobertaSelfOutput` is a direct copy of the `BertSelfOutput` class, so you can see [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L289) it has a comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +``` + +Note that instead of applying this to a whole class, you can apply it to the relevant methods that are copied from. For instance [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L598) you can see how `RobertaPreTrainedModel._init_weights` is copied from the same method in `BertPreTrainedModel` with the comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights +``` + +Sometimes the copy is exactly the same except for names: for instance in `RobertaAttention`, we use `RobertaSelfAttention` insted of `BertSelfAttention` but other than that, the code is exactly the same. 
This is why `# Copied from` supports simple string replacements with the follwoing syntax: `Copied from xxx with foo->bar`. This means the code is copied with all instances of `foo` being replaced by `bar`. You can see how it used [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` with the comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +``` + +Note that there shouldn't be any spaces around the arrow (unless that space is part of the pattern to replace of course). + +You can add several patterns separated by a comma. For instance here `CamemberForMaskedLM` is a direct copy of `RobertaForMaskedLM` with two replacements: `Roberta` to `Camembert` and `ROBERTA` to `CAMEMBERT`. You can see [here](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/camembert/modeling_camembert.py#L929) this is done with the comment: + +```py +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +``` + +If the order matters (because one of the replacements might conflict with a previous one), the replacements are executed from left to right. + + + +If the replacements change the formatting (if you replace a short name by a very long name for instance), the copy is checked after applying the auto-formatter. + + + +Another way when the patterns are just different casings of the same replacement (with an uppercased and a lowercased variants) is just to add the option `all-casing`. [Here](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/mobilebert/modeling_mobilebert.py#L1237) is an example in `MobileBertForSequenceClassification` with the comment: + +```py +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing +``` + +In this case, the code is copied from `BertForSequenceClassification` by replacing: +- `Bert` by `MobileBert` (for instance when using `MobileBertModel` in the init) +- `bert` by `mobilebert` (for instance when defining `self.mobilebert`) +- `BERT` by `MOBILEBERT` (in the constant `MOBILEBERT_INPUTS_DOCSTRING`) diff --git a/docs/source/en/pr_checks.mdx b/docs/source/en/pr_checks.mdx deleted file mode 100644 index 8b562b62b29c..000000000000 --- a/docs/source/en/pr_checks.mdx +++ /dev/null @@ -1,128 +0,0 @@ - - -# Checks on a Pull Request - -When you open a pull request on 🤗 Transformers, a fair number of checks will be run to make sure the patch you are adding is not breaking anything existing. Those checks are of four types: -- regular tests -- documentation build -- code and documentation style -- general repository consistency - -In this document, we will take a stab at explaining what those various checks are and the reason behind them, as well as how to debug them locally if one of them fails on your PR. - -Note that they all require you to have a dev install: - -```bash -pip install transformers[dev] -``` - -or for an editable install: - -```bash -pip install -e .[dev] -``` - -inside the Transformers repo. - -## Tests - -All the jobs that begin with `ci/circleci: run_tests_` run parts of the Transformers testing suite. 
Each of those jobs focuses on a part of the library in a certain environment: for instance `ci/circleci: run_tests_pipelines_tf` runs the pipelines test in an environment where TensorFlow only is installed. - -Note that to avoid running tests when there is no real change in the modules they are testing, only part of the test suite is run each time: a utility is run to determine the differences in the library between before and after the PR (what GitHub shows you in the "Files changes" tab) and picks the tests impacted by that diff. That utility can be run locally with: - -```bash -python utils/tests_fetcher.py -``` - -from the root of the Transformers repo. It will: - -1. Check for each file in the diff if the changes are in the code or only in comments or docstrings. Only the files with real code changes are kept. -2. Build an internal map that gives for each file of the source code of the library all the files it recursively impacts. Module A is said to impact module B if module B imports module A. For the recursive impact, we need a chain of modules going from module A to module B in which each module imports the previous one. -3. Apply this map on the files gathered in step 1, which gives us the list of model files impacted by the PR. -4. Map each of those files to their corresponding test file(s) and get the list of tests to run. - -When executing the script locally, you should get the results of step 1, 3 and 4 printed and thus know which tests are run. The script will also create a file named `test_list.txt` which contains the list of tests to run, and you can run them locally with the following command: - -```bash -python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) -``` - -Just in case anything slipped through the cracks, the full test suite is also run daily. - -## Documentation build - -The `build_pr_documentation` job builds and generates a preview of the documentation to make sure everything looks okay once your PR is merged. A bot will add a link to preview the documentation in your PR. Any changes you make to the PR are automatically updated in the preview. If the documentation fails to build, click on **Details** next to the failed job to see where things went wrong. Often, the error is as simple as a missing file in the `toctree`. - -If you're interested in building or previewing the documentation locally, take a look at the [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) in the docs folder. - -## Code and documentation style - -Code formatting is applied to all the source files, the examples and the tests using `black` and `isort`. We also have a custom tool taking care of the formatting of docstrings and `rst` files (`utils/style_doc.py`), as well as the order of the lazy imports performed in the Transformers `__init__.py` files (`utils/custom_init_isort.py`). All of this can be launched by executing - -```bash -make style -``` - -The CI checks those have been applied inside the `ci/circleci: check_code_quality` check. It also runs `flake8`, that will have a basic look at your code and will complain if it finds an undefined variable, or one that is not used. To run that check locally, use - -```bash -make quality -``` - -This can take a lot of time, so to run the same thing on only the files you modified in the current branch, run - -```bash -make fixup -``` - -This last command will also run all the additional checks for the repository consistency. Let's have a look at them. 
- -## Repository consistency - -This regroups all the tests to make sure your PR leaves the repository in a good state, and is performed by the `ci/circleci: check_repository_consistency` check. You can locally run that check by executing the following: - -```bash -make repo-consistency -``` - -This checks that: - -- All objects added to the init are documented (performed by `utils/check_repo.py`) -- All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`) -- All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`) -- All configuration classes have at least one valid checkpoint mentioned in their docstrings (performed by `utils/check_config_docstrings.py`) -- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`) -- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`) -- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`) - -Should this check fail, the first two items require manual fixing, the last four can be fixed automatically for you by running the command - -```bash -make fix-copies -``` - -Additional checks concern PRs that add new models, mainly that: - -- All models added are in an Auto-mapping (performed by `utils/check_repo.py`) - -- All models are properly tested (performed by `utils/check_repo.py`) - - diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md new file mode 100644 index 000000000000..c90c6c2a2288 --- /dev/null +++ b/docs/source/en/preprocessing.md @@ -0,0 +1,529 @@ + + +# Preprocess + +[[open-in-colab]] + +Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, they need to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for: + +* Text, use a [Tokenizer](./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. +* Speech and audio, use a [Feature extractor](./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors. +* Image inputs use a [ImageProcessor](./main_classes/image) to convert images into tensors. +* Multimodal inputs, use a [Processor](./main_classes/processors) to combine a tokenizer and a feature extractor or image processor. + + + +`AutoProcessor` **always** works and automatically chooses the correct class for the model you're using, whether you're using a tokenizer, image processor, feature extractor or processor. + + + +Before you begin, install 🤗 Datasets so you can load some datasets to experiment with: + +```bash +pip install datasets +``` + +## Natural Language Processing + + + +The main tool for preprocessing textual data is a [tokenizer](main_classes/tokenizer). A tokenizer splits text into *tokens* according to a set of rules. The tokens are converted into numbers and then tensors, which become the model inputs. Any additional inputs required by the model are added by the tokenizer. + + + +If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. 
This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining. + + + +Get started by loading a pretrained tokenizer with the [`AutoTokenizer.from_pretrained`] method. This downloads the *vocab* a model was pretrained with: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +Then pass your text to the tokenizer: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +The tokenizer returns a dictionary with three important items: + +* [input_ids](glossary#input-ids) are the indices corresponding to each token in the sentence. +* [attention_mask](glossary#attention-mask) indicates whether a token should be attended to or not. +* [token_type_ids](glossary#token-type-ids) identifies which sequence a token belongs to when there is more than one sequence. + +Return your input by decoding the `input_ids`: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +As you can see, the tokenizer added two special tokens - `CLS` and `SEP` (classifier and separator) - to the sentence. Not all models need +special tokens, but if they do, the tokenizer automatically adds them for you. + +If there are several sentences you want to preprocess, pass them as a list to the tokenizer: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### Pad + +Sentences aren't always the same length which can be an issue because tensors, the model inputs, need to have a uniform shape. Padding is a strategy for ensuring tensors are rectangular by adding a special *padding token* to shorter sentences. + +Set the `padding` parameter to `True` to pad the shorter sequences in the batch to match the longest sequence: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +The first and third sentences are now padded with `0`'s because they are shorter. + +### Truncation + +On the other end of the spectrum, sometimes a sequence may be too long for a model to handle. In this case, you'll need to truncate the sequence to a shorter length. + +Set the `truncation` parameter to `True` to truncate a sequence to the maximum length accepted by the model: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + + + +Check out the [Padding and truncation](./pad_truncation) concept guide to learn more different padding and truncation arguments. + + + +### Build tensors + +Finally, you want the tokenizer to return the actual tensors that get fed to the model. + +Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for TensorFlow: + + + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} +``` + + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': , + 'token_type_ids': , + 'attention_mask': } +``` + + + +## Audio + +For audio tasks, you'll need a [feature extractor](main_classes/feature_extractor) to prepare your dataset for the model. The feature extractor is designed to extract features from raw audio data, and convert them into tensors. + +Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Access the first element of the `audio` column to take a look at the input. Calling the `audio` column automatically loads and resamples the audio file: + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +This returns three items: + +* `array` is the speech signal loaded - and potentially resampled - as a 1D array. +* `path` points to the location of the audio file. +* `sampling_rate` refers to how many data points in the speech signal are measured per second. + +For this tutorial, you'll use the [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model. Take a look at the model card, and you'll learn Wav2Vec2 is pretrained on 16kHz sampled speech audio. It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. If your data's sampling rate isn't the same, then you need to resample your data. + +1. Use 🤗 Datasets' [`~datasets.Dataset.cast_column`] method to upsample the sampling rate to 16kHz: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. Call the `audio` column again to resample the audio file: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +Next, load a feature extractor to normalize and pad the input. When padding textual data, a `0` is added for shorter sequences. The same idea applies to audio data. The feature extractor adds a `0` - interpreted as silence - to `array`. + +Load the feature extractor with [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Pass the audio `array` to the feature extractor. We also recommend adding the `sampling_rate` argument in the feature extractor in order to better debug any silent errors that may occur. 
+ +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. Take a look at the sequence length of these two audio samples: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +Create a function to preprocess the dataset so the audio samples are the same lengths. Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +Apply the `preprocess_function` to the the first few examples in the dataset: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +The sample lengths are now the same and match the specified maximum length. You can pass your processed dataset to the model now! + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +## Computer vision + +For computer vision tasks, you'll need an [image processor](main_classes/image_processor) to prepare your dataset for the model. +Image preprocessing consists of several steps that convert images into the input expected by the model. These steps +include but are not limited to resizing, normalizing, color channel correction, and converting images to tensors. + + + +Image preprocessing often follows some form of image augmentation. Both image preprocessing and image augmentation +transform image data, but they serve different purposes: + +* Image augmentation alters images in a way that can help prevent overfitting and increase the robustness of the model. You can get creative in how you augment your data - adjust brightness and colors, crop, rotate, resize, zoom, etc. However, be mindful not to change the meaning of the images with your augmentations. +* Image preprocessing guarantees that the images match the model’s expected input format. When fine-tuning a computer vision model, images must be preprocessed exactly as when the model was initially trained. + +You can use any library you like for image augmentation. For image preprocessing, use the `ImageProcessor` associated with the model. + + + +Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets: + + + +Use 🤗 Datasets `split` parameter to only load a small sample from the training split since the dataset is quite large! + + + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) feature: + +```py +>>> dataset[0]["image"] +``` + +
+ +
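+
+Before preprocessing, it can also help to peek at the raw image itself. This is an optional check (it only assumes the `dataset` loaded above): not every image in a dataset is guaranteed to be in RGB mode, which is why the transform function later in this section calls `convert("RGB")`.
+
+```py
+>>> image = dataset[0]["image"]
+>>> image.size, image.mode  # PIL reports (width, height) and a color mode such as "RGB"
+```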
+ +Load the image processor with [`AutoImageProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + +First, let's add some image augmentation. You can use any library you prefer, but in this tutorial, we'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module. If you're interested in using another data augmentation library, learn how in the [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) or [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb). + +1. Here we use [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) to chain together a couple of +transforms - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html). +Note that for resizing, we can get the image size requirements from the `image_processor`. For some models, an exact height and +width are expected, for others only the `shortest_edge` is defined. + +```py +>>> from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose + +>>> size = ( +... image_processor.size["shortest_edge"] +... if "shortest_edge" in image_processor.size +... else (image_processor.size["height"], image_processor.size["width"]) +... ) + +>>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) +``` + +2. The model accepts [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) +as its input. `ImageProcessor` can take care of normalizing the images, and generating appropriate tensors. +Create a function that combines image augmentation and image preprocessing for a batch of images and generates `pixel_values`: + +```py +>>> def transforms(examples): +... images = [_transforms(img.convert("RGB")) for img in examples["image"]] +... examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"] +... return examples +``` + + + +In the example above we set `do_resize=False` because we have already resized the images in the image augmentation transformation, +and leveraged the `size` attribute from the appropriate `image_processor`. If you do not resize images during image augmentation, +leave this parameter out. By default, `ImageProcessor` will handle the resizing. + +If you wish to normalize images as a part of the augmentation transformation, use the `image_processor.image_mean`, +and `image_processor.image_std` values. + + +3. Then use 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) to apply the transforms on the fly: + +```py +>>> dataset.set_transform(transforms) +``` + +4. Now when you access the image, you'll notice the image processor has added `pixel_values`. You can pass your processed dataset to the model now! + +```py +>>> dataset[0].keys() +``` + +Here is what the image looks like after the transforms are applied. The image has been randomly cropped and it's color properties are different. 
+ +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +
+ +
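+
+As a quick sanity check (a minimal sketch reusing the `dataset` and `image_processor` from above), you can also confirm that the processed tensor is channels-first and matches the size this checkpoint expects:
+
+```py
+>>> pixel_values = dataset[0]["pixel_values"]
+>>> pixel_values.shape  # channels-first, e.g. torch.Size([3, 224, 224]) for this checkpoint
+```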
+ + + +For tasks like object detection, semantic segmentation, instance segmentation, and panoptic segmentation, `ImageProcessor` +offers post processing methods. These methods convert model's raw outputs into meaningful predictions such as bounding boxes, +or segmentation maps. + + + +### Pad + +In some cases, for instance, when fine-tuning [DETR](./model_doc/detr), the model applies scale augmentation at training +time. This may cause images to be different sizes in a batch. You can use [`DetrImageProcessor.pad`] +from [`DetrImageProcessor`] and define a custom `collate_fn` to batch images together. + +```py +>>> def collate_fn(batch): +... pixel_values = [item["pixel_values"] for item in batch] +... encoding = image_processor.pad(pixel_values, return_tensors="pt") +... labels = [item["labels"] for item in batch] +... batch = {} +... batch["pixel_values"] = encoding["pixel_values"] +... batch["pixel_mask"] = encoding["pixel_mask"] +... batch["labels"] = labels +... return batch +``` + +## Multimodal + +For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as as tokenizer and feature extractor. + +Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +For ASR, you're mainly focused on `audio` and `text` so you can remove the other columns: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Now take a look at the `audio` and `text` columns: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Remember you should always [resample](preprocessing#audio) your audio dataset's sampling rate to match the sampling rate of the dataset used to pretrain a model! + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +Load a processor with [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Create a function to process the audio data contained in `array` to `input_values`, and tokenize `text` to `labels`. These are the inputs to the model: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Apply the `prepare_dataset` function to a sample: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +The processor has now added `input_values` and `labels`, and the sampling rate has also been correctly downsampled to 16kHz. You can pass your processed dataset to the model now! 
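+
+The example above preprocesses a single sample. To preprocess every example, you could apply the same function with 🤗 Datasets [`~datasets.Dataset.map`]. This is only a sketch: the column names follow the LJ Speech example above, and dropping the raw `audio` and `text` columns afterwards is optional.
+
+```py
+>>> lj_speech = lj_speech.map(prepare_dataset, remove_columns=["audio", "text"])
+```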
diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx deleted file mode 100644 index 5283a9b17e00..000000000000 --- a/docs/source/en/preprocessing.mdx +++ /dev/null @@ -1,469 +0,0 @@ - - -# Preprocess - -[[open-in-colab]] - -Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, they need to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for: - -* Text, use a [Tokenizer](./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. -* Image inputs use a [ImageProcessor](./main_classes/image) to convert images into tensors. -* Speech and audio, use a [Feature extractor](./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors. -* Multimodal inputs, use a [Processor](./main_classes/processors) to combine a tokenizer and a feature extractor or image processor. - - - -`AutoProcessor` **always** works and automatically chooses the correct class for the model you're using, whether you're using a tokenizer, image processor, feature extractor or processor. - - - -Before you begin, install 🤗 Datasets so you can load some datasets to experiment with: - -```bash -pip install datasets -``` - -## Natural Language Processing - - - -The main tool for preprocessing textual data is a [tokenizer](main_classes/tokenizer). A tokenizer splits text into *tokens* according to a set of rules. The tokens are converted into numbers and then tensors, which become the model inputs. Any additional inputs required by the model are added by the tokenizer. - - - -If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referrred to as the *vocab*) during pretraining. - - - -Get started by loading a pretrained tokenizer with the [`AutoTokenizer.from_pretrained`] method. This downloads the *vocab* a model was pretrained with: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -Then pass your text to the tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -The tokenizer returns a dictionary with three important items: - -* [input_ids](glossary#input-ids) are the indices corresponding to each token in the sentence. -* [attention_mask](glossary#attention-mask) indicates whether a token should be attended to or not. -* [token_type_ids](glossary#token-type-ids) identifies which sequence a token belongs to when there is more than one sequence. - -Return your input by decoding the `input_ids`: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. 
[SEP]' -``` - -As you can see, the tokenizer added two special tokens - `CLS` and `SEP` (classifier and separator) - to the sentence. Not all models need -special tokens, but if they do, the tokenizer automatically adds them for you. - -If there are several sentences you want to preprocess, pass them as a list to the tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Sentences aren't always the same length which can be an issue because tensors, the model inputs, need to have a uniform shape. Padding is a strategy for ensuring tensors are rectangular by adding a special *padding token* to shorter sentences. - -Set the `padding` parameter to `True` to pad the shorter sequences in the batch to match the longest sequence: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -The first and third sentences are now padded with `0`'s because they are shorter. - -### Truncation - -On the other end of the spectrum, sometimes a sequence may be too long for a model to handle. In this case, you'll need to truncate the sequence to a shorter length. - -Set the `truncation` parameter to `True` to truncate a sequence to the maximum length accepted by the model: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - - - -Check out the [Padding and truncation](./pad_truncation) concept guide to learn more different padding and truncation arguments. - - - -### Build tensors - -Finally, you want the tokenizer to return the actual tensors that get fed to the model. - -Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for TensorFlow: - - - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} -``` - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - - - -## Audio - -For audio tasks, you'll need a [feature extractor](main_classes/feature_extractor) to prepare your dataset for the model. The feature extractor is designed to extract features from raw audio data, and convert them into tensors. - -Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Access the first element of the `audio` column to take a look at the input. Calling the `audio` column automatically loads and resamples the audio file: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. 
], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -This returns three items: - -* `array` is the speech signal loaded - and potentially resampled - as a 1D array. -* `path` points to the location of the audio file. -* `sampling_rate` refers to how many data points in the speech signal are measured per second. - -For this tutorial, you'll use the [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model. Take a look at the model card, and you'll learn Wav2Vec2 is pretrained on 16kHz sampled speech audio. It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. If your data's sampling rate isn't the same, then you need to resample your data. - -1. Use 🤗 Datasets' [`~datasets.Dataset.cast_column`] method to upsample the sampling rate to 16kHz: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Call the `audio` column again to resample the audio file: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Next, load a feature extractor to normalize and pad the input. When padding textual data, a `0` is added for shorter sequences. The same idea applies to audio data. The feature extractor adds a `0` - interpreted as silence - to `array`. - -Load the feature extractor with [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Pass the audio `array` to the feature extractor. We also recommend adding the `sampling_rate` argument in the feature extractor in order to better debug any silent errors that may occur. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. Take a look at the sequence length of these two audio samples: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Create a function to preprocess the dataset so the audio samples are the same lengths. Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... return inputs -``` - -Apply the `preprocess_function` to the the first few examples in the dataset: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -The sample lengths are now the same and match the specified maximum length. You can pass your processed dataset to the model now! 
- -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -## Computer vision - -For computer vision tasks, you'll need an [image processor](main_classes/image_processor) to prepare your dataset for the model. The image processor is designed to preprocess images, and convert them into tensors. - -Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets: - - - -Use 🤗 Datasets `split` parameter to only load a small sample from the training split since the dataset is quite large! - - - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) feature: - -```py ->>> dataset[0]["image"] -``` - -
- -
- -Load the image processor with [`AutoImageProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -``` - -For computer vision tasks, it is common to add some type of data augmentation to the images as a part of preprocessing. You can add augmentations with any library you'd like, but in this tutorial, you'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module. If you're interested in using another data augmentation library, learn how in the [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) or [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb). - -1. Normalize the image with the image processor and use [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) to chain some transforms - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - together: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) ->>> size = ( -... image_processor.size["shortest_edge"] -... if "shortest_edge" in image_processor.size -... else (image_processor.size["height"], image_processor.size["width"]) -... ) ->>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize]) -``` - -2. The model accepts [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) as its input, which is generated by the image processor. Create a function that generates `pixel_values` from the transforms: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. Then use 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) to apply the transforms on the fly: - -```py ->>> dataset.set_transform(transforms) -``` - -4. Now when you access the image, you'll notice the image processor has added `pixel_values`. You can pass your processed dataset to the model now! - -```py ->>> dataset[0].keys() -``` - -Here is what the image looks like after the transforms are applied. The image has been randomly cropped and it's color properties are different. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -
- -
- -## Multimodal - -For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as as tokenizer and feature extractor. - -Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -For ASR, you're mainly focused on `audio` and `text` so you can remove the other columns: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Now take a look at the `audio` and `text` columns: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Remember you should always [resample](preprocessing#audio) your audio dataset's sampling rate to match the sampling rate of the dataset used to pretrain a model! - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -Load a processor with [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Create a function to process the audio data contained in `array` to `input_values`, and tokenize `text` to `labels`. These are the inputs to the model: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Apply the `prepare_dataset` function to a sample: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -The processor has now added `input_values` and `labels`, and the sampling rate has also been correctly downsampled to 16kHz. You can pass your processed dataset to the model now! diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md new file mode 100644 index 000000000000..d49943da17a1 --- /dev/null +++ b/docs/source/en/quicktour.md @@ -0,0 +1,556 @@ + + +# Quick tour + +[[open-in-colab]] + +Get up and running with 🤗 Transformers! Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use the [`pipeline`] for inference, load a pretrained model and preprocessor with an [AutoClass](./model_doc/auto), and quickly train a model with PyTorch or TensorFlow. If you're a beginner, we recommend checking out our tutorials or [course](https://huggingface.co/course/chapter1/1) next for more in-depth explanations of the concepts introduced here. 
+ +Before you begin, make sure you have all the necessary libraries installed: + +```bash +!pip install transformers datasets +``` + +You'll also need to install your preferred machine learning framework: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +## Pipeline + + + +The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below: + + + +For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines). + + + +| **Task** | **Description** | **Modality** | **Pipeline identifier** | +|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------| +| Text classification | assign a label to a given sequence of text | NLP | pipeline(task=“sentiment-analysis”) | +| Text generation | generate text given a prompt | NLP | pipeline(task=“text-generation”) | +| Summarization | generate a summary of a sequence of text or document | NLP | pipeline(task=“summarization”) | +| Image classification | assign a label to an image | Computer vision | pipeline(task=“image-classification”) | +| Image segmentation | assign a label to each individual pixel of an image (supports semantic, panoptic, and instance segmentation) | Computer vision | pipeline(task=“image-segmentation”) | +| Object detection | predict the bounding boxes and classes of objects in an image | Computer vision | pipeline(task=“object-detection”) | +| Audio classification | assign a label to some audio data | Audio | pipeline(task=“audio-classification”) | +| Automatic speech recognition | transcribe speech into text | Audio | pipeline(task=“automatic-speech-recognition”) | +| Visual question answering | answer a question about the image, given an image and a question | Multimodal | pipeline(task=“vqa”) | +| Document question answering | answer a question about the document, given a document and a question | Multimodal | pipeline(task="document-question-answering") | +| Image captioning | generate a caption for a given image | Multimodal | pipeline(task="image-to-text") | + +Start by creating an instance of [`pipeline`] and specifying a task you want to use it for. In this guide, you'll use the [`pipeline`] for sentiment analysis as an example: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis") +``` + +The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text: + +```py +>>> classifier("We are very happy to show you the 🤗 Transformers library.") +[{'label': 'POSITIVE', 'score': 0.9998}] +``` + +If you have more than one input, pass your inputs as a list to the [`pipeline`] to return a list of dictionaries: + +```py +>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) +>>> for result in results: +... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") +label: POSITIVE, with score: 0.9998 +label: NEGATIVE, with score: 0.5309 +``` + +The [`pipeline`] can also iterate over an entire dataset for any task you like. 
For this example, let's choose automatic speech recognition as our task: + +```py +>>> import torch +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +``` + +Load an audio dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) for more details) you'd like to iterate over. For example, load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +``` + +You need to make sure the sampling rate of the dataset matches the sampling +rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) +``` + +The audio files are automatically loaded and resampled when calling the `"audio"` column. +Extract the raw waveform arrays from the first 4 samples and pass it as a list to the pipeline: + +```py +>>> result = speech_recognizer(dataset[:4]["audio"]) +>>> print([d["text"] for d in result]) +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT'] +``` + +For larger datasets where the inputs are big (like in speech or vision), you'll want to pass a generator instead of a list to load all the inputs in memory. Take a look at the [pipeline API reference](./main_classes/pipelines) for more information. + +### Use another model and tokenizer in the pipeline + +The [`pipeline`] can accommodate any model from the [Hub](https://huggingface.co/models), making it easy to adapt the [`pipeline`] for other use-cases. For example, if you'd like a model capable of handling French text, use the tags on the Hub to filter for an appropriate model. 
The top filtered result returns a multilingual [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) finetuned for sentiment analysis that you can use for French text:
+
+```py
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+```
+
+
+
+Use [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` in the next section):
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+
+Use [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on a `TFAutoClass` in the next section):
+
+```py
+>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+
+
+Specify the model and tokenizer in the [`pipeline`], and now you can apply the `classifier` on French text:
+
+```py
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
+[{'label': '5 stars', 'score': 0.7273}]
+```
+
+If you can't find a model for your use-case, you'll need to finetune a pretrained model on your data. Take a look at our [finetuning tutorial](./training) to learn how. Finally, after you've finetuned your pretrained model, please consider [sharing](./model_sharing) the model with the community on the Hub to democratize machine learning for everyone! 🤗
+
+## AutoClass
+
+
+
+Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and its associated preprocessing class.
+
+Let's return to the example from the previous section and see how you can use the `AutoClass` to replicate the results of the [`pipeline`].
+
+### AutoTokenizer
+
+A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model. There are multiple rules that govern the tokenization process, including how to split a word and at what level words should be split (learn more about tokenization in the [tokenizer summary](./tokenizer_summary)). The most important thing to remember is that you need to instantiate a tokenizer with the same model name to ensure you're using the same tokenization rules a model was pretrained with.
+ +Load a tokenizer with [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + +Pass your text to the tokenizer: + +```py +>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") +>>> print(encoding) +{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +The tokenizer returns a dictionary containing: + +* [input_ids](./glossary#input-ids): numerical representations of your tokens. +* [attention_mask](.glossary#attention-mask): indicates which tokens should be attended to. + +A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length: + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + + + +Check out the [preprocess](./preprocessing) tutorial for more details about tokenization, and how to use an [`AutoImageProcessor`], [`AutoFeatureExtractor`] and [`AutoProcessor`] to preprocess image, audio, and multimodal inputs. + + + +### AutoModel + + + +🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. + + + +Now pass your preprocessed batch of inputs directly to the model. You just have to unpack the dictionary by adding `**`: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`TFAutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`TFAutoModel`] for the task. 
For text (or sequence) classification, you should load [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. + + + +Now pass your preprocessed batch of inputs directly to the model. You can pass the tensors as-is: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors *before* the final activation +function (like softmax) because the final activation function is often fused with the loss. Model outputs are special dataclasses so their attributes are autocompleted in an IDE. The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string) in which case, attributes that are None are ignored. + + + +### Save a model + + + +Once your model is fine-tuned, you can save it with its tokenizer using [`PreTrainedModel.save_pretrained`]: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +When you are ready to use the model again, reload it with [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Once your model is fine-tuned, you can save it with its tokenizer using [`TFPreTrainedModel.save_pretrained`]: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +When you are ready to use the model again, reload it with [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. The `from_pt` or `from_tf` parameter can convert the model from one framework to the other: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + + +## Custom model builds + +You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results. 
+ +Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads: + +```py +>>> from transformers import AutoConfig + +>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +``` + + + +Create a model from your custom configuration with [`AutoModel.from_config`]: + +```py +>>> from transformers import AutoModel + +>>> my_model = AutoModel.from_config(my_config) +``` + + +Create a model from your custom configuration with [`TFAutoModel.from_config`]: + +```py +>>> from transformers import TFAutoModel + +>>> my_model = TFAutoModel.from_config(my_config) +``` + + + +Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations. + +## Trainer - a PyTorch optimized training loop + +All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a [`Trainer`] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more. + +Depending on your task, you'll typically pass the following parameters to [`Trainer`]: + +1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): + + ```py + >>> from transformers import AutoModelForSequenceClassification + + >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: + + ```py + >>> from transformers import TrainingArguments + + >>> training_args = TrainingArguments( + ... output_dir="path/to/save/folder/", + ... learning_rate=2e-5, + ... per_device_train_batch_size=8, + ... per_device_eval_batch_size=8, + ... num_train_epochs=2, + ... ) + ``` + +3. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +4. Load a dataset: + + ```py + >>> from datasets import load_dataset + + >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT + ``` + +5. Create a function to tokenize the dataset: + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) + ``` + + Then apply it over the entire dataset with [`~datasets.Dataset.map`]: + + ```py + >>> dataset = dataset.map(tokenize_dataset, batched=True) + ``` + +6. A [`DataCollatorWithPadding`] to create a batch of examples from your dataset: + + ```py + >>> from transformers import DataCollatorWithPadding + + >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + ``` + +Now gather all these classes in [`Trainer`]: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... 
) # doctest: +SKIP +``` + +When you're ready, call [`~Trainer.train`] to start training: + +```py +>>> trainer.train() # doctest: +SKIP +``` + + + +For tasks - like translation or summarization - that use a sequence-to-sequence model, use the [`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] classes instead. + + + +You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. + +The other way to customize the training loop is by using [Callbacks](./main_classes/callbacks). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass the [`Trainer`] instead. + +## Train with TensorFlow + +All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) so they can be trained in TensorFlow with the [Keras](https://keras.io/) API. 🤗 Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to easily load your dataset as a `tf.data.Dataset` so you can start training right away with Keras' [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) methods. + +1. You'll start with a [`TFPreTrainedModel`] or a [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model): + + ```py + >>> from transformers import TFAutoModelForSequenceClassification + + >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +3. Create a function to tokenize the dataset: + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) # doctest: +SKIP + ``` + +4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`] and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like: + + ```py + >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP + >>> tf_dataset = model.prepare_tf_dataset( + ... dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer + ... ) # doctest: +SKIP + ``` + +5. When you're ready, you can call `compile` and `fit` to start training. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + + ```py + >>> from tensorflow.keras.optimizers import Adam + + >>> model.compile(optimizer=Adam(3e-5)) # No loss argument! + >>> model.fit(tf_dataset) # doctest: +SKIP + ``` + +## What's next? + +Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides! 
diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx deleted file mode 100644 index 8b56eac0beba..000000000000 --- a/docs/source/en/quicktour.mdx +++ /dev/null @@ -1,542 +0,0 @@ - - -# Quick tour - -[[open-in-colab]] - -Get up and running with 🤗 Transformers! Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use the [`pipeline`] for inference, load a pretrained model and preprocessor with an [AutoClass](./model_doc/auto), and quickly train a model with PyTorch or TensorFlow. If you're a beginner, we recommend checking out our tutorials or [course](https://huggingface.co/course/chapter1/1) next for more in-depth explanations of the concepts introduced here. - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -!pip install transformers datasets -``` - -You'll also need to install your preferred machine learning framework: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -## Pipeline - - - -The [`pipeline`] is the easiest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities. Take a look at the table below for some supported tasks: - -| **Task** | **Description** | **Modality** | **Pipeline identifier** | -|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------| -| Text classification | assign a label to a given sequence of text | NLP | pipeline(task="sentiment-analysis") | -| Text generation | generate text that follows a given prompt | NLP | pipeline(task="text-generation") | -| Name entity recognition | assign a label to each token in a sequence (people, organization, location, etc.) | NLP | pipeline(task="ner") | -| Question answering | extract an answer from the text given some context and a question | NLP | pipeline(task="question-answering") | -| Fill-mask | predict the correct masked token in a sequence | NLP | pipeline(task="fill-mask") | -| Summarization | generate a summary of a sequence of text or document | NLP | pipeline(task="summarization") | -| Translation | translate text from one language into another | NLP | pipeline(task="translation") | -| Image classification | assign a label to an image | Computer vision | pipeline(task="image-classification") | -| Image segmentation | assign a label to each individual pixel of an image (supports semantic, panoptic, and instance segmentation) | Computer vision | pipeline(task="image-segmentation") | -| Object detection | predict the bounding boxes and classes of objects in an image | Computer vision | pipeline(task="object-detection") | -| Audio classification | assign a label to an audio file | Audio | pipeline(task="audio-classification") | -| Automatic speech recognition | extract speech from an audio file into text | Audio | pipeline(task="automatic-speech-recognition") | -| Visual question answering | given an image and a question, correctly answer a question about the image | Multimodal | pipeline(task="vqa") | - -Start by creating an instance of [`pipeline`] and specifying a task you want to use it for. You can use the [`pipeline`] for any of the previously mentioned tasks, and for a complete list of supported tasks, take a look at the [pipeline API reference](./main_classes/pipelines). 
In this guide though, you'll use the [`pipeline`] for sentiment analysis as an example: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis") -``` - -The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text: - -```py ->>> classifier("We are very happy to show you the 🤗 Transformers library.") -[{'label': 'POSITIVE', 'score': 0.9998}] -``` - -If you have more than one input, pass your inputs as a list to the [`pipeline`] to return a list of dictionaries: - -```py ->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) ->>> for result in results: -... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9998 -label: NEGATIVE, with score: 0.5309 -``` - -The [`pipeline`] can also iterate over an entire dataset for any task you like. For this example, let's choose automatic speech recognition as our task: - -```py ->>> import torch ->>> from transformers import pipeline - ->>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") -``` - -Load an audio dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) for more details) you'd like to iterate over. For example, load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT -``` - -You need to make sure the sampling rate of the dataset matches the sampling -rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) -``` - -The audio files are automatically loaded and resampled when calling the `"audio"` column. -Extract the raw waveform arrays from the first 4 samples and pass it as a list to the pipeline: - -```py ->>> result = speech_recognizer(dataset[:4]["audio"]) ->>> print([d["text"] for d in result]) -['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] -``` - -For larger datasets where the inputs are big (like in speech or vision), you'll want to pass a generator instead of a list to load all the inputs in memory. Take a look at the [pipeline API reference](./main_classes/pipelines) for more information. - -### Use another model and tokenizer in the pipeline - -The [`pipeline`] can accommodate any model from the [Hub](https://huggingface.co/models), making it easy to adapt the [`pipeline`] for other use-cases. For example, if you'd like a model capable of handling French text, use the tags on the Hub to filter for an appropriate model. 
The top filtered result returns a multilingual [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) finetuned for sentiment analysis you can use for French text: - -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Use [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` in the next section): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - -Use [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` in the next section): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - -Specify the model and tokenizer in the [`pipeline`], and now you can apply the `classifier` on French text: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -If you can't find a model for your use-case, you'll need to finetune a pretrained model on your data. Take a look at our [finetuning tutorial](./training) to learn how. Finally, after you've finetuned your pretrained model, please consider [sharing](./model_sharing) the model with the community on the Hub to democratize machine learning for everyone! 🤗 - -## AutoClass - - - -Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and it's associated preprocessing class. - -Let's return to the example from the previous section and see how you can use the `AutoClass` to replicate the results of the [`pipeline`]. - -### AutoTokenizer - -A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model. There are multiple rules that govern the tokenization process, including how to split a word and at what level words should be split (learn more about tokenization in the [tokenizer summary](./tokenizer_summary)). The most important thing to remember is you need to instantiate a tokenizer with the same model name to ensure you're using the same tokenization rules a model was pretrained with. 
- -Load a tokenizer with [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - -Pass your text to the tokenizer: - -```py ->>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") ->>> print(encoding) -{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -The tokenizer returns a dictionary containing: - -* [input_ids](./glossary#input-ids): numerical representations of your tokens. -* [attention_mask](.glossary#attention-mask): indicates which tokens should be attended to. - -A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length: - - - -```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - - - -Check out the [preprocess](./preprocessing) tutorial for more details about tokenization, and how to use an [`AutoImageProcessor`], [`AutoFeatureExtractor`] and [`AutoProcessor`] to preprocess image, audio, and multimodal inputs. - - - -### AutoModel - - - -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. - - - -Now pass your preprocessed batch of inputs directly to the model. You just have to unpack the dictionary by adding `**`: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`TFAutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`TFAutoModel`] for the task. 
For text (or sequence) classification, you should load [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. - - - -Now pass your preprocessed batch of inputs directly to the model by passing the dictionary keys directly to the tensors: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: - -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT -``` - - - - - -All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors *before* the final activation -function (like softmax) because the final activation function is often fused with the loss. Model outputs are special dataclasses so their attributes are autocompleted in an IDE. The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string) in which case, attributes that are None are ignored. - - - -### Save a model - - - -Once your model is fine-tuned, you can save it with its tokenizer using [`PreTrainedModel.save_pretrained`]: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -When you are ready to use the model again, reload it with [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Once your model is fine-tuned, you can save it with its tokenizer using [`TFPreTrainedModel.save_pretrained`]: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -When you are ready to use the model again, reload it with [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. The `from_pt` or `from_tf` parameter can convert the model from one framework to the other: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - - -## Custom model builds - -You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results. 
- -Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads: - -```py ->>> from transformers import AutoConfig - ->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) -``` - - - -Create a model from your custom configuration with [`AutoModel.from_config`]: - -```py ->>> from transformers import AutoModel - ->>> my_model = AutoModel.from_config(my_config) -``` - - -Create a model from your custom configuration with [`TFAutoModel.from_config`]: - -```py ->>> from transformers import TFAutoModel - ->>> my_model = TFAutoModel.from_config(my_config) -``` - - - -Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations. - -## Trainer - a PyTorch optimized training loop - -All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a [`Trainer`] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more. - -Depending on your task, you'll typically pass the following parameters to [`Trainer`]: - -1. A [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): - - ```py - >>> from transformers import AutoModelForSequenceClassification - - >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - ``` - -2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: - - ```py - >>> from transformers import TrainingArguments - - >>> training_args = TrainingArguments( - ... output_dir="path/to/save/folder/", - ... learning_rate=2e-5, - ... per_device_train_batch_size=8, - ... per_device_eval_batch_size=8, - ... num_train_epochs=2, - ... ) - ``` - -3. A preprocessing class like a tokenizer, image processor, feature extractor, or processor: - - ```py - >>> from transformers import AutoTokenizer - - >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") - ``` - -4. Load a dataset: - - ```py - >>> from datasets import load_dataset - - >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT - ``` - -5. Create a function to tokenize the dataset: - - ```py - >>> def tokenize_dataset(dataset): - ... return tokenizer(dataset["text"]) - ``` - - Then apply it over the entire dataset with [`~datasets.Dataset.map`]: - - ```py - >>> dataset = dataset.map(tokenize_dataset, batched=True) - ``` - -6. A [`DataCollatorWithPadding`] to create a batch of examples from your dataset: - - ```py - >>> from transformers import DataCollatorWithPadding - - >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - ``` - -Now gather all these classes in [`Trainer`]: - -```py ->>> from transformers import Trainer - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=dataset["train"], -... eval_dataset=dataset["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... 
) # doctest: +SKIP -``` - -When you're ready, call [`~Trainer.train`] to start training: - -```py ->>> trainer.train() # doctest: +SKIP -``` - - - -For tasks - like translation or summarization - that use a sequence-to-sequence model, use the [`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] classes instead. - - - -You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. - -The other way to customize the training loop is by using [Callbacks](./main_classes/callbacks). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass the [`Trainer`] instead. - -## Train with TensorFlow - -All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) so they can be trained in TensorFlow with the [Keras](https://keras.io/) API. 🤗 Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to easily load your dataset as a `tf.data.Dataset` so you can start training right away with Keras' [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) methods. - -1. You'll start with a [`TFPreTrainedModel`] or a [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model): - - ```py - >>> from transformers import TFAutoModelForSequenceClassification - - >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - ``` - -2. A preprocessing class like a tokenizer, image processor, feature extractor, or processor: - - ```py - >>> from transformers import AutoTokenizer - - >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") - ``` - -3. Create a function to tokenize the dataset: - - ```py - >>> def tokenize_dataset(dataset): - ... return tokenizer(dataset["text"]) # doctest: +SKIP - ``` - -4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`] and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like: - - ```py - >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP - >>> tf_dataset = model.prepare_tf_dataset( - ... dataset, batch_size=16, shuffle=True, tokenizer=tokenizer - ... ) # doctest: +SKIP - ``` - -5. When you're ready, you can call `compile` and `fit` to start training: - - ```py - >>> from tensorflow.keras.optimizers import Adam - - >>> model.compile(optimizer=Adam(3e-5)) - >>> model.fit(dataset) # doctest: +SKIP - ``` - -## What's next? - -Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides! 
diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md new file mode 100644 index 000000000000..3b40b6ea0672 --- /dev/null +++ b/docs/source/en/run_scripts.md @@ -0,0 +1,351 @@ + + +# Train with a script + +Along with the 🤗 Transformers [notebooks](./noteboks/README), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). + +You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. + +The example scripts are not expected to work out-of-the-box on every problem, and you may need to adapt the script to the problem you're trying to solve. To help you with this, most of the scripts fully expose how data is preprocessed, allowing you to edit it as necessary for your use case. + +For any feature you'd like to implement in an example script, please discuss it on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) before submitting a Pull Request. While we welcome bug fixes, it is unlikely we will merge a Pull Request that adds more functionality at the cost of readability. + +This guide will show you how to run an example summarization training script in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). All examples are expected to work with both frameworks unless otherwise specified. + +## Setup + +To successfully run the latest version of the example scripts, you have to **install 🤗 Transformers from source** in a new virtual environment: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` + +For older versions of the example scripts, click on the toggle below: + +
+ Examples for older versions of 🤗 Transformers + +
+ +Then switch your current clone of 🤗 Transformers to a specific version, like v3.5.1 for example: + +```bash +git checkout tags/v3.5.1 +``` + +After you've setup the correct library version, navigate to the example folder of your choice and install the example specific requirements: + +```bash +pip install -r requirements.txt +``` + +## Run a script + + + +The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Distributed training and mixed precision + +The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supports distributed training and mixed precision, which means you can also use it in a script. To enable both of these features: + +- Add the `fp16` argument to enable mixed precision. +- Set the number of GPUs to use with the `nproc_per_node` argument. + +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +TensorFlow scripts utilize a [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) for distributed training, and you don't need to add any additional arguments to the training script. The TensorFlow script will use multiple GPUs by default if they are available. + +## Run a script on a TPU + + + +Tensor Processing Units (TPUs) are specifically designed to accelerate performance. 
PyTorch supports TPUs with the [XLA](https://www.tensorflow.org/xla) deep learning compiler (see [here](https://github.com/pytorch/xla/blob/master/README.md) for more details). To use a TPU, launch the `xla_spawn.py` script and use the `num_cores` argument to set the number of TPU cores you want to use. + +```bash +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Tensor Processing Units (TPUs) are specifically designed to accelerate performance. TensorFlow scripts utilize a [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) for training on TPUs. To use a TPU, pass the name of the TPU resource to the `tpu` argument. + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Run a script with 🤗 Accelerate + +🤗 [Accelerate](https://huggingface.co/docs/accelerate) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: + +> Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts +```bash +pip install git+https://github.com/huggingface/accelerate +``` + +Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate supported scripts will have a `task_no_trainer.py` file in the folder. Begin by running the following command to create and save a configuration file: + +```bash +accelerate config +``` + +Test your setup to make sure it is configured correctly: + +```bash +accelerate test +``` + +Now you are ready to launch the training: + +```bash +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +``` + +## Use a custom dataset + +The summarization script supports custom datasets as long as they are a CSV or JSON Line file. When you use your own dataset, you need to specify several additional arguments: + +- `train_file` and `validation_file` specify the path to your training and validation files. +- `text_column` is the input text to summarize. +- `summary_column` is the target text to output. 
+ +A summarization script using a custom dataset would look like this: + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +``` + +## Test a script + +It is often a good idea to run your script on a smaller number of dataset examples to ensure everything works as expected before committing to an entire dataset which may take hours to complete. Use the following arguments to truncate the dataset to a maximum number of samples: + +- `max_train_samples` +- `max_eval_samples` +- `max_predict_samples` + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Not all example scripts support the `max_predict_samples` argument. If you aren't sure whether your script supports this argument, add the `-h` argument to check: + +```bash +examples/pytorch/summarization/run_summarization.py -h +``` + +## Resume training from checkpoint + +Another helpful option to enable is resuming training from a previous checkpoint. This will ensure you can pick up where you left off without starting over if your training gets interrupted. There are two methods to resume training from a checkpoint. + +The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +``` + +The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder. + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +``` + +## Share your model + +All scripts can upload your final model to the [Model Hub](https://huggingface.co/models). Make sure you are logged into Hugging Face before you begin: + +```bash +huggingface-cli login +``` + +Then add the `push_to_hub` argument to the script. 
This argument will create a repository with your Hugging Face username and the folder name specified in `output_dir`. + +To give your repository a specific name, use the `push_to_hub_model_id` argument to add it. The repository will be automatically listed under your namespace. + +The following example shows how to upload a model with a specific repository name: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` \ No newline at end of file diff --git a/docs/source/en/run_scripts.mdx b/docs/source/en/run_scripts.mdx deleted file mode 100644 index 58d6b8dd3e20..000000000000 --- a/docs/source/en/run_scripts.mdx +++ /dev/null @@ -1,347 +0,0 @@ - - -# Train with a script - -Along with the 🤗 Transformers [notebooks](./noteboks/README), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). - -You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. - -The example scripts are not expected to work out-of-the-box on every problem, and you may need to adapt the script to the problem you're trying to solve. To help you with this, most of the scripts fully expose how data is preprocessed, allowing you to edit it as necessary for your use case. - -For any feature you'd like to implement in an example script, please discuss it on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) before submitting a Pull Request. While we welcome bug fixes, it is unlikely we will merge a Pull Request that adds more functionality at the cost of readability. - -This guide will show you how to run an example summarization training script in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). All examples are expected to work with both frameworks unless otherwise specified. - -## Setup - -To successfully run the latest version of the example scripts, you have to **install 🤗 Transformers from source** in a new virtual environment: - -```bash -git clone https://github.com/huggingface/transformers -cd transformers -pip install . -``` - -For older versions of the example scripts, click on the toggle below: - -
- Examples for older versions of 🤗 Transformers - -
- -Then switch your current clone of 🤗 Transformers to a specific version, like v3.5.1 for example: - -```bash -git checkout tags/v3.5.1 -``` - -After you've setup the correct library version, navigate to the example folder of your choice and install the example specific requirements: - -```bash -pip install -r requirements.txt -``` - -## Run a script - - - -The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Distributed training and mixed precision - -The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supports distributed training and mixed precision, which means you can also use it in a script. To enable both of these features: - -- Add the `fp16` argument to enable mixed precision. -- Set the number of GPUs to use with the `nproc_per_node` argument. - -```bash -python -m torch.distributed.launch \ - --nproc_per_node 8 pytorch/summarization/run_summarization.py \ - --fp16 \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -TensorFlow scripts utilize a [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) for distributed training, and you don't need to add any additional arguments to the training script. The TensorFlow script will use multiple GPUs by default if they are available. - -## Run a script on a TPU - - - -Tensor Processing Units (TPUs) are specifically designed to accelerate performance. 
PyTorch supports TPUs with the [XLA](https://www.tensorflow.org/xla) deep learning compiler (see [here](https://github.com/pytorch/xla/blob/master/README.md) for more details). To use a TPU, launch the `xla_spawn.py` script and use the `num_cores` argument to set the number of TPU cores you want to use. - -```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Tensor Processing Units (TPUs) are specifically designed to accelerate performance. TensorFlow scripts utilize a [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) for training on TPUs. To use a TPU, pass the name of the TPU resource to the `tpu` argument. - -```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Run a script with 🤗 Accelerate - -🤗 [Accelerate](https://huggingface.co/docs/accelerate) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: - -> Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts -```bash -pip install git+https://github.com/huggingface/accelerate -``` - -Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate supported scripts will have a `task_no_trainer.py` file in the folder. Begin by running the following command to create and save a configuration file: - -```bash -accelerate config -``` - -Test your setup to make sure it is configured correctly: - -```bash -accelerate test -``` - -Now you are ready to launch the training: - -```bash -accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization -``` - -## Use a custom dataset - -The summarization script supports custom datasets as long as they are a CSV or JSON Line file. When you use your own dataset, you need to specify several additional arguments: - -- `train_file` and `validation_file` specify the path to your training and validation files. -- `text_column` is the input text to summarize. -- `summary_column` is the target text to output. 
- -A summarization script using a custom dataset would look like this: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --text_column text_column_name \ - --summary_column summary_column_name \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --predict_with_generate -``` - -## Test a script - -It is often a good idea to run your script on a smaller number of dataset examples to ensure everything works as expected before committing to an entire dataset which may take hours to complete. Use the following arguments to truncate the dataset to a maximum number of samples: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Not all example scripts support the `max_predict_samples` argument. If you aren't sure whether your script supports this argument, add the `-h` argument to check: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Resume training from checkpoint - -Another helpful option to enable is resuming training from a previous checkpoint. This will ensure you can pick up where you left off without starting over if your training gets interrupted. There are two methods to resume training from a checkpoint. - -The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder. - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Share your model - -All scripts can upload your final model to the [Model Hub](https://huggingface.co/models). Make sure you are logged into Hugging Face before you begin: - -```bash -huggingface-cli login -``` - -Then add the `push_to_hub` argument to the script. 
This argument will create a repository with your Hugging Face username and the folder name specified in `output_dir`. - -To give your repository a specific name, use the `push_to_hub_model_id` argument to add it. The repository will be automatically listed under your namespace. - -The following example shows how to upload a model with a specific repository name: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` \ No newline at end of file diff --git a/docs/source/en/sagemaker.md b/docs/source/en/sagemaker.md new file mode 100644 index 000000000000..f0a5a5f9c114 --- /dev/null +++ b/docs/source/en/sagemaker.md @@ -0,0 +1,29 @@ + + +# Run training on Amazon SageMaker + +The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. + +### Table of Content + +- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) +- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) +- [Frequently Asked Questions](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/en/sagemaker.mdx b/docs/source/en/sagemaker.mdx deleted file mode 100644 index 1ffdd4326e4d..000000000000 --- a/docs/source/en/sagemaker.mdx +++ /dev/null @@ -1,25 +0,0 @@ - - -# Run training on Amazon SageMaker - -The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. - -### Table of Content - -- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) -- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) -- [Frequently Asked Questions](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/en/serialization.md b/docs/source/en/serialization.md new file mode 100644 index 000000000000..9fec884a8be4 --- /dev/null +++ b/docs/source/en/serialization.md @@ -0,0 +1,210 @@ + + +# Export to ONNX + +Deploying 🤗 Transformers models in production environments often requires, or can benefit from exporting the models into +a serialized format that can be loaded and executed on specialized runtimes and hardware. + +🤗 Optimum is an extension of Transformers that enables exporting models from PyTorch or TensorFlow to serialized formats +such as ONNX and TFLite through its `exporters` module. 🤗 Optimum also provides a set of performance optimization tools to train +and run models on targeted hardware with maximum efficiency. + +This guide demonstrates how you can export 🤗 Transformers models to ONNX with 🤗 Optimum, for the guide on exporting models to TFLite, +please refer to the [Export to TFLite page](tflite). 
+ +## Export to ONNX + +[ONNX (Open Neural Network eXchange)](http://onnx.ai) is an open standard that defines a common set of operators and a +common file format to represent deep learning models in a wide variety of frameworks, including PyTorch and +TensorFlow. When a model is exported to the ONNX format, these operators are used to +construct a computational graph (often called an _intermediate representation_) which +represents the flow of data through the neural network. + +By exposing a graph with standardized operators and data types, ONNX makes it easy to +switch between frameworks. For example, a model trained in PyTorch can be exported to +ONNX format and then imported in TensorFlow (and vice versa). + +Once exported to ONNX format, a model can be: +- optimized for inference via techniques such as [graph optimization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) and [quantization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization). +- run with ONNX Runtime via [`ORTModelForXXX` classes](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort), +which follow the same `AutoModel` API as the one you are used to in 🤗 Transformers. +- run with [optimized inference pipelines](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines), +which has the same API as the [`pipeline`] function in 🤗 Transformers. + +🤗 Optimum provides support for the ONNX export by leveraging configuration objects. These configuration objects come +ready-made for a number of model architectures, and are designed to be easily extendable to other architectures. + +For the list of ready-made configurations, please refer to [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/onnx/overview). + +There are two ways to export a 🤗 Transformers model to ONNX, here we show both: + +- export with 🤗 Optimum via CLI. +- export with 🤗 Optimum with `optimum.onnxruntime`. + +### Exporting a 🤗 Transformers model to ONNX with CLI + +To export a 🤗 Transformers model to ONNX, first install an extra dependency: + +```bash +pip install optimum[exporters] +``` + +To check out all available arguments, refer to the [🤗 Optimum docs](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli), +or view help in command line: + +```bash +optimum-cli export onnx --help +``` + +To export a model's checkpoint from the 🤗 Hub, for example, `distilbert-base-uncased-distilled-squad`, run the following command: + +```bash +optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/ +``` + +You should see the logs indicating progress and showing where the resulting `model.onnx` is saved, like this: + +```bash +Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx... + -[✓] ONNX model output names match reference model (start_logits, end_logits) + - Validating ONNX Model output "start_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) + - Validating ONNX Model output "end_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) +The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx +``` + +The example above illustrates exporting a checkpoint from 🤗 Hub. When exporting a local model, first make sure that you +saved both the model's weights and tokenizer files in the same directory (`local_path`). 
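For example, one way to produce such a directory is with `save_pretrained` (a minimal sketch; any checkpoint works, here we reuse `distilbert-base-uncased-distilled-squad` and the placeholder directory name `local_path`):

```python
>>> from transformers import AutoModelForQuestionAnswering, AutoTokenizer

>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

>>> # the model weights and the tokenizer files end up side by side in `local_path`
>>> model.save_pretrained("local_path")
>>> tokenizer.save_pretrained("local_path")  # doctest: +IGNORE_RESULT
```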
When using the CLI, pass the `local_path` to the `model` argument instead of the checkpoint name on the 🤗 Hub and provide the `--task` argument. You can review the list of supported tasks in the [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). If the `task` argument is not provided, it will default to the model architecture without any task-specific head.

```bash
optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/
```

The resulting `model.onnx` file can then be run on one of the [many accelerators](https://onnx.ai/supported-tools.html#deployModel) that support the ONNX standard. For example, we can load and run the model with [ONNX Runtime](https://onnxruntime.ai/) as follows:

```python
>>> from transformers import AutoTokenizer
>>> from optimum.onnxruntime import ORTModelForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
>>> model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
>>> inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
>>> outputs = model(**inputs)
```

The process is identical for TensorFlow checkpoints on the Hub. For instance, here's how you would export a pure TensorFlow checkpoint from the [Keras organization](https://huggingface.co/keras-io):

```bash
optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/
```

### Exporting a 🤗 Transformers model to ONNX with `optimum.onnxruntime`

As an alternative to the CLI, you can export a 🤗 Transformers model to ONNX programmatically like so:

```python
>>> from optimum.onnxruntime import ORTModelForSequenceClassification
>>> from transformers import AutoTokenizer

>>> model_checkpoint = "distilbert_base_uncased_squad"
>>> save_directory = "onnx/"

>>> # Load a model from transformers and export it to ONNX
>>> ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
>>> tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

>>> # Save the onnx model and tokenizer
>>> ort_model.save_pretrained(save_directory)
>>> tokenizer.save_pretrained(save_directory)
```

### Exporting a model for an unsupported architecture

If you wish to contribute by adding support for a model that cannot currently be exported, you should first check if it is supported in [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview), and if it is not, [contribute to 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) directly.

### Exporting a model with `transformers.onnx`

`transformers.onnx` is no longer maintained; please export models with 🤗 Optimum as described above. This section will be removed in future versions.

To export a 🤗 Transformers model to ONNX with `transformers.onnx`, install extra dependencies:

```bash
pip install transformers[onnx]
```

Use the `transformers.onnx` package as a Python module to export a checkpoint using a ready-made configuration:

```bash
python -m transformers.onnx --model=distilbert-base-uncased onnx/
```

This exports an ONNX graph of the checkpoint defined by the `--model` argument. Pass any checkpoint on the 🤗 Hub or one that's stored locally. The resulting `model.onnx` file can then be run on one of the many accelerators that support the ONNX standard.
For example, +load and run the model with ONNX Runtime as follows: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # ONNX Runtime expects NumPy arrays as input +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +The required output names (like `["last_hidden_state"]`) can be obtained by taking a look at the ONNX configuration of +each model. For example, for DistilBERT we have: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"] +``` + +The process is identical for TensorFlow checkpoints on the Hub. For example, export a pure TensorFlow checkpoint like so: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +To export a model that's stored locally, save the model's weights and tokenizer files in the same directory (e.g. `local-pt-checkpoint`), +then export it to ONNX by pointing the `--model` argument of the `transformers.onnx` package to the desired directory: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` \ No newline at end of file diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx deleted file mode 100644 index 7079a91f40c3..000000000000 --- a/docs/source/en/serialization.mdx +++ /dev/null @@ -1,538 +0,0 @@ - - -# Export to ONNX - -If you need to deploy 🤗 Transformers models in production environments, we recommend -exporting them to a serialized format that can be loaded and executed on specialized -runtimes and hardware. In this guide, we'll show you how to export 🤗 Transformers -models to [ONNX (Open Neural Network eXchange)](http://onnx.ai). - -ONNX is an open standard that defines a common set of operators and a common file format -to represent deep learning models in a wide variety of frameworks, including PyTorch and -TensorFlow. When a model is exported to the ONNX format, these operators are used to -construct a computational graph (often called an _intermediate representation_) which -represents the flow of data through the neural network. - -By exposing a graph with standardized operators and data types, ONNX makes it easy to -switch between frameworks. For example, a model trained in PyTorch can be exported to -ONNX format and then imported in TensorFlow (and vice versa). - -🤗 Transformers provides a [`transformers.onnx`](main_classes/onnx) package that enables -you to convert model checkpoints to an ONNX graph by leveraging configuration objects. -These configuration objects come ready made for a number of model architectures, and are -designed to be easily extendable to other architectures. - - - -You can also export 🤗 Transformers models with the [`optimum.exporters.onnx` package](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model) -from 🤗 Optimum. - -Once exported, a model can be: - -- Optimized for inference via techniques such as quantization and graph optimization. 
-- Run with ONNX Runtime via [`ORTModelForXXX` classes](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort), -which follow the same `AutoModel` API as the one you are used to in 🤗 Transformers. -- Run with [optimized inference pipelines](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines), -which has the same API as the [`pipeline`] function in 🤗 Transformers. - -To explore all these features, check out the [🤗 Optimum library](https://github.com/huggingface/optimum). - - - -Ready-made configurations include the following architectures: - - - -- ALBERT -- BART -- BEiT -- BERT -- BigBird -- BigBird-Pegasus -- Blenderbot -- BlenderbotSmall -- BLOOM -- CamemBERT -- Chinese-CLIP -- CLIP -- CodeGen -- Conditional DETR -- ConvBERT -- ConvNeXT -- Data2VecText -- Data2VecVision -- DeBERTa -- DeBERTa-v2 -- DeiT -- DETR -- DistilBERT -- ELECTRA -- ERNIE -- FlauBERT -- GPT Neo -- GPT-J -- GPT-Sw3 -- GroupViT -- I-BERT -- ImageGPT -- LayoutLM -- LayoutLMv3 -- LeViT -- Longformer -- LongT5 -- M2M100 -- Marian -- mBART -- MobileBERT -- MobileNetV1 -- MobileNetV2 -- MobileViT -- MT5 -- OpenAI GPT-2 -- OWL-ViT -- Perceiver -- PLBart -- PoolFormer -- RemBERT -- ResNet -- RoBERTa -- RoBERTa-PreLayerNorm -- RoFormer -- SegFormer -- SqueezeBERT -- Swin Transformer -- T5 -- Table Transformer -- Vision Encoder decoder -- ViT -- Whisper -- XLM -- XLM-RoBERTa -- XLM-RoBERTa-XL -- YOLOS - -In the next two sections, we'll show you how to: - -* Export a supported model using the `transformers.onnx` package. -* Export a custom model for an unsupported architecture. - -## Exporting a model to ONNX - - - -The recommended way of exporting a model is now to use -[`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli), -do not worry it is very similar to `transformers.onnx`! - - - -To export a 🤗 Transformers model to ONNX, you'll first need to install some extra -dependencies: - -```bash -pip install transformers[onnx] -``` - -The `transformers.onnx` package can then be used as a Python module: - -```bash -python -m transformers.onnx --help - -usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output - -positional arguments: - output Path indicating where to store generated ONNX model. - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load model from. - --feature {causal-lm, ...} - The type of features to export the model with. - --opset OPSET ONNX opset version to export the model with. - --atol ATOL Absolute difference tolerance when validating the model. -``` - -Exporting a checkpoint using a ready-made configuration can be done as follows: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased onnx/ -``` - -You should see the following logs: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'last_hidden_state'}) - - Validating ONNX Model output "last_hidden_state": - -[✓] (2, 8, 768) matches (2, 8, 768) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -This exports an ONNX graph of the checkpoint defined by the `--model` argument. In this -example, it is `distilbert-base-uncased`, but it can be any checkpoint on the Hugging -Face Hub or one that's stored locally. 
- -The resulting `model.onnx` file can then be run on one of the [many -accelerators](https://onnx.ai/supported-tools.html#deployModel) that support the ONNX -standard. For example, we can load and run the model with [ONNX -Runtime](https://onnxruntime.ai/) as follows: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -The required output names (like `["last_hidden_state"]`) can be obtained by taking a -look at the ONNX configuration of each model. For example, for DistilBERT we have: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"] -``` - -The process is identical for TensorFlow checkpoints on the Hub. For example, we can -export a pure TensorFlow checkpoint from the [Keras -organization](https://huggingface.co/keras-io) as follows: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -To export a model that's stored locally, you'll need to have the model's weights and -tokenizer files stored in a directory. For example, we can load and save a checkpoint as -follows: - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> # Load tokenizer and PyTorch weights form the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-pt-checkpoint") ->>> pt_model.save_pretrained("local-pt-checkpoint") -``` - -Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` -argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` - -```python ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> # Load tokenizer and TensorFlow weights from the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-tf-checkpoint") ->>> tf_model.save_pretrained("local-tf-checkpoint") -``` - -Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` -argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-tf-checkpoint onnx/ -``` - - -## Selecting features for different model tasks - - - -The recommended way of exporting a model is now to use `optimum.exporters.onnx`. -You can check the [🤗 Optimum documentation](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#selecting-a-task) -to learn how to select a task. - - - -Each ready-made configuration comes with a set of _features_ that enable you to export -models for different types of tasks. 
As shown in the table below, each feature is -associated with a different `AutoClass`: - -| Feature | Auto Class | -| ------------------------------------ | ------------------------------------ | -| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | -| `default`, `default-with-past` | `AutoModel` | -| `masked-lm` | `AutoModelForMaskedLM` | -| `question-answering` | `AutoModelForQuestionAnswering` | -| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | -| `sequence-classification` | `AutoModelForSequenceClassification` | -| `token-classification` | `AutoModelForTokenClassification` | - -For each configuration, you can find the list of supported features via the -[`~transformers.onnx.FeaturesManager`]. For example, for DistilBERT we have: - -```python ->>> from transformers.onnx.features import FeaturesManager - ->>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys()) ->>> print(distilbert_features) -["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"] -``` - -You can then pass one of these features to the `--feature` argument in the -`transformers.onnx` package. For example, to export a text-classification model we can -pick a fine-tuned model from the Hub and run: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \ - --feature=sequence-classification onnx/ -``` - -This displays the following logs: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'logits'}) - - Validating ONNX Model output "logits": - -[✓] (2, 2) matches (2, 2) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Notice that in this case, the output names from the fine-tuned model are `logits` -instead of the `last_hidden_state` we saw with the `distilbert-base-uncased` checkpoint -earlier. This is expected since the fine-tuned model has a sequence classification head. - - - -The features that have a `with-past` suffix (like `causal-lm-with-past`) correspond to -model classes with precomputed hidden states (key and values in the attention blocks) -that can be used for fast autoregressive decoding. - - - - - -For `VisionEncoderDecoder` type models, the encoder and decoder parts are -exported separately as two ONNX files named `encoder_model.onnx` and `decoder_model.onnx` respectively. - - - - -## Exporting a model for an unsupported architecture - - - -If you wish to contribute by adding support for a model that cannot be currently exported, you should first check if it is -supported in [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/package_reference/configuration#supported-architectures), -and if it is not, [contribute to 🤗 Optimum](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/contribute) -directly. - - - -If you wish to export a model whose architecture is not natively supported by the -library, there are three main steps to follow: - -1. Implement a custom ONNX configuration. -2. Export the model to ONNX. -3. Validate the outputs of the PyTorch and exported models. - -In this section, we'll look at how DistilBERT was implemented to show what's involved -with each step. - -### Implementing a custom ONNX configuration - -Let's start with the ONNX configuration object. 
We provide three abstract classes that -you should inherit from, depending on the type of model architecture you wish to export: - -* Encoder-based models inherit from [`~onnx.config.OnnxConfig`] -* Decoder-based models inherit from [`~onnx.config.OnnxConfigWithPast`] -* Encoder-decoder models inherit from [`~onnx.config.OnnxSeq2SeqConfigWithPast`] - - - -A good way to implement a custom ONNX configuration is to look at the existing -implementation in the `configuration_.py` file of a similar architecture. - - - -Since DistilBERT is an encoder-based model, its configuration inherits from -`OnnxConfig`: - -```python ->>> from typing import Mapping, OrderedDict ->>> from transformers.onnx import OnnxConfig - - ->>> class DistilBertOnnxConfig(OnnxConfig): -... @property -... def inputs(self) -> Mapping[str, Mapping[int, str]]: -... return OrderedDict( -... [ -... ("input_ids", {0: "batch", 1: "sequence"}), -... ("attention_mask", {0: "batch", 1: "sequence"}), -... ] -... ) -``` - -Every configuration object must implement the `inputs` property and return a mapping, -where each key corresponds to an expected input, and each value indicates the axis of -that input. For DistilBERT, we can see that two inputs are required: `input_ids` and -`attention_mask`. These inputs have the same shape of `(batch_size, sequence_length)` -which is why we see the same axes used in the configuration. - - - -Notice that `inputs` property for `DistilBertOnnxConfig` returns an `OrderedDict`. This -ensures that the inputs are matched with their relative position within the -`PreTrainedModel.forward()` method when tracing the graph. We recommend using an -`OrderedDict` for the `inputs` and `outputs` properties when implementing custom ONNX -configurations. - - - -Once you have implemented an ONNX configuration, you can instantiate it by providing the -base model's configuration as follows: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config = DistilBertOnnxConfig(config) -``` - -The resulting object has several useful properties. For example, you can view the ONNX -operator set that will be used during the export: - -```python ->>> print(onnx_config.default_onnx_opset) -11 -``` - -You can also view the outputs associated with the model as follows: - -```python ->>> print(onnx_config.outputs) -OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) -``` - -Notice that the outputs property follows the same structure as the inputs; it returns an -`OrderedDict` of named outputs and their shapes. The output structure is linked to the -choice of feature that the configuration is initialised with. By default, the ONNX -configuration is initialized with the `default` feature that corresponds to exporting a -model loaded with the `AutoModel` class. If you want to export a model for another task, -just provide a different feature to the `task` argument when you initialize the ONNX -configuration. 
For example, if we wished to export DistilBERT with a sequence -classification head, we could use: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") ->>> print(onnx_config_for_seq_clf.outputs) -OrderedDict([('logits', {0: 'batch'})]) -``` - - - -All of the base properties and methods associated with [`~onnx.config.OnnxConfig`] and -the other configuration classes can be overridden if needed. Check out [`BartOnnxConfig`] -for an advanced example. - - - -### Exporting the model - -Once you have implemented the ONNX configuration, the next step is to export the model. -Here we can use the `export()` function provided by the `transformers.onnx` package. -This function expects the ONNX configuration, along with the base model and tokenizer, -and the path to save the exported file: - -```python ->>> from pathlib import Path ->>> from transformers.onnx import export ->>> from transformers import AutoTokenizer, AutoModel - ->>> onnx_path = Path("model.onnx") ->>> model_ckpt = "distilbert-base-uncased" ->>> base_model = AutoModel.from_pretrained(model_ckpt) ->>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) - ->>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) -``` - -The `onnx_inputs` and `onnx_outputs` returned by the `export()` function are lists of -the keys defined in the `inputs` and `outputs` properties of the configuration. Once the -model is exported, you can test that the model is well formed as follows: - -```python ->>> import onnx - ->>> onnx_model = onnx.load("model.onnx") ->>> onnx.checker.check_model(onnx_model) -``` - - - -If your model is larger than 2GB, you will see that many additional files are created -during the export. This is _expected_ because ONNX uses [Protocol -Buffers](https://developers.google.com/protocol-buffers/) to store the model and these -have a size limit of 2GB. See the [ONNX -documentation](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) for -instructions on how to load models with external data. - - - -### Validating the model outputs - -The final step is to validate that the outputs from the base and exported model agree -within some absolute tolerance. Here we can use the `validate_model_outputs()` function -provided by the `transformers.onnx` package as follows: - -```python ->>> from transformers.onnx import validate_model_outputs - ->>> validate_model_outputs( -... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation -... ) -``` - -This function uses the [`~transformers.onnx.OnnxConfig.generate_dummy_inputs`] method to -generate inputs for the base and exported model, and the absolute tolerance can be -defined in the configuration. We generally find numerical agreement in the 1e-6 to 1e-4 -range, although anything smaller than 1e-3 is likely to be OK. - -## Contributing a new configuration to 🤗 Transformers - -We are looking to expand the set of ready-made configurations and welcome contributions -from the community! 
If you would like to contribute your addition to the library, you -will need to: - -* Implement the ONNX configuration in the corresponding `configuration_.py` -file -* Include the model architecture and corresponding features in - [`~onnx.features.FeatureManager`] -* Add your model architecture to the tests in `test_onnx_v2.py` - -Check out how the configuration for [IBERT was -contributed](https://github.com/huggingface/transformers/pull/14868/files) to get an -idea of what's involved. diff --git a/docs/source/en/task_summary.md b/docs/source/en/task_summary.md new file mode 100644 index 000000000000..6cb49f88ba7f --- /dev/null +++ b/docs/source/en/task_summary.md @@ -0,0 +1,341 @@ + + +# What 🤗 Transformers can do + +🤗 Transformers is a library of pretrained state-of-the-art models for natural language processing (NLP), computer vision, and audio and speech processing tasks. Not only does the library contain Transformer models, but it also has non-Transformer models like modern convolutional networks for computer vision tasks. If you look at some of the most popular consumer products today, like smartphones, apps, and televisions, odds are that some kind of deep learning technology is behind it. Want to remove a background object from a picture taken by your smartphone? This is an example of a panoptic segmentation task (don't worry if you don't know what this means yet, we'll describe it in the following sections!). + +This page provides an overview of the different speech and audio, computer vision, and NLP tasks that can be solved with the 🤗 Transformers library in just three lines of code! + +## Audio + +Audio and speech processing tasks are a little different from the other modalities mainly because audio as an input is a continuous signal. Unlike text, a raw audio waveform can't be neatly split into discrete chunks the way a sentence can be divided into words. To get around this, the raw audio signal is typically sampled at regular intervals. If you take more samples within an interval, the sampling rate is higher, and the audio more closely resembles the original audio source. + +Previous approaches preprocessed the audio to extract useful features from it. It is now more common to start audio and speech processing tasks by directly feeding the raw audio waveform to a feature encoder to extract an audio representation. This simplifies the preprocessing step and allows the model to learn the most essential features. + +### Audio classification + +Audio classification is a task that labels audio data from a predefined set of classes. 
It is a broad category with many specific applications, some of which include: + +* acoustic scene classification: label audio with a scene label ("office", "beach", "stadium") +* acoustic event detection: label audio with a sound event label ("car horn", "whale calling", "glass breaking") +* tagging: label audio containing multiple sounds (birdsongs, speaker identification in a meeting) +* music classification: label music with a genre label ("metal", "hip-hop", "country") + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="audio-classification", model="superb/hubert-base-superb-er") +>>> preds = classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4532, 'label': 'hap'}, + {'score': 0.3622, 'label': 'sad'}, + {'score': 0.0943, 'label': 'neu'}, + {'score': 0.0903, 'label': 'ang'}] +``` + +### Automatic speech recognition + +Automatic speech recognition (ASR) transcribes speech into text. It is one of the most common audio tasks due partly to speech being such a natural form of human communication. Today, ASR systems are embedded in "smart" technology products like speakers, phones, and cars. We can ask our virtual assistants to play music, set reminders, and tell us the weather. + +But one of the key challenges Transformer architectures have helped with is in low-resource languages. By pretraining on large amounts of speech data, finetuning the model on only one hour of labeled speech data in a low-resource language can still produce high-quality results compared to previous ASR systems trained on 100x more labeled data. + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small") +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} +``` + +## Computer vision + +One of the first and earliest successful computer vision tasks was recognizing images of zip code numbers using a [convolutional neural network (CNN)](glossary#convolution). An image is composed of pixels, and each pixel has a numerical value. This makes it easy to represent an image as a matrix of pixel values. Each particular combination of pixel values describes the colors of an image. + +Two general ways computer vision tasks can be solved are: + +1. Use convolutions to learn the hierarchical features of an image from low-level features to high-level abstract things. +2. Split an image into patches and use a Transformer to gradually learn how each image patch is related to each other to form an image. Unlike the bottom-up approach favored by a CNN, this is kind of like starting out with a blurry image and then gradually bringing it into focus. + +### Image classification + +Image classification labels an entire image from a predefined set of classes. 
Like most classification tasks, there are many practical use cases for image classification, some of which include: + +* healthcare: label medical images to detect disease or monitor patient health +* environment: label satellite images to monitor deforestation, inform wildland management or detect wildfires +* agriculture: label images of crops to monitor plant health or satellite images for land use monitoring +* ecology: label images of animal or plant species to monitor wildlife populations or track endangered species + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="image-classification") +>>> preds = classifier( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.4335, 'label': 'lynx, catamount'} +{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'} +{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'} +{'score': 0.0239, 'label': 'Egyptian cat'} +{'score': 0.0229, 'label': 'tiger cat'} +``` + +### Object detection + +Unlike image classification, object detection identifies multiple objects within an image and the objects' positions in an image (defined by the bounding box). Some example applications of object detection include: + +* self-driving vehicles: detect everyday traffic objects such as other vehicles, pedestrians, and traffic lights +* remote sensing: disaster monitoring, urban planning, and weather forecasting +* defect detection: detect cracks or structural damage in buildings, and manufacturing defects + +```py +>>> from transformers import pipeline + +>>> detector = pipeline(task="object-detection") +>>> preds = detector( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds] +>>> preds +[{'score': 0.9865, + 'label': 'cat', + 'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}] +``` + +### Image segmentation + +Image segmentation is a pixel-level task that assigns every pixel in an image to a class. It differs from object detection, which uses bounding boxes to label and predict objects in an image because segmentation is more granular. Segmentation can detect objects at a pixel-level. There are several types of image segmentation: + +* instance segmentation: in addition to labeling the class of an object, it also labels each distinct instance of an object ("dog-1", "dog-2") +* panoptic segmentation: a combination of semantic and instance segmentation; it labels each pixel with a semantic class **and** each distinct instance of an object + +Segmentation tasks are helpful in self-driving vehicles to create a pixel-level map of the world around them so they can navigate safely around pedestrians and other vehicles. It is also useful for medical imaging, where the task's finer granularity can help identify abnormal cells or organ features. Image segmentation can also be used in ecommerce to virtually try on clothes or create augmented reality experiences by overlaying objects in the real world through your camera. + +```py +>>> from transformers import pipeline + +>>> segmenter = pipeline(task="image-segmentation") +>>> preds = segmenter( +... 
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.9879, 'label': 'LABEL_184'} +{'score': 0.9973, 'label': 'snow'} +{'score': 0.9972, 'label': 'cat'} +``` + +### Depth estimation + +Depth estimation predicts the distance of each pixel in an image from the camera. This computer vision task is especially important for scene understanding and reconstruction. For example, in self-driving cars, vehicles need to understand how far objects like pedestrians, traffic signs, and other vehicles are to avoid obstacles and collisions. Depth information is also helpful for constructing 3D representations from 2D images and can be used to create high-quality 3D representations of biological structures or buildings. + +There are two approaches to depth estimation: + +* stereo: depths are estimated by comparing two images of the same image from slightly different angles +* monocular: depths are estimated from a single image + +```py +>>> from transformers import pipeline + +>>> depth_estimator = pipeline(task="depth-estimation") +>>> preds = depth_estimator( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +``` + +## Natural language processing + +NLP tasks are among the most common types of tasks because text is such a natural way for us to communicate. To get text into a format recognized by a model, it needs to be tokenized. This means dividing a sequence of text into separate words or subwords (tokens) and then converting these tokens into numbers. As a result, you can represent a sequence of text as a sequence of numbers, and once you have a sequence of numbers, it can be input into a model to solve all sorts of NLP tasks! + +### Text classification + +Like classification tasks in any modality, text classification labels a sequence of text (it can be sentence-level, a paragraph, or a document) from a predefined set of classes. There are many practical applications for text classification, some of which include: + +* sentiment analysis: label text according to some polarity like `positive` or `negative` which can inform and support decision-making in fields like politics, finance, and marketing +* content classification: label text according to some topic to help organize and filter information in news and social media feeds (`weather`, `sports`, `finance`, etc.) + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="sentiment-analysis") +>>> preds = classifier("Hugging Face is the best thing since sliced bread!") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.9991, 'label': 'POSITIVE'}] +``` + +### Token classification + +In any NLP task, text is preprocessed by separating the sequence of text into individual words or subwords. These are known as [tokens](/glossary#token). Token classification assigns each token a label from a predefined set of classes. + +Two common types of token classification are: + +* named entity recognition (NER): label a token according to an entity category like organization, person, location or date. NER is especially popular in biomedical settings, where it can label genes, proteins, and drug names. +* part-of-speech tagging (POS): label a token according to its part-of-speech like noun, verb, or adjective. 
POS is useful for helping translation systems understand how two identical words are grammatically different (bank as a noun versus bank as a verb). + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline(task="ner") +>>> preds = classifier("Hugging Face is a French company based in New York City.") +>>> preds = [ +... { +... "entity": pred["entity"], +... "score": round(pred["score"], 4), +... "index": pred["index"], +... "word": pred["word"], +... "start": pred["start"], +... "end": pred["end"], +... } +... for pred in preds +... ] +>>> print(*preds, sep="\n") +{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} +{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} +{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} +{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24} +{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45} +{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50} +{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55} +``` + +### Question answering + +Question answering is another token-level task that returns an answer to a question, sometimes with context (open-domain) and other times without context (closed-domain). This task happens whenever we ask a virtual assistant something like whether a restaurant is open. It can also provide customer or technical support and help search engines retrieve the relevant information you're asking for. + +There are two common types of question answering: + +* extractive: given a question and some context, the answer is a span of text from the context the model must extract +* abstractive: given a question and some context, the answer is generated from the context; this approach is handled by the [`Text2TextGenerationPipeline`] instead of the [`QuestionAnsweringPipeline`] shown below + + +```py +>>> from transformers import pipeline + +>>> question_answerer = pipeline(task="question-answering") +>>> preds = question_answerer( +... question="What is the name of the repository?", +... context="The name of the repository is huggingface/transformers", +... ) +>>> print( +... f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}" +... ) +score: 0.9327, start: 30, end: 54, answer: huggingface/transformers +``` + +### Summarization + +Summarization creates a shorter version of a text from a longer one while trying to preserve most of the meaning of the original document. Summarization is a sequence-to-sequence task; it outputs a shorter text sequence than the input. There are a lot of long-form documents that can be summarized to help readers quickly understand the main points. Legislative bills, legal and financial documents, patents, and scientific papers are a few examples of documents that could be summarized to save readers time and serve as a reading aid. + +Like question answering, there are two types of summarization: + +* extractive: identify and extract the most important sentences from the original text +* abstractive: generate the target summary (which may include new words not in the input document) from the original text; the [`SummarizationPipeline`] uses the abstractive approach + +```py +>>> from transformers import pipeline + +>>> summarizer = pipeline(task="summarization") +>>> summarizer( +... 
"In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles." +... ) +[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}] +``` + +### Translation + +Translation converts a sequence of text in one language to another. It is important in helping people from different backgrounds communicate with each other, help translate content to reach wider audiences, and even be a learning tool to help people learn a new language. Along with summarization, translation is a sequence-to-sequence task, meaning the model receives an input sequence and returns a target output sequence. + +In the early days, translation models were mostly monolingual, but recently, there has been increasing interest in multilingual models that can translate between many pairs of languages. + +```py +>>> from transformers import pipeline + +>>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning." +>>> translator = pipeline(task="translation", model="t5-small") +>>> translator(text) +[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}] +``` + +### Language modeling + +Language modeling is a task that predicts a word in a sequence of text. It has become a very popular NLP task because a pretrained language model can be finetuned for many other downstream tasks. Lately, there has been a lot of interest in large language models (LLMs) which demonstrate zero- or few-shot learning. This means the model can solve tasks it wasn't explicitly trained to do! Language models can be used to generate fluent and convincing text, though you need to be careful since the text may not always be accurate. + +There are two types of language modeling: + +* causal: the model's objective is to predict the next token in a sequence, and future tokens are masked + + ```py + >>> from transformers import pipeline + + >>> prompt = "Hugging Face is a community-based open-source platform for machine learning." + >>> generator = pipeline(task="text-generation") + >>> generator(prompt) # doctest: +SKIP + ``` + +* masked: the model's objective is to predict a masked token in a sequence with full access to the tokens in the sequence + + ```py + >>> text = "Hugging Face is a community-based open-source for machine learning." + >>> fill_mask = pipeline(task="fill-mask") + >>> preds = fill_mask(text, top_k=1) + >>> preds = [ + ... { + ... "score": round(pred["score"], 4), + ... "token": pred["token"], + ... "token_str": pred["token_str"], + ... "sequence": pred["sequence"], + ... } + ... for pred in preds + ... 
] + >>> preds + [{'score': 0.2236, + 'token': 1761, + 'token_str': ' platform', + 'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}] + ``` + +## Multimodal + +Multimodal tasks require a model to process multiple data modalities (text, image, audio, video) to solve a particular problem. Image captioning is an example of a multimodal task where the model takes an image as input and outputs a sequence of text describing the image or some properties of the image. + +Although multimodal models work with different data types or modalities, internally, the preprocessing steps help the model convert all the data types into embeddings (vectors or list of numbers that holds meaningful information about the data). For a task like image captioning, the model learns relationships between image embeddings and text embeddings. + +### Document question answering + +Document question answering is a task that answers natural language questions from a document. Unlike a token-level question answering task which takes text as input, document question answering takes an image of a document as input along with a question about the document and returns an answer. Document question answering can be used to parse structured documents and extract key information from it. In the example below, the total amount and change due can be extracted from a receipt. + +```py +>>> from transformers import pipeline +>>> from PIL import Image +>>> import requests + +>>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices") +>>> preds = doc_question_answerer( +... question="What is the total amount?", +... image=image, +... ) +>>> preds +[{'score': 0.8531, 'answer': '17,000', 'start': 4, 'end': 4}] +``` + +Hopefully, this page has given you some more background information about all the types of tasks in each modality and the practical importance of each one. In the next [section](tasks_explained), you'll learn **how** 🤗 Transformers work to solve these tasks. \ No newline at end of file diff --git a/docs/source/en/task_summary.mdx b/docs/source/en/task_summary.mdx deleted file mode 100644 index 697ee21df5f9..000000000000 --- a/docs/source/en/task_summary.mdx +++ /dev/null @@ -1,1134 +0,0 @@ - - -# Summary of the tasks - -[[open-in-colab]] - -This page shows the most frequent use-cases when using the library. The models available allow for many different -configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage for -tasks such as image classification, question answering, sequence classification, named entity recognition and others. - -These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint, -automatically selecting the correct model architecture. Please check the [`AutoModel`] documentation -for more information. Feel free to modify the code to be more specific and adapt it to your specific use-case. - -In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These -checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the -following: - -- Not all models were fine-tuned on all tasks. 
If you want to fine-tune a model on a specific task, you can leverage - one of the *run_$TASK.py* scripts in the [examples](https://github.com/huggingface/transformers/tree/main/examples) directory. -- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case and - domain. As mentioned previously, you may leverage the [examples](https://github.com/huggingface/transformers/tree/main/examples) scripts to fine-tune your model, or you may - create your own training script. - -In order to do an inference on a task, several mechanisms are made available by the library: - -- Pipelines: very easy-to-use abstractions, which require as little as two lines of code. -- Direct model use: Less abstractions, but more flexibility and power via a direct access to a tokenizer - (PyTorch/TensorFlow) and full inference capacity. - -Both approaches are showcased here. - - - -All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a -checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the -additional head that is used for the task, initializing the weights of that head randomly. - -This would produce random output. - - - -## Sequence Classification - -Sequence classification is the task of classifying sequences according to a given number of classes. An example of -sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune a -model on a GLUE sequence classification task, you may leverage the [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py), [run_tf_glue.py](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification/run_tf_glue.py), [run_tf_text_classification.py](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification/run_tf_text_classification.py) or [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) scripts. - -Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative. It -leverages a fine-tuned model on sst2, which is a GLUE task. - -This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis") - ->>> result = classifier("I hate you")[0] ->>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: NEGATIVE, with score: 0.9991 - ->>> result = classifier("I love you")[0] ->>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9999 -``` - -Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases of -each other. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it - with the weights stored in the checkpoint. -2. Build a sequence from the two sentences, with the correct model-specific separators, token type ids and attention - masks (which will be created automatically by the tokenizer). -3. Pass this sequence through the model so that it is classified in one of the two available classes: 0 (not a - paraphrase) and 1 (is a paraphrase). -4. 
Compute the softmax of the result to get probabilities over the classes. -5. Print the results. - - - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification ->>> import torch - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") - ->>> classes = ["not paraphrase", "is paraphrase"] - ->>> sequence_0 = "The company HuggingFace is based in New York City" ->>> sequence_1 = "Apples are especially bad for your health" ->>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - ->>> # The tokenizer will automatically add any model specific separators (i.e. and ) and tokens to ->>> # the sequence, as well as compute the attention masks. ->>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt") ->>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt") - ->>> paraphrase_classification_logits = model(**paraphrase).logits ->>> not_paraphrase_classification_logits = model(**not_paraphrase).logits - ->>> paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0] ->>> not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0] - ->>> # Should be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%") -not paraphrase: 10% -is paraphrase: 90% - ->>> # Should not be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%") -not paraphrase: 94% -is paraphrase: 6% -``` - - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") ->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") - ->>> classes = ["not paraphrase", "is paraphrase"] - ->>> sequence_0 = "The company HuggingFace is based in New York City" ->>> sequence_1 = "Apples are especially bad for your health" ->>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - ->>> # The tokenizer will automatically add any model specific separators (i.e. and ) and tokens to ->>> # the sequence, as well as compute the attention masks. ->>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf") ->>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf") - ->>> paraphrase_classification_logits = model(paraphrase).logits ->>> not_paraphrase_classification_logits = model(not_paraphrase).logits - ->>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0] ->>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0] - ->>> # Should be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%") -not paraphrase: 10% -is paraphrase: 90% - ->>> # Should not be paraphrase ->>> for i in range(len(classes)): -... print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%") -not paraphrase: 94% -is paraphrase: 6% -``` - - - -## Extractive Question Answering - -Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a -question answering dataset is the SQuAD dataset, which is entirely based on that task. 
If you would like to fine-tune a -model on a SQuAD task, you may leverage the [run_qa.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering/run_qa.py) and -[run_tf_squad.py](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering/run_tf_squad.py) -scripts. - - -Here is an example of using pipelines to do question answering: extracting an answer from a text given a question. It -leverages a fine-tuned model on SQuAD. - -```py ->>> from transformers import pipeline - ->>> question_answerer = pipeline("question-answering") - ->>> context = r""" -... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a -... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune -... a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script. -... """ -``` - -This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values, which are the -positions of the extracted answer in the text. - -```py ->>> result = question_answerer(question="What is extractive question answering?", context=context) ->>> print( -... f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}" -... ) -Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95 - ->>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context) ->>> print( -... f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}" -... ) -Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160 -``` - -Here is an example of question answering using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it - with the weights stored in the checkpoint. -2. Define a text and a few questions. -3. Iterate over the questions and build a sequence from the text and the current question, with the correct - model-specific separators, token type ids and attention masks. -4. Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and - text), for both the start and end positions. -5. Compute the softmax of the result to get probabilities over the tokens. -6. Fetch the tokens from the identified start and stop values, convert those tokens to a string. -7. Print the results. - - - -```py ->>> from transformers import AutoTokenizer, AutoModelForQuestionAnswering ->>> import torch - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") ->>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") - ->>> text = r""" -... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose -... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural -... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between -... TensorFlow 2.0 and PyTorch. -... """ - ->>> questions = [ -... "How many pretrained models are available in 🤗 Transformers?", -... 
"What does 🤗 Transformers provide?", -... "🤗 Transformers provides interoperability between which frameworks?", -... ] - ->>> for question in questions: -... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt") -... input_ids = inputs["input_ids"].tolist()[0] - -... outputs = model(**inputs) -... answer_start_scores = outputs.start_logits -... answer_end_scores = outputs.end_logits - -... # Get the most likely beginning of answer with the argmax of the score -... answer_start = torch.argmax(answer_start_scores) -... # Get the most likely end of answer with the argmax of the score -... answer_end = torch.argmax(answer_end_scores) + 1 - -... answer = tokenizer.convert_tokens_to_string( -... tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]) -... ) - -... print(f"Question: {question}") -... print(f"Answer: {answer}") -Question: How many pretrained models are available in 🤗 Transformers? -Answer: over 32 + -Question: What does 🤗 Transformers provide? -Answer: general - purpose architectures -Question: 🤗 Transformers provides interoperability between which frameworks? -Answer: tensorflow 2. 0 and pytorch -``` - - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") ->>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") - ->>> text = r""" -... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose -... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural -... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between -... TensorFlow 2.0 and PyTorch. -... """ - ->>> questions = [ -... "How many pretrained models are available in 🤗 Transformers?", -... "What does 🤗 Transformers provide?", -... "🤗 Transformers provides interoperability between which frameworks?", -... ] - ->>> for question in questions: -... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf") -... input_ids = inputs["input_ids"].numpy()[0] - -... outputs = model(inputs) -... answer_start_scores = outputs.start_logits -... answer_end_scores = outputs.end_logits - -... # Get the most likely beginning of answer with the argmax of the score -... answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0] -... # Get the most likely end of answer with the argmax of the score -... answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1 - -... answer = tokenizer.convert_tokens_to_string( -... tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]) -... ) - -... print(f"Question: {question}") -... print(f"Answer: {answer}") -Question: How many pretrained models are available in 🤗 Transformers? -Answer: over 32 + -Question: What does 🤗 Transformers provide? -Answer: general - purpose architectures -Question: 🤗 Transformers provides interoperability between which frameworks? -Answer: tensorflow 2. 0 and pytorch -``` - - - -## Language Modeling - -Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular -transformer-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, -GPT-2 with causal language modeling. 
- -Language modeling can be useful outside of pretraining as well, for example to shift the model distribution to be -domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset or -on scientific papers e.g. [LysandreJik/arxiv-nlp](https://huggingface.co/lysandre/arxiv-nlp). - -### Masked Language Modeling - -Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to -fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the -right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis for -downstream tasks requiring bi-directional context, such as SQuAD (question answering, see [Lewis, Lui, Goyal et al.](https://arxiv.org/abs/1910.13461), part 4.2). If you would like to fine-tune a model on a masked language modeling -task, you may leverage the [run_mlm.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling/run_mlm.py) script. - -Here is an example of using pipelines to replace a mask from a sequence: - -```py ->>> from transformers import pipeline - ->>> unmasker = pipeline("fill-mask") -``` - -This outputs the sequences with the mask filled, the confidence score, and the token id in the tokenizer vocabulary: - -```py ->>> from pprint import pprint - ->>> pprint( -... unmasker( -... f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks." -... ) -... ) -[{'score': 0.1793, - 'sequence': 'HuggingFace is creating a tool that the community uses to solve ' - 'NLP tasks.', - 'token': 3944, - 'token_str': ' tool'}, - {'score': 0.1135, - 'sequence': 'HuggingFace is creating a framework that the community uses to ' - 'solve NLP tasks.', - 'token': 7208, - 'token_str': ' framework'}, - {'score': 0.0524, - 'sequence': 'HuggingFace is creating a library that the community uses to ' - 'solve NLP tasks.', - 'token': 5560, - 'token_str': ' library'}, - {'score': 0.0349, - 'sequence': 'HuggingFace is creating a database that the community uses to ' - 'solve NLP tasks.', - 'token': 8503, - 'token_str': ' database'}, - {'score': 0.0286, - 'sequence': 'HuggingFace is creating a prototype that the community uses to ' - 'solve NLP tasks.', - 'token': 17715, - 'token_str': ' prototype'}] -``` - -Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and - loads it with the weights stored in the checkpoint. -2. Define a sequence with a masked token, placing the `tokenizer.mask_token` instead of a word. -3. Encode that sequence into a list of IDs and find the position of the masked token in that list. -4. Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the - values are the scores attributed to each token. The model gives higher score to tokens it deems probable in that - context. -5. Retrieve the top 5 tokens using the PyTorch `topk` or TensorFlow `top_k` methods. -6. 
Replace the mask token by the tokens and print the results - - - -```py ->>> from transformers import AutoModelForMaskedLM, AutoTokenizer ->>> import torch - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") ->>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased") - ->>> sequence = ( -... "Distilled models are smaller than the models they mimic. Using them instead of the large " -... f"versions would help {tokenizer.mask_token} our carbon footprint." -... ) - ->>> inputs = tokenizer(sequence, return_tensors="pt") ->>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] - ->>> token_logits = model(**inputs).logits ->>> mask_token_logits = token_logits[0, mask_token_index, :] - ->>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist() - ->>> for token in top_5_tokens: -... print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token]))) -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint. -``` - - -```py ->>> from transformers import TFAutoModelForMaskedLM, AutoTokenizer ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") ->>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased") - ->>> sequence = ( -... "Distilled models are smaller than the models they mimic. Using them instead of the large " -... f"versions would help {tokenizer.mask_token} our carbon footprint." -... ) - ->>> inputs = tokenizer(sequence, return_tensors="tf") ->>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] - ->>> token_logits = model(**inputs).logits ->>> mask_token_logits = token_logits[0, mask_token_index, :] - ->>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy() - ->>> for token in top_5_tokens: -... print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token]))) -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint. -Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint. -``` - - - -This prints five sequences, with the top 5 tokens predicted by the model. - - -### Causal Language Modeling - -Causal language modeling is the task of predicting the token following a sequence of tokens. 
In this situation, the -model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting -for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the -[run_clm.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling/run_clm.py) script. - -Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the -input sequence. - - - -Here is an example of using the tokenizer and model and leveraging the -[`top_k_top_p_filtering`] method to sample the next token following an input sequence -of tokens. - -```py ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering ->>> import torch ->>> from torch import nn - ->>> tokenizer = AutoTokenizer.from_pretrained("gpt2") ->>> model = AutoModelForCausalLM.from_pretrained("gpt2") - ->>> sequence = f"Hugging Face is based in DUMBO, New York City, and" - ->>> inputs = tokenizer(sequence, return_tensors="pt") ->>> input_ids = inputs["input_ids"] - ->>> # get logits of last hidden state ->>> next_token_logits = model(**inputs).logits[:, -1, :] - ->>> # filter ->>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) - ->>> # sample ->>> probs = nn.functional.softmax(filtered_next_token_logits, dim=-1) ->>> next_token = torch.multinomial(probs, num_samples=1) - ->>> generated = torch.cat([input_ids, next_token], dim=-1) - ->>> resulting_string = tokenizer.decode(generated.tolist()[0]) ->>> print(resulting_string) -Hugging Face is based in DUMBO, New York City, and ... -``` - - -Here is an example of using the tokenizer and model and leveraging the -[`tf_top_k_top_p_filtering`] method to sample the next token following an input sequence -of tokens. - -```py ->>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering ->>> import tensorflow as tf - ->>> tokenizer = AutoTokenizer.from_pretrained("gpt2") ->>> model = TFAutoModelForCausalLM.from_pretrained("gpt2") - ->>> sequence = f"Hugging Face is based in DUMBO, New York City, and" - ->>> inputs = tokenizer(sequence, return_tensors="tf") ->>> input_ids = inputs["input_ids"] - ->>> # get logits of last hidden state ->>> next_token_logits = model(**inputs).logits[:, -1, :] - ->>> # filter ->>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) - ->>> # sample ->>> next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1) - ->>> generated = tf.concat([input_ids, next_token], axis=1) - ->>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0]) ->>> print(resulting_string) -Hugging Face is based in DUMBO, New York City, and ... -``` - - - -This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *is* or -*features*. - -In the next section, we show how [`generation.GenerationMixin.generate`] can be used to -generate multiple tokens up to a specified length instead of one token at a time. - -### Text Generation - -In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a -continuation from the given context. The following example shows how *GPT-2* can be used in pipelines to generate text. 
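-
-Behind the pipeline, this boils down to a call to [`PreTrainedModel.generate`]; here is a minimal sketch of calling it directly with *GPT-2* to produce a full continuation in one go (the prompt and sampling settings are only illustrative), with the pipeline version following right after:
-
-```py
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
-
->>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
->>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-
->>> inputs = tokenizer("Hugging Face is based in DUMBO, New York City, and", return_tensors="pt")
-
->>> # generate up to 30 tokens in one call instead of sampling a single next token
->>> outputs = model.generate(**inputs, max_length=30, do_sample=True, top_k=50)
->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-Hugging Face is based in DUMBO, New York City, and ...
-```
-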
-As a default all models apply *Top-K* sampling when used in pipelines, as configured in their respective configurations -(see [gpt-2 config](https://huggingface.co/gpt2/blob/main/config.json) for example). - - - -```py ->>> from transformers import pipeline - ->>> text_generator = pipeline("text-generation") ->>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False)) -[{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a -"free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}] -``` - -Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am -concerned, I will"*. Behind the scenes, the pipeline object calls the method -[`PreTrainedModel.generate`] to generate text. The default arguments for this method can be -overridden in the pipeline, as is shown above for the arguments `max_length` and `do_sample`. - -Below is an example of text generation using `XLNet` and its tokenizer, which includes calling `generate()` directly: - -```py ->>> from transformers import AutoModelForCausalLM, AutoTokenizer - ->>> model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased") ->>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") - ->>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology ->>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family -... (except for Alexei and Maria) are discovered. -... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the -... remainder of the story. 1883 Western Siberia, -... a young Grigori Rasputin is asked by his father and a group of men to perform magic. -... Rasputin has a vision and denounces one of the men as a horse thief. Although his -... father initially slaps him for making such an accusation, Rasputin watches as the -... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of -... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, -... with people, even a bishop, begging for his blessing. """ - ->>> prompt = "Today the weather is really nice and I am planning on " ->>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"] - ->>> prompt_length = len(tokenizer.decode(inputs[0])) ->>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60) ->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :] - ->>> print(generated) -Today the weather is really nice and I am planning ... -``` - - -```py ->>> from transformers import TFAutoModelForCausalLM, AutoTokenizer - ->>> model = TFAutoModelForCausalLM.from_pretrained("xlnet-base-cased") ->>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") - ->>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology ->>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family -... (except for Alexei and Maria) are discovered. -... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the -... remainder of the story. 1883 Western Siberia, -... a young Grigori Rasputin is asked by his father and a group of men to perform magic. -... Rasputin has a vision and denounces one of the men as a horse thief. Although his -... 
father initially slaps him for making such an accusation, Rasputin watches as the -... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of -... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, -... with people, even a bishop, begging for his blessing. """ - ->>> prompt = "Today the weather is really nice and I am planning on " ->>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")["input_ids"] - ->>> prompt_length = len(tokenizer.decode(inputs[0])) ->>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60) ->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :] - ->>> print(generated) -Today the weather is really nice and I am planning ... -``` - - - -Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in -PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-XL* often -need to be padded to work well. GPT-2 is usually a good choice for *open-ended text generation* because it was trained -on millions of webpages with a causal language modeling objective. - -For more information on how to apply different decoding strategies for text generation, please also refer to our text -generation blog post [here](https://huggingface.co/blog/how-to-generate). - - -## Named Entity Recognition - -Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example, identifying a token -as a person, an organisation or a location. An example of a named entity recognition dataset is the CoNLL-2003 dataset, -which is entirely based on that task. If you would like to fine-tune a model on an NER task, you may leverage the -[run_ner.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification/run_ner.py) script. - -Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as -belonging to one of 9 classes: - -- O, Outside of a named entity -- B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity -- I-MIS, Miscellaneous entity -- B-PER, Beginning of a person's name right after another person's name -- I-PER, Person's name -- B-ORG, Beginning of an organisation right after another organisation -- I-ORG, Organisation -- B-LOC, Beginning of a location right after another location -- I-LOC, Location - -It leverages a fine-tuned model on CoNLL-2003, fine-tuned by [@stefan-it](https://github.com/stefan-it) from [dbmdz](https://github.com/dbmdz). - -```py ->>> from transformers import pipeline - ->>> ner_pipe = pipeline("ner") - ->>> sequence = """Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, -... therefore very close to the Manhattan Bridge which is visible from the window.""" -``` - -This outputs a list of all words that have been identified as one of the entities from the 9 classes defined above. -Here are the expected results: - -```py ->>> for entity in ner_pipe(sequence): -... 
print(entity) -{'entity': 'I-ORG', 'score': 0.9996, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} -{'entity': 'I-ORG', 'score': 0.9910, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} -{'entity': 'I-ORG', 'score': 0.9982, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} -{'entity': 'I-ORG', 'score': 0.9995, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16} -{'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'New', 'start': 40, 'end': 43} -{'entity': 'I-LOC', 'score': 0.9993, 'index': 12, 'word': 'York', 'start': 44, 'end': 48} -{'entity': 'I-LOC', 'score': 0.9994, 'index': 13, 'word': 'City', 'start': 49, 'end': 53} -{'entity': 'I-LOC', 'score': 0.9863, 'index': 19, 'word': 'D', 'start': 79, 'end': 80} -{'entity': 'I-LOC', 'score': 0.9514, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82} -{'entity': 'I-LOC', 'score': 0.9337, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84} -{'entity': 'I-LOC', 'score': 0.9762, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123} -{'entity': 'I-LOC', 'score': 0.9915, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130} -``` - -Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City", -"DUMBO" and "Manhattan Bridge" have been identified as locations. - -Here is an example of doing named entity recognition, using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it - with the weights stored in the checkpoint. -2. Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location. -3. Split words into tokens so that they can be mapped to predictions. We use a small hack by, first, completely - encoding and decoding the sequence, so that we're left with a string that contains the special tokens. -4. Encode that sequence into IDs (special tokens are added automatically). -5. Retrieve the predictions by passing the input to the model and getting the first output. This results in a - distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class for - each token. -6. Zip together each token with its prediction and print it. - - - -```py ->>> from transformers import AutoModelForTokenClassification, AutoTokenizer ->>> import torch - ->>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - ->>> sequence = ( -... "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " -... "therefore very close to the Manhattan Bridge." -... ) - ->>> inputs = tokenizer(sequence, return_tensors="pt") ->>> tokens = inputs.tokens() - ->>> outputs = model(**inputs).logits ->>> predictions = torch.argmax(outputs, dim=2) -``` - - -```py ->>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer ->>> import tensorflow as tf - ->>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - ->>> sequence = ( -... "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " -... "therefore very close to the Manhattan Bridge." -... 
) - ->>> inputs = tokenizer(sequence, return_tensors="tf") ->>> tokens = inputs.tokens() - ->>> outputs = model(**inputs)[0] ->>> predictions = tf.argmax(outputs, axis=2) -``` - - - -This outputs a list of each token mapped to its corresponding prediction. Differently from the pipeline, here every -token has a prediction as we didn't remove the "0"th class, which means that no particular entity was found on that -token. - -In the above example, `predictions` is an integer that corresponds to the predicted class. We can use the -`model.config.id2label` property in order to recover the class name corresponding to the class number, which is -illustrated below: - -```py ->>> for token, prediction in zip(tokens, predictions[0].numpy()): -... print((token, model.config.id2label[prediction])) -('[CLS]', 'O') -('Hu', 'I-ORG') -('##gging', 'I-ORG') -('Face', 'I-ORG') -('Inc', 'I-ORG') -('.', 'O') -('is', 'O') -('a', 'O') -('company', 'O') -('based', 'O') -('in', 'O') -('New', 'I-LOC') -('York', 'I-LOC') -('City', 'I-LOC') -('.', 'O') -('Its', 'O') -('headquarters', 'O') -('are', 'O') -('in', 'O') -('D', 'I-LOC') -('##UM', 'I-LOC') -('##BO', 'I-LOC') -(',', 'O') -('therefore', 'O') -('very', 'O') -('close', 'O') -('to', 'O') -('the', 'O') -('Manhattan', 'I-LOC') -('Bridge', 'I-LOC') -('.', 'O') -('[SEP]', 'O') -``` - -## Summarization - -Summarization is the task of summarizing a document or an article into a shorter text. If you would like to fine-tune a -model on a summarization task, you may leverage the [run_summarization.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/run_summarization.py) -script. - -An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was -created for the task of summarization. If you would like to fine-tune a model on a summarization task, various -approaches are described in this [document](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). - -Here is an example of using the pipelines to do summarization. It leverages a Bart model that was fine-tuned on the CNN -/ Daily Mail data set. - -```py ->>> from transformers import pipeline - ->>> summarizer = pipeline("summarization") - ->>> ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. -... A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. -... Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. -... In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. -... Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the -... 2010 marriage license application, according to court documents. -... Prosecutors said the marriages were part of an immigration scam. -... On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. -... After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective -... Annette Markowski, a police spokeswoman. 
In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. -... All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. -... Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. -... Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. -... The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s -... Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. -... Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. -... If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18. -... """ -``` - -Because the summarization pipeline depends on the `PreTrainedModel.generate()` method, we can override the default -arguments of `PreTrainedModel.generate()` directly in the pipeline for `max_length` and `min_length` as shown -below. This outputs the following summary: - -```py ->>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)) -[{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in -the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and -2002 . At one time, she was married to eight men at once, prosecutors say .'}] -``` - -Here is an example of doing summarization using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder - model, such as `Bart` or `T5`. -2. Define the article that should be summarized. -3. Add the T5 specific prefix "summarize: ". -4. Use the `PreTrainedModel.generate()` method to generate the summary. - -In this example we use Google's T5 model. Even though it was pre-trained only on a multi-task mixed dataset (including -CNN / Daily Mail), it yields very good results. - - - -```py ->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> # T5 uses a max_length of 512 so we cut the article to 512 tokens. ->>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True) ->>> outputs = model.generate( -... inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True -... ) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal -counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them -between 1999 and 2002. -``` - - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> # T5 uses a max_length of 512 so we cut the article to 512 tokens. 
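->>> # Unlike the PyTorch version above, this call doesn't pass truncation=True; add it so articles longer than 512 tokens are actually truncated.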
->>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="tf", max_length=512) ->>> outputs = model.generate( -... inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True -... ) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal -counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them -between 1999 and 2002. -``` - - - -## Translation - -Translation is the task of translating a text from one language to another. If you would like to fine-tune a model on a -translation task, you may leverage the [run_translation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation/run_translation.py) script. - -An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input -data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a -translation task, various approaches are described in this [document](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation/README.md). - -Here is an example of using the pipelines to do translation. It leverages a T5 model that was only pre-trained on a -multi-task mixture dataset (including WMT), yet, yielding impressive translation results. - -```py ->>> from transformers import pipeline - ->>> translator = pipeline("translation_en_to_de") ->>> print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40)) -[{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}] -``` - -Because the translation pipeline depends on the `PreTrainedModel.generate()` method, we can override the default -arguments of `PreTrainedModel.generate()` directly in the pipeline as is shown for `max_length` above. - -Here is an example of doing translation using a model and a tokenizer. The process is the following: - -1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder - model, such as `Bart` or `T5`. -2. Define the article that should be summarized. -3. Add the T5 specific prefix "translate English to German: " -4. Use the `PreTrainedModel.generate()` method to perform the translation. - - - -```py ->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> inputs = tokenizer( -... "translate English to German: Hugging Face is a technology company based in New York and Paris", -... return_tensors="pt", -... ) ->>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris. -``` - - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base") ->>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - ->>> inputs = tokenizer( -... "translate English to German: Hugging Face is a technology company based in New York and Paris", -... return_tensors="tf", -... 
) ->>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True) - ->>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris. -``` - - - -We get the same translation as with the pipeline example. - -## Audio classification - -Audio classification assigns a class to an audio signal. The Keyword Spotting dataset from the [SUPERB](https://huggingface.co/datasets/superb) benchmark is an example dataset that can be used for audio classification fine-tuning. This dataset contains ten classes of keywords for classification. If you'd like to fine-tune a model for audio classification, take a look at the [run_audio_classification.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/audio-classification/run_audio_classification.py) script or this [how-to guide](./tasks/audio_classification). - -The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for audio classification inference: - -```py ->>> from transformers import pipeline ->>> from datasets import load_dataset ->>> import torch - ->>> torch.manual_seed(42) # doctest: +IGNORE_RESULT - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> audio_file = dataset[0]["audio"]["path"] - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) ->>> predictions = audio_classifier(audio_file) ->>> predictions = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in predictions] ->>> predictions -[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] -``` - -The general process for using a model and feature extractor for audio classification is: - -1. Instantiate a feature extractor and a model from the checkpoint name. -2. Process the audio signal to be classified with a feature extractor. -3. Pass the input through the model and take the `argmax` to retrieve the most likely class. -4. Convert the class id to a class name with `id2label` to return an interpretable result. - - - -```py ->>> from transformers import AutoFeatureExtractor, AutoModelForAudioClassification ->>> from datasets import load_dataset ->>> import torch - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> sampling_rate = dataset.features["audio"].sampling_rate - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks") ->>> model = AutoModelForAudioClassification.from_pretrained("superb/wav2vec2-base-superb-ks") - ->>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") - ->>> with torch.no_grad(): -... logits = model(**inputs).logits - ->>> predicted_class_ids = torch.argmax(logits, dim=-1).item() ->>> predicted_label = model.config.id2label[predicted_class_ids] ->>> predicted_label -'_unknown_' -``` - - - -## Automatic speech recognition - -Automatic speech recognition transcribes an audio signal to text. The [Common Voice](https://huggingface.co/datasets/common_voice) dataset is an example dataset that can be used for automatic speech recognition fine-tuning. 
It contains an audio file of a speaker and the corresponding sentence. If you'd like to fine-tune a model for automatic speech recognition, take a look at the [run_speech_recognition_ctc.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py) or [run_speech_recognition_seq2seq.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py) scripts or this [how-to guide](./tasks/asr). - -The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for automatic speech recognition inference: - -```py ->>> from transformers import pipeline ->>> from datasets import load_dataset - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> audio_file = dataset[0]["audio"]["path"] - ->>> speech_recognizer = pipeline(task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h") ->>> speech_recognizer(audio_file) -{'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'} -``` - -The general process for using a model and processor for automatic speech recognition is: - -1. Instantiate a processor (which regroups a feature extractor for input processing and a tokenizer for decoding) and a model from the checkpoint name. -2. Process the audio signal and text with a processor. -3. Pass the input through the model and take the `argmax` to retrieve the predicted text. -4. Decode the text with a tokenizer to obtain the transcription. - - - -```py ->>> from transformers import AutoProcessor, AutoModelForCTC ->>> from datasets import load_dataset ->>> import torch - ->>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> dataset = dataset.sort("id") ->>> sampling_rate = dataset.features["audio"].sampling_rate - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") ->>> model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") - ->>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") ->>> with torch.no_grad(): -... logits = model(**inputs).logits ->>> predicted_ids = torch.argmax(logits, dim=-1) - ->>> transcription = processor.batch_decode(predicted_ids) ->>> transcription[0] -'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL' -``` - - - -## Image classification - -Like text and audio classification, image classification assigns a class to an image. The [CIFAR-100](https://huggingface.co/datasets/cifar100) dataset is an example dataset that can be used for image classification fine-tuning. It contains an image and the corresponding class. If you'd like to fine-tune a model for image classification, take a look at the [run_image_classification.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-classification/run_image_classification.py) script or this [how-to guide](./tasks/image_classification). - -The following examples demonstrate how to use a [`pipeline`] and a model and tokenizer for image classification inference: - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> result = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... 
) ->>> print("\n".join([f"Class {d['label']} with score {round(d['score'], 4)}" for d in result])) -Class lynx, catamount with score 0.4335 -Class cougar, puma, catamount, mountain lion, painter, panther, Felis concolor with score 0.0348 -Class snow leopard, ounce, Panthera uncia with score 0.0324 -Class Egyptian cat with score 0.0239 -Class tiger cat with score 0.0229 -``` - -The general process for using a model and image processor for image classification is: - -1. Instantiate an image processor and a model from the checkpoint name. -2. Process the image to be classified with an image processor. -3. Pass the input through the model and take the `argmax` to retrieve the predicted class. -4. Convert the class id to a class name with `id2label` to return an interpretable result. - - - -```py ->>> from transformers import AutoImageProcessor, AutoModelForImageClassification ->>> import torch ->>> from datasets import load_dataset - ->>> dataset = load_dataset("huggingface/cats-image") ->>> image = dataset["test"]["image"][0] - ->>> feature_extractor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") ->>> model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") - ->>> inputs = feature_extractor(image, return_tensors="pt") - ->>> with torch.no_grad(): -... logits = model(**inputs).logits - ->>> predicted_label = logits.argmax(-1).item() ->>> print(model.config.id2label[predicted_label]) -Egyptian cat -``` - - diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md new file mode 100644 index 000000000000..d01269ba60a6 --- /dev/null +++ b/docs/source/en/tasks/asr.md @@ -0,0 +1,376 @@ + + +# Automatic speech recognition + +[[open-in-colab]] + + + +Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings. + +This guide will show you how to: + +1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate jiwer +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load MInDS-14 dataset + +Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. 
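+
+The `split="train[:100]"` argument in the code below uses 🤗 Datasets' split-slicing syntax to keep just the first 100 examples. Once everything runs end to end, you could, for example, drop the slice to load the full training split (shown here only as a sketch):
+
+```py
+>>> from datasets import load_dataset
+
+>>> # hypothetical follow-up: load the complete training split instead of the first 100 examples
+>>> minds_full = load_dataset("PolyAI/minds14", name="en-US", split="train")
+```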
+ +```py +>>> from datasets import load_dataset, Audio + +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") +``` + +Split the dataset's `train` split into a train and test set with the [`~Dataset.train_test_split`] method: + +```py +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Then take a look at the dataset: + +```py +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 16 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 4 + }) +}) +``` + +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: + +```py +>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) +``` + +Take a look at the example again: + +```py +>>> minds["train"][0] +{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, + 0.00024414, 0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 8000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + +There are two fields: + +- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. +- `transcription`: the target text. + +## Preprocess + +The next step is to load a Wav2Vec2 processor to process the audio signal: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") +``` + +The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: + +```py +>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) +>>> minds["train"][0] +{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., + 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 16000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + +As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary: + +```py +>>> def uppercase(example): +... 
return {"transcription": example["transcription"].upper()} + + +>>> minds = minds.map(uppercase) +``` + +Now create a preprocessing function that: + +1. Calls the `audio` column to load and resample the audio file. +2. Extracts the `input_values` from the audio file and tokenize the `transcription` column with the processor. + +```py +>>> def prepare_dataset(batch): +... audio = batch["audio"] +... batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"]) +... batch["input_length"] = len(batch["input_values"][0]) +... return batch +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by increasing the number of processes with the `num_proc` parameter. Remove the columns you don't need with the [`~datasets.Dataset.remove_columns`] method: + +```py +>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) +``` + +🤗 Transformers doesn't have a data collator for ASR, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It'll also dynamically pad your text and labels to the length of the longest element in its batch (instead of the entire dataset) so they are a uniform length. While it is possible to pad your text in the `tokenizer` function by setting `padding=True`, dynamic padding is more efficient. + +Unlike other data collators, this specific data collator needs to apply a different padding method to `input_values` and `labels`: + +```py +>>> import torch + +>>> from dataclasses import dataclass, field +>>> from typing import Any, Dict, List, Optional, Union + + +>>> @dataclass +... class DataCollatorCTCWithPadding: +... processor: AutoProcessor +... padding: Union[bool, str] = "longest" + +... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +... # split inputs and labels since they have to be of different lengths and need +... # different padding methods +... input_features = [{"input_values": feature["input_values"][0]} for feature in features] +... label_features = [{"input_ids": feature["labels"]} for feature in features] + +... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") + +... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") + +... # replace padding with -100 to ignore loss correctly +... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + +... batch["labels"] = labels + +... return batch +``` + +Now instantiate your `DataCollatorForCTCWithPadding`: + +```py +>>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") +``` + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> wer = evaluate.load("wer") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the WER: + +```py +>>> import numpy as np + + +>>> def compute_metrics(pred): +... 
pred_logits = pred.predictions +... pred_ids = np.argmax(pred_logits, axis=-1) + +... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id + +... pred_str = processor.batch_decode(pred_ids) +... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) + +... wer = wer.compute(predictions=pred_str, references=label_str) + +... return {"wer": wer} +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation: + +```py +>>> from transformers import AutoModelForCTC, TrainingArguments, Trainer + +>>> model = AutoModelForCTC.from_pretrained( +... "facebook/wav2vec2-base", +... ctc_loss_reduction="mean", +... pad_token_id=processor.tokenizer.pad_token_id, +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_asr_mind_model", +... per_device_train_batch_size=8, +... gradient_accumulation_steps=2, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=2000, +... gradient_checkpointing=True, +... fp16=True, +... group_by_length=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=8, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... load_best_model_at_end=True, +... metric_for_best_model="wer", +... greater_is_better=False, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], +... tokenizer=processor, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + +For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR. + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! 
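+
+If you're unsure which rate the model expects, you can read it off the feature extractor; a minimal sketch using the processor checkpoint from this guide:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model")
+>>> processor.feature_extractor.sampling_rate
+16000
+```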
+ +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +>>> sampling_rate = dataset.features["audio"].sampling_rate +>>> audio_file = dataset[0]["audio"]["path"] +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it: + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") +>>> transcriber(audio_file) +{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} +``` + + + +The transcription is decent, but it could be better! Try finetuning your model on more examples to get even better results! + + + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Load a processor to preprocess the audio file and transcription and return the `input` as PyTorch tensors: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import AutoModelForCTC + +>>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the predicted `input_ids` with the highest probability, and use the processor to decode the predicted `input_ids` back into text: + +```py +>>> import torch + +>>> predicted_ids = torch.argmax(logits, dim=-1) +>>> transcription = processor.batch_decode(predicted_ids) +>>> transcription +['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] +``` + + \ No newline at end of file diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx deleted file mode 100644 index fcd5bc508c87..000000000000 --- a/docs/source/en/tasks/asr.mdx +++ /dev/null @@ -1,366 +0,0 @@ - - -# Automatic speech recognition - - - -Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings. - -This guide will show you how to: - -1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. -2. Use your finetuned model for inference. - - - -See the automatic speech recognition [task page](https://huggingface.co/tasks/automatic-speech-recognition) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate jiwer -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load MInDS-14 dataset - -Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. 
This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset, Audio - ->>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") -``` - -Split the dataset's `train` split into a train and test set with the [`~Dataset.train_test_split`] method: - -```py ->>> minds = minds.train_test_split(test_size=0.2) -``` - -Then take a look at the dataset: - -```py ->>> minds -DatasetDict({ - train: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 16 - }) - test: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 4 - }) -}) -``` - -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: - -```py ->>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) -``` - -Take a look at the example again: - -```py ->>> minds["train"][0] -{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, - 0.00024414, 0.00024414], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 8000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -There are two fields: - -- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. -- `transcription`: the target text. - -## Preprocess - -The next step is to load a Wav2Vec2 processor to process the audio signal: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") -``` - -The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: - -```py ->>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) ->>> minds["train"][0] -{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., - 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 16000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. 
The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary: - -```py ->>> def uppercase(example): -... return {"transcription": example["transcription"].upper()} - - ->>> minds = minds.map(uppercase) -``` - -Now create a preprocessing function that: - -1. Calls the `audio` column to load and resample the audio file. -2. Extracts the `input_values` from the audio file and tokenize the `transcription` column with the processor. - -```py ->>> def prepare_dataset(batch): -... audio = batch["audio"] -... batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"]) -... batch["input_length"] = len(batch["input_values"][0]) -... return batch -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by increasing the number of processes with the `num_proc` parameter. Remove the columns you don't need with the [`~datasets.Dataset.remove_columns`] method: - -```py ->>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) -``` - -🤗 Transformers doesn't have a data collator for ASR, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It'll also dynamically pad your text and labels to the length of the longest element in its batch (instead of the entire dataset) so they are a uniform length. While it is possible to pad your text in the `tokenizer` function by setting `padding=True`, dynamic padding is more efficient. - -Unlike other data collators, this specific data collator needs to apply a different padding method to `input_values` and `labels`: - -```py ->>> import torch - ->>> from dataclasses import dataclass, field ->>> from typing import Any, Dict, List, Optional, Union - - ->>> @dataclass -... class DataCollatorCTCWithPadding: - -... processor: AutoProcessor -... padding: Union[bool, str] = "longest" - -... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: -... # split inputs and labels since they have to be of different lengths and need -... # different padding methods -... input_features = [{"input_values": feature["input_values"][0]} for feature in features] -... label_features = [{"input_ids": feature["labels"]} for feature in features] - -... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") - -... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") - -... # replace padding with -100 to ignore loss correctly -... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) - -... batch["labels"] = labels - -... return batch -``` - -Now instantiate your `DataCollatorForCTCWithPadding`: - -```py ->>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. 
For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> wer = evaluate.load("wer") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the WER: - -```py ->>> import numpy as np - - ->>> def compute_metrics(pred): -... pred_logits = pred.predictions -... pred_ids = np.argmax(pred_logits, axis=-1) - -... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id - -... pred_str = processor.batch_decode(pred_ids) -... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) - -... wer = wer.compute(predictions=pred_str, references=label_str) - -... return {"wer": wer} -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - - -You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation: - -```py ->>> from transformers import AutoModelForCTC, TrainingArguments, Trainer - ->>> model = AutoModelForCTC.from_pretrained( -... "facebook/wav2vec2-base", -... ctc_loss_reduction="mean", -... pad_token_id=processor.tokenizer.pad_token_id, -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_asr_mind_model", -... per_device_train_batch_size=8, -... gradient_accumulation_steps=2, -... learning_rate=1e-5, -... warmup_steps=500, -... max_steps=2000, -... gradient_checkpointing=True, -... fp16=True, -... group_by_length=True, -... evaluation_strategy="steps", -... per_device_eval_batch_size=8, -... save_steps=1000, -... eval_steps=1000, -... logging_steps=25, -... load_best_model_at_end=True, -... metric_for_best_model="wer", -... greater_is_better=False, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=encoded_minds["train"], -... eval_dataset=encoded_minds["test"], -... tokenizer=processor.feature_extractor, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - - -For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR. - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ->>> sampling_rate = dataset.features["audio"].sampling_rate ->>> audio_file = dataset[0]["audio"]["path"] -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it: - -```py ->>> from transformers import pipeline - ->>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") ->>> transcriber(audio_file) -{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} -``` - - - -The transcription is decent, but it could be better! Try finetuning your model on more examples to get even better results! - - - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Load a processor to preprocess the audio file and transcription and return the `input` as PyTorch tensors: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") -``` - -Pass your inputs to the model and return the logits: - -```py ->>> from transformers import AutoModelForCTC - ->>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the predicted `input_ids` with the highest probability, and use the processor to decode the predicted `input_ids` back into text: - -```py ->>> import torch - ->>> predicted_ids = torch.argmax(logits, dim=-1) ->>> transcription = processor.batch_decode(predicted_ids) ->>> transcription -['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md new file mode 100644 index 000000000000..743a797fc53f --- /dev/null +++ b/docs/source/en/tasks/audio_classification.md @@ -0,0 +1,329 @@ + + +# Audio classification + +[[open-in-colab]] + + + +Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. + +This guide will show you how to: + +1. 
Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load MInDS-14 dataset + +Start by loading the MInDS-14 dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset, Audio + +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset. + +```py +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Then take a look at the dataset: + +```py +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 450 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 113 + }) +}) +``` + +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: + +```py +>>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) +``` + +Take a look at an example now: + +```py +>>> minds["train"][0] +{'audio': {'array': array([ 0. , 0. , 0. , ..., -0.00048828, + -0.00024414, -0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', + 'sampling_rate': 8000}, + 'intent_class': 2} +``` + +There are two fields: + +- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. +- `intent_class`: represents the class id of the speaker's intent. + +To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: + +```py +>>> labels = minds["train"].features["intent_class"].names +>>> label2id, id2label = dict(), dict() +>>> for i, label in enumerate(labels): +... label2id[label] = str(i) +... 
id2label[str(i)] = label
+```
+
+Now you can convert the label id to a label name:
+
+```py
+>>> id2label[str(2)]
+'app_error'
+```
+
+## Preprocess
+
+The next step is to load a Wav2Vec2 feature extractor to process the audio signal:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+The MInDS-14 dataset has a sampling rate of 8kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16kHz to use the pretrained Wav2Vec2 model:
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([ 2.2098757e-05, 4.6582241e-05, -2.2803260e-05, ...,
+ -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav',
+ 'sampling_rate': 16000},
+ 'intent_class': 2}
+```
+
+Now create a preprocessing function that:
+
+1. Calls the `audio` column to load and, if necessary, resample the audio file.
+2. Checks if the sampling rate of the audio file matches the sampling rate of the audio data a model was pretrained with. You can find this information in the Wav2Vec2 [model card](https://huggingface.co/facebook/wav2vec2-base).
+3. Sets a maximum input length to batch longer inputs without truncating them.
+
+```py
+>>> def preprocess_function(examples):
+...     audio_arrays = [x["array"] for x in examples["audio"]]
+...     inputs = feature_extractor(
+...         audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
+...     )
+...     return inputs
+```
+
+To apply the preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects:
+
+```py
+>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
+>>> encoded_minds = encoded_minds.rename_column("intent_class", "label")
+```
+
+## Evaluate
+
+Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
+
+```py
+>>> import evaluate
+
+>>> accuracy = evaluate.load("accuracy")
+```
+
+Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy:
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions = np.argmax(eval_pred.predictions, axis=1)
+...     return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
+```
+
+Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.
+
+## Train
+
+
+
+
+
+If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)!
+ + + +You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForAudioClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer + +>>> num_labels = len(id2label) +>>> model = AutoModelForAudioClassification.from_pretrained( +... "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_mind_model", +... evaluation_strategy="epoch", +... save_strategy="epoch", +... learning_rate=3e-5, +... per_device_train_batch_size=32, +... gradient_accumulation_steps=4, +... per_device_eval_batch_size=32, +... num_train_epochs=10, +... warmup_ratio=0.1, +... logging_steps=10, +... load_best_model_at_end=True, +... metric_for_best_model="accuracy", +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], +... tokenizer=feature_extractor, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + +For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +>>> sampling_rate = dataset.features["audio"].sampling_rate +>>> audio_file = dataset[0]["audio"]["path"] +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model") +>>> classifier(audio_file) +[ + {'score': 0.09766869246959686, 'label': 'cash_deposit'}, + {'score': 0.07998877018690109, 'label': 'app_error'}, + {'score': 0.0781070664525032, 'label': 'joint_account'}, + {'score': 0.07667109370231628, 'label': 'pay_bill'}, + {'score': 0.0755252093076706, 'label': 'balance'} +] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Load a feature extractor to preprocess the audio file and return the `input` as PyTorch tensors: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model") +>>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import AutoModelForAudioClassification + +>>> model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a label: + +```py +>>> import torch + +>>> predicted_class_ids = torch.argmax(logits).item() +>>> predicted_label = model.config.id2label[predicted_class_ids] +>>> predicted_label +'cash_deposit' +``` + + \ No newline at end of file diff --git a/docs/source/en/tasks/audio_classification.mdx b/docs/source/en/tasks/audio_classification.mdx deleted file mode 100644 index ab0abbced785..000000000000 --- a/docs/source/en/tasks/audio_classification.mdx +++ /dev/null @@ -1,317 +0,0 @@ - - -# Audio classification - - - -Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. - -This guide will show you how to: - -1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent. -2. Use your finetuned model for inference. - - - -See the audio classification [task page](https://huggingface.co/tasks/audio-classification) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load MInDS-14 dataset - -Start by loading the MInDS-14 dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset, Audio - ->>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset. 
- -```py ->>> minds = minds.train_test_split(test_size=0.2) -``` - -Then take a look at the dataset: - -```py ->>> minds -DatasetDict({ - train: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 450 - }) - test: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 113 - }) -}) -``` - -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: - -```py ->>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) -``` - -Take a look at an example now: - -```py ->>> minds["train"][0] -{'audio': {'array': array([ 0. , 0. , 0. , ..., -0.00048828, - -0.00024414, -0.00024414], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', - 'sampling_rate': 8000}, - 'intent_class': 2} -``` - -There are two fields: - -- `audio`: a 1-dimensional `array` of the speech signal that must be called to load and resample the audio file. -- `intent_class`: represents the class id of the speaker's intent. - -To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: - -```py ->>> labels = minds["train"].features["intent_class"].names ->>> label2id, id2label = dict(), dict() ->>> for i, label in enumerate(labels): -... label2id[label] = str(i) -... id2label[str(i)] = label -``` - -Now you can convert the label id to a label name: - -```py ->>> id2label[str(2)] -'app_error' -``` - -## Preprocess - -The next step is to load a Wav2Vec2 feature extractor to process the audio signal: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it's [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: - -```py ->>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) ->>> minds["train"][0] -{'audio': {'array': array([ 2.2098757e-05, 4.6582241e-05, -2.2803260e-05, ..., - -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav', - 'sampling_rate': 16000}, - 'intent_class': 2} -``` - -Now create a preprocessing function that: - -1. Calls the `audio` column to load, and if necessary, resample the audio file. -2. Checks if the sampling rate of the audio file matches the sampling rate of the audio data a model was pretrained with. You can find this information in the Wav2Vec2 [model card](https://huggingface.co/facebook/wav2vec2-base). -3. Set a maximum input length to batch longer inputs without truncating them. - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True -... ) -... 
return inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects: - -```py ->>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True) ->>> encoded_minds = encoded_minds.rename_column("intent_class", "label") -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions = np.argmax(eval_pred.predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=eval_pred.label_ids) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForAudioClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer - ->>> num_labels = len(id2label) ->>> model = AutoModelForAudioClassification.from_pretrained( -... "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_mind_model", -... evaluation_strategy="epoch", -... save_strategy="epoch", -... learning_rate=3e-5, -... per_device_train_batch_size=32, -... gradient_accumulation_steps=4, -... per_device_eval_batch_size=32, -... num_train_epochs=10, -... warmup_ratio=0.1, -... logging_steps=10, -... load_best_model_at_end=True, -... metric_for_best_model="accuracy", -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=encoded_minds["train"], -... eval_dataset=encoded_minds["test"], -... tokenizer=feature_extractor, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - - -For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ->>> sampling_rate = dataset.features["audio"].sampling_rate ->>> audio_file = dataset[0]["audio"]["path"] -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model") ->>> classifier(audio_file) -[ - {'score': 0.09766869246959686, 'label': 'cash_deposit'}, - {'score': 0.07998877018690109, 'label': 'app_error'}, - {'score': 0.0781070664525032, 'label': 'joint_account'}, - {'score': 0.07667109370231628, 'label': 'pay_bill'}, - {'score': 0.0755252093076706, 'label': 'balance'} -] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Load a feature extractor to preprocess the audio file and return the `input` as PyTorch tensors: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model") ->>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") -``` - -Pass your inputs to the model and return the logits: - -```py ->>> from transformers import AutoModelForAudioClassification - ->>> model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a label: - -```py ->>> import torch - ->>> predicted_class_ids = torch.argmax(logits).item() ->>> predicted_label = model.config.id2label[predicted_class_ids] ->>> predicted_label -'cash_deposit' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/document_question_answering.md b/docs/source/en/tasks/document_question_answering.md new file mode 100644 index 000000000000..24bf3a069ac9 --- /dev/null +++ b/docs/source/en/tasks/document_question_answering.md @@ -0,0 +1,498 @@ + + +# Document Question Answering + +[[open-in-colab]] + +Document Question Answering, also referred to as Document Visual Question Answering, is a task that involves providing +answers to questions posed about document images. The input to models supporting this task is typically a combination of an image and +a question, and the output is an answer expressed in natural language. 
These models utilize multiple modalities, including +text, the positions of words (bounding boxes), and the image itself. + +This guide illustrates how to: + +- Fine-tune [LayoutLMv2](../model_doc/layoutlmv2) on the [DocVQA dataset](https://huggingface.co/datasets/nielsr/docvqa_1200_examples_donut). +- Use your fine-tuned model for inference. + + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3) + + + + + +LayoutLMv2 solves the document question-answering task by adding a question-answering head on top of the final hidden +states of the tokens, to predict the positions of the start and end tokens of the +answer. In other words, the problem is treated as extractive question answering: given the context, extract which piece +of information answers the question. The context comes from the output of an OCR engine, here it is Google's Tesseract. + +Before you begin, make sure you have all the necessary libraries installed. LayoutLMv2 depends on detectron2, torchvision and tesseract. + +```bash +pip install -q transformers datasets +``` + +```bash +pip install 'git+https://github.com/facebookresearch/detectron2.git' +pip install torchvision +``` + +```bash +sudo apt install tesseract-ocr +pip install -q pytesseract +``` + +Once you have installed all of the dependencies, restart your runtime. + +We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the 🤗 Hub. +When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +Let's define some global variables. + +```py +>>> model_checkpoint = "microsoft/layoutlmv2-base-uncased" +>>> batch_size = 4 +``` + +## Load the data + +In this guide we use a small sample of preprocessed DocVQA that you can find on 🤗 Hub. If you'd like to use the full +DocVQA dataset, you can register and download it on [DocVQA homepage](https://rrc.cvc.uab.es/?ch=17). If you do so, to +proceed with this guide check out [how to load files into a 🤗 dataset](https://huggingface.co/docs/datasets/loading#local-and-remote-files). + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("nielsr/docvqa_1200_examples") +>>> dataset +DatasetDict({ + train: Dataset({ + features: ['id', 'image', 'query', 'answers', 'words', 'bounding_boxes', 'answer'], + num_rows: 1000 + }) + test: Dataset({ + features: ['id', 'image', 'query', 'answers', 'words', 'bounding_boxes', 'answer'], + num_rows: 200 + }) +}) +``` + +As you can see, the dataset is split into train and test sets already. Take a look at a random example to familiarize +yourself with the features. + +```py +>>> dataset["train"].features +``` + +Here's what the individual fields represent: +* `id`: the example's id +* `image`: a PIL.Image.Image object containing the document image +* `query`: the question string - natural language asked question, in several languages +* `answers`: a list of correct answers provided by human annotators +* `words` and `bounding_boxes`: the results of OCR, which we will not use here +* `answer`: an answer matched by a different model which we will not use here + +Let's leave only English questions, and drop the `answer` feature which appears to contain predictions by another model. +We'll also take the first of the answers from the set provided by the annotators. 
Alternatively, you can randomly sample it. + +```py +>>> updated_dataset = dataset.map(lambda example: {"question": example["query"]["en"]}, remove_columns=["query"]) +>>> updated_dataset = updated_dataset.map( +... lambda example: {"answer": example["answers"][0]}, remove_columns=["answer", "answers"] +... ) +``` + +Note that the LayoutLMv2 checkpoint that we use in this guide has been trained with `max_position_embeddings = 512` (you can +find this information in the [checkpoint's `config.json` file](https://huggingface.co/microsoft/layoutlmv2-base-uncased/blob/main/config.json#L18)). +We can truncate the examples but to avoid the situation where the answer might be at the end of a large document and end up truncated, +here we'll remove the few examples where the embedding is likely to end up longer than 512. +If most of the documents in your dataset are long, you can implement a sliding window strategy - check out [this notebook](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb) for details. + +```py +>>> updated_dataset = updated_dataset.filter(lambda x: len(x["words"]) + len(x["question"].split()) < 512) +``` + +At this point let's also remove the OCR features from this dataset. These are a result of OCR for fine-tuning a different +model. They would still require some processing if we wanted to use them, as they do not match the input requirements +of the model we use in this guide. Instead, we can use the [`LayoutLMv2Processor`] on the original data for both OCR and +tokenization. This way we'll get the inputs that match model's expected input. If you want to process images manually, +check out the [`LayoutLMv2` model documentation](../model_doc/layoutlmv2) to learn what input format the model expects. + +```py +>>> updated_dataset = updated_dataset.remove_columns("words") +>>> updated_dataset = updated_dataset.remove_columns("bounding_boxes") +``` + +Finally, the data exploration won't be complete if we don't peek at an image example. + +```py +>>> updated_dataset["train"][11]["image"] +``` + +
+    [Image: DocVQA Image Example]
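+
+Before preprocessing, a quick optional check can confirm how many examples survived the length filter above; the exact counts depend on the dataset sample, so no output is shown here:
+
+```py
+>>> # Number of remaining examples per split (counts will vary with the sample used)
+>>> {split: updated_dataset[split].num_rows for split in updated_dataset}
+```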
+ +## Preprocess the data + +The Document Question Answering task is a multimodal task, and you need to make sure that the inputs from each modality +are preprocessed according to the model's expectations. Let's start by loading the [`LayoutLMv2Processor`], which internally combines an image processor that can handle image data and a tokenizer that can encode text data. + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained(model_checkpoint) +``` + +### Preprocessing document images + +First, let's prepare the document images for the model with the help of the `image_processor` from the processor. +By default, image processor resizes the images to 224x224, makes sure they have the correct order of color channels, +applies OCR with tesseract to get words and normalized bounding boxes. In this tutorial, all of these defaults are exactly what we need. +Write a function that applies the default image processing to a batch of images and returns the results of OCR. + +```py +>>> image_processor = processor.image_processor + + +>>> def get_ocr_words_and_boxes(examples): +... images = [image.convert("RGB") for image in examples["image"]] +... encoded_inputs = image_processor(images) + +... examples["image"] = encoded_inputs.pixel_values +... examples["words"] = encoded_inputs.words +... examples["boxes"] = encoded_inputs.boxes + +... return examples +``` + +To apply this preprocessing to the entire dataset in a fast way, use [`~datasets.Dataset.map`]. + +```py +>>> dataset_with_ocr = updated_dataset.map(get_ocr_words_and_boxes, batched=True, batch_size=2) +``` + +### Preprocessing text data + +Once we have applied OCR to the images, we need to encode the text part of the dataset to prepare it for the model. +This involves converting the words and boxes that we got in the previous step to token-level `input_ids`, `attention_mask`, +`token_type_ids` and `bbox`. For preprocessing text, we'll need the `tokenizer` from the processor. + +```py +>>> tokenizer = processor.tokenizer +``` + +On top of the preprocessing mentioned above, we also need to add the labels for the model. For `xxxForQuestionAnswering` models +in 🤗 Transformers, the labels consist of the `start_positions` and `end_positions`, indicating which token is at the +start and which token is at the end of the answer. + +Let's start with that. Define a helper function that can find a sublist (the answer split into words) in a larger list (the words list). + +This function will take two lists as input, `words_list` and `answer_list`. It will then iterate over the `words_list` and check +if the current word in the `words_list` (words_list[i]) is equal to the first word of answer_list (answer_list[0]) and if +the sublist of `words_list` starting from the current word and of the same length as `answer_list` is equal `to answer_list`. +If this condition is true, it means that a match has been found, and the function will record the match, its starting index (idx), +and its ending index (idx + len(answer_list) - 1). If more than one match was found, the function will return only the first one. +If no match is found, the function returns (`None`, 0, and 0). + +```py +>>> def subfinder(words_list, answer_list): +... matches = [] +... start_indices = [] +... end_indices = [] +... for idx, i in enumerate(range(len(words_list))): +... if words_list[i] == answer_list[0] and words_list[i : i + len(answer_list)] == answer_list: +... matches.append(answer_list) +... start_indices.append(idx) +... 
end_indices.append(idx + len(answer_list) - 1) +... if matches: +... return matches[0], start_indices[0], end_indices[0] +... else: +... return None, 0, 0 +``` + +To illustrate how this function finds the position of the answer, let's use it on an example: + +```py +>>> example = dataset_with_ocr["train"][1] +>>> words = [word.lower() for word in example["words"]] +>>> match, word_idx_start, word_idx_end = subfinder(words, example["answer"].lower().split()) +>>> print("Question: ", example["question"]) +>>> print("Words:", words) +>>> print("Answer: ", example["answer"]) +>>> print("start_index", word_idx_start) +>>> print("end_index", word_idx_end) +Question: Who is in cc in this letter? +Words: ['wie', 'baw', 'brown', '&', 'williamson', 'tobacco', 'corporation', 'research', '&', 'development', 'internal', 'correspondence', 'to:', 'r.', 'h.', 'honeycutt', 'ce:', 't.f.', 'riehl', 'from:', '.', 'c.j.', 'cook', 'date:', 'may', '8,', '1995', 'subject:', 'review', 'of', 'existing', 'brainstorming', 'ideas/483', 'the', 'major', 'function', 'of', 'the', 'product', 'innovation', 'graup', 'is', 'to', 'develop', 'marketable', 'nove!', 'products', 'that', 'would', 'be', 'profitable', 'to', 'manufacture', 'and', 'sell.', 'novel', 'is', 'defined', 'as:', 'of', 'a', 'new', 'kind,', 'or', 'different', 'from', 'anything', 'seen', 'or', 'known', 'before.', 'innovation', 'is', 'defined', 'as:', 'something', 'new', 'or', 'different', 'introduced;', 'act', 'of', 'innovating;', 'introduction', 'of', 'new', 'things', 'or', 'methods.', 'the', 'products', 'may', 'incorporate', 'the', 'latest', 'technologies,', 'materials', 'and', 'know-how', 'available', 'to', 'give', 'then', 'a', 'unique', 'taste', 'or', 'look.', 'the', 'first', 'task', 'of', 'the', 'product', 'innovation', 'group', 'was', 'to', 'assemble,', 'review', 'and', 'categorize', 'a', 'list', 'of', 'existing', 'brainstorming', 'ideas.', 'ideas', 'were', 'grouped', 'into', 'two', 'major', 'categories', 'labeled', 'appearance', 'and', 'taste/aroma.', 'these', 'categories', 'are', 'used', 'for', 'novel', 'products', 'that', 'may', 'differ', 'from', 'a', 'visual', 'and/or', 'taste/aroma', 'point', 'of', 'view', 'compared', 'to', 'canventional', 'cigarettes.', 'other', 'categories', 'include', 'a', 'combination', 'of', 'the', 'above,', 'filters,', 'packaging', 'and', 'brand', 'extensions.', 'appearance', 'this', 'category', 'is', 'used', 'for', 'novel', 'cigarette', 'constructions', 'that', 'yield', 'visually', 'different', 'products', 'with', 'minimal', 'changes', 'in', 'smoke', 'chemistry', 'two', 'cigarettes', 'in', 'cne.', 'emulti-plug', 'te', 'build', 'yaur', 'awn', 'cigarette.', 'eswitchable', 'menthol', 'or', 'non', 'menthol', 'cigarette.', '*cigarettes', 'with', 'interspaced', 'perforations', 'to', 'enable', 'smoker', 'to', 'separate', 'unburned', 'section', 'for', 'future', 'smoking.', '«short', 'cigarette,', 'tobacco', 'section', '30', 'mm.', '«extremely', 'fast', 'buming', 'cigarette.', '«novel', 'cigarette', 'constructions', 'that', 'permit', 'a', 'significant', 'reduction', 'iretobacco', 'weight', 'while', 'maintaining', 'smoking', 'mechanics', 'and', 'visual', 'characteristics.', 'higher', 'basis', 'weight', 'paper:', 'potential', 'reduction', 'in', 'tobacco', 'weight.', '«more', 'rigid', 'tobacco', 'column;', 'stiffing', 'agent', 'for', 'tobacco;', 'e.g.', 'starch', '*colored', 'tow', 'and', 'cigarette', 'papers;', 'seasonal', 'promotions,', 'e.g.', 'pastel', 'colored', 'cigarettes', 'for', 'easter', 'or', 'in', 'an', 'ebony', 'and', 'ivory', 
'brand', 'containing', 'a', 'mixture', 'of', 'all', 'black', '(black', 'paper', 'and', 'tow)', 'and', 'ail', 'white', 'cigarettes.', '499150498'] +Answer: T.F. Riehl +start_index 17 +end_index 18 +``` + +Once examples are encoded, however, they will look like this: + +```py +>>> encoding = tokenizer(example["question"], example["words"], example["boxes"]) +>>> tokenizer.decode(encoding["input_ids"]) +[CLS] who is in cc in this letter? [SEP] wie baw brown & williamson tobacco corporation research & development ... +``` + +We'll need to find the position of the answer in the encoded input. +* `token_type_ids` tells us which tokens are part of the question, and which ones are part of the document's words. +* `tokenizer.cls_token_id` will help find the special token at the beginning of the input. +* `word_ids` will help match the answer found in the original `words` to the same answer in the full encoded input and determine +the start/end position of the answer in the encoded input. + +With that in mind, let's create a function to encode a batch of examples in the dataset: + +```py +>>> def encode_dataset(examples, max_length=512): +... questions = examples["question"] +... words = examples["words"] +... boxes = examples["boxes"] +... answers = examples["answer"] + +... # encode the batch of examples and initialize the start_positions and end_positions +... encoding = tokenizer(questions, words, boxes, max_length=max_length, padding="max_length", truncation=True) +... start_positions = [] +... end_positions = [] + +... # loop through the examples in the batch +... for i in range(len(questions)): +... cls_index = encoding["input_ids"][i].index(tokenizer.cls_token_id) + +... # find the position of the answer in example's words +... words_example = [word.lower() for word in words[i]] +... answer = answers[i] +... match, word_idx_start, word_idx_end = subfinder(words_example, answer.lower().split()) + +... if match: +... # if match is found, use `token_type_ids` to find where words start in the encoding +... token_type_ids = encoding["token_type_ids"][i] +... token_start_index = 0 +... while token_type_ids[token_start_index] != 1: +... token_start_index += 1 + +... token_end_index = len(encoding["input_ids"][i]) - 1 +... while token_type_ids[token_end_index] != 1: +... token_end_index -= 1 + +... word_ids = encoding.word_ids(i)[token_start_index : token_end_index + 1] +... start_position = cls_index +... end_position = cls_index + +... # loop over word_ids and increase `token_start_index` until it matches the answer position in words +... # once it matches, save the `token_start_index` as the `start_position` of the answer in the encoding +... for id in word_ids: +... if id == word_idx_start: +... start_position = token_start_index +... else: +... token_start_index += 1 + +... # similarly loop over `word_ids` starting from the end to find the `end_position` of the answer +... for id in word_ids[::-1]: +... if id == word_idx_end: +... end_position = token_end_index +... else: +... token_end_index -= 1 + +... start_positions.append(start_position) +... end_positions.append(end_position) + +... else: +... start_positions.append(cls_index) +... end_positions.append(cls_index) + +... encoding["image"] = examples["image"] +... encoding["start_positions"] = start_positions +... encoding["end_positions"] = end_positions + +... return encoding +``` + +Now that we have this preprocessing function, we can encode the entire dataset: + +```py +>>> encoded_train_dataset = dataset_with_ocr["train"].map( +... 
encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["train"].column_names +... ) +>>> encoded_test_dataset = dataset_with_ocr["test"].map( +... encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["test"].column_names +... ) +``` + +Let's check what the features of the encoded dataset look like: + +```py +>>> encoded_train_dataset.features +{'image': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='uint8', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), + 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), + 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), + 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), + 'bbox': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), + 'start_positions': Value(dtype='int64', id=None), + 'end_positions': Value(dtype='int64', id=None)} +``` + +## Evaluation + +Evaluation for document question answering requires a significant amount of postprocessing. To avoid taking up too much +of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so +you're not completely in the dark about your model's performance. Extractive question answering is typically evaluated using F1/exact match. +If you'd like to implement it yourself, check out the [Question Answering chapter](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) +of the Hugging Face course for inspiration. + +## Train + +Congratulations! You've successfully navigated the toughest part of this guide and now you are ready to train your own model. +Training involves the following steps: +* Load the model with [`AutoModelForDocumentQuestionAnswering`] using the same checkpoint as in the preprocessing. +* Define your training hyperparameters in [`TrainingArguments`]. +* Define a function to batch examples together, here the [`DefaultDataCollator`] will do just fine +* Pass the training arguments to [`Trainer`] along with the model, dataset, and data collator. +* Call [`~Trainer.train`] to finetune your model. + +```py +>>> from transformers import AutoModelForDocumentQuestionAnswering + +>>> model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint) +``` + +In the [`TrainingArguments`] use `output_dir` to specify where to save your model, and configure hyperparameters as you see fit. +If you wish to share your model with the community, set `push_to_hub` to `True` (you must be signed in to Hugging Face to upload your model). +In this case the `output_dir` will also be the name of the repo where your model checkpoint will be pushed. + +```py +>>> from transformers import TrainingArguments + +>>> # REPLACE THIS WITH YOUR REPO ID +>>> repo_id = "MariaK/layoutlmv2-base-uncased_finetuned_docvqa" + +>>> training_args = TrainingArguments( +... output_dir=repo_id, +... per_device_train_batch_size=4, +... num_train_epochs=20, +... save_steps=200, +... logging_steps=50, +... evaluation_strategy="steps", +... learning_rate=5e-5, +... save_total_limit=2, +... remove_unused_columns=False, +... push_to_hub=True, +... ) +``` + +Define a simple data collator to batch examples together. 
+ +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + +Finally, bring everything together, and call [`~Trainer.train`]: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=encoded_train_dataset, +... eval_dataset=encoded_test_dataset, +... tokenizer=processor, +... ) + +>>> trainer.train() +``` + +To add the final model to 🤗 Hub, create a model card and call `push_to_hub`: + +```py +>>> trainer.create_model_card() +>>> trainer.push_to_hub() +``` + +## Inference + +Now that you have finetuned a LayoutLMv2 model, and uploaded it to the 🤗 Hub, you can use it for inference. The simplest +way to try out your finetuned model for inference is to use it in a [`Pipeline`]. + +Let's take an example: +```py +>>> example = dataset["test"][2] +>>> question = example["query"]["en"] +>>> image = example["image"] +>>> print(question) +>>> print(example["answers"]) +'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?' +['TRRF Vice President', 'lee a. waller'] +``` + +Next, instantiate a pipeline for +document question answering with your model, and pass the image + question combination to it. + +```py +>>> from transformers import pipeline + +>>> qa_pipeline = pipeline("document-question-answering", model="MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +>>> qa_pipeline(image, question) +[{'score': 0.9949808120727539, + 'answer': 'Lee A. Waller', + 'start': 55, + 'end': 57}] +``` + +You can also manually replicate the results of the pipeline if you'd like: +1. Take an image and a question, prepare them for the model using the processor from your model. +2. Forward the result or preprocessing through the model. +3. The model returns `start_logits` and `end_logits`, which indicate which token is at the start of the answer and +which token is at the end of the answer. Both have shape (batch_size, sequence_length). +4. Take an argmax on the last dimension of both the `start_logits` and `end_logits` to get the predicted `start_idx` and `end_idx`. +5. Decode the answer with the tokenizer. + +```py +>>> import torch +>>> from transformers import AutoProcessor +>>> from transformers import AutoModelForDocumentQuestionAnswering + +>>> processor = AutoProcessor.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +>>> model = AutoModelForDocumentQuestionAnswering.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa") + +>>> with torch.no_grad(): +... encoding = processor(image.convert("RGB"), question, return_tensors="pt") +... outputs = model(**encoding) +... start_logits = outputs.start_logits +... end_logits = outputs.end_logits +... predicted_start_idx = start_logits.argmax(-1).item() +... predicted_end_idx = end_logits.argmax(-1).item() + +>>> processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]) +'lee a. waller' +``` \ No newline at end of file diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md new file mode 100644 index 000000000000..376ec8b308b0 --- /dev/null +++ b/docs/source/en/tasks/idefics.md @@ -0,0 +1,426 @@ + + +# Image tasks with IDEFICS + +[[open-in-colab]] + +While individual tasks can be tackled by fine-tuning specialized models, an alternative approach +that has recently emerged and gained popularity is to use large models for a diverse set of tasks without fine-tuning. 
+For instance, large language models can handle such NLP tasks as summarization, translation, classification, and more. +This approach is no longer limited to a single modality, such as text, and in this guide, we will illustrate how you can +solve image-text tasks with a large multimodal model called IDEFICS. + +[IDEFICS](../model_doc/idefics) is an open-access vision and language model based on [Flamingo](https://huggingface.co/papers/2204.14198), +a state-of-the-art visual language model initially developed by DeepMind. The model accepts arbitrary sequences of image +and text inputs and generates coherent text as output. It can answer questions about images, describe visual content, +create stories grounded in multiple images, and so on. IDEFICS comes in two variants - [80 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-80b) +and [9 billion parameters](https://huggingface.co/HuggingFaceM4/idefics-9b), both of which are available on the 🤗 Hub. For each variant, you can also find fine-tuned instructed +versions of the model adapted for conversational use cases. + +This model is exceptionally versatile and can be used for a wide range of image and multimodal tasks. However, +being a large model means it requires significant computational resources and infrastructure. It is up to you to decide whether +this approach suits your use case better than fine-tuning specialized models for each individual task. + +In this guide, you'll learn how to: +- [Load IDEFICS](#loading-the-model) and [load the quantized version of the model](#loading-the-quantized-version-of-the-model) +- Use IDEFICS for: + - [Image captioning](#image-captioning) + - [Prompted image captioning](#prompted-image-captioning) + - [Few-shot prompting](#few-shot-prompting) + - [Visual question answering](#visual-question-answering) + - [Image classificaiton](#image-classification) + - [Image-guided text generation](#image-guided-text-generation) +- [Run inference in batch mode](#running-inference-in-batch-mode) +- [Run IDEFICS instruct for conversational use](#idefics-instruct-for-conversational-use) + +Before you begin, make sure you have all the necessary libraries installed. + +```bash +pip install -q bitsandbytes sentencepiece accelerate transformers +``` + + +To run the following examples with a non-quantized version of the model checkpoint you will need at least 20GB of GPU memory. + + +## Loading the model + +Let's start by loading the model's 9 billion parameters checkpoint: + +```py +>>> checkpoint = "HuggingFaceM4/idefics-9b" +``` + +Just like for other Transformers models, you need to load a processor and the model itself from the checkpoint. +The IDEFICS processor wraps a [`LlamaTokenizer`] and IDEFICS image processor into a single processor to take care of +preparing text and image inputs for the model. + +```py +>>> import torch + +>>> from transformers import IdeficsForVisionText2Text, AutoProcessor + +>>> processor = AutoProcessor.from_pretrained(checkpoint) + +>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto") +``` + +Setting `device_map` to `"auto"` will automatically determine how to load and store the model weights in the most optimized +manner given existing devices. + +### Quantized model + +If high-memory GPU availability is an issue, you can load the quantized version of the model. 
To load the model and the +processor in 4bit precision, pass a `BitsAndBytesConfig` to the `from_pretrained` method and the model will be compressed +on the fly while loading. + +```py +>>> import torch +>>> from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig + +>>> quantization_config = BitsAndBytesConfig( +... load_in_4bit=True, +... bnb_4bit_compute_dtype=torch.float16, +... ) + +>>> processor = AutoProcessor.from_pretrained(checkpoint) + +>>> model = IdeficsForVisionText2Text.from_pretrained( +... checkpoint, +... quantization_config=quantization_config, +... device_map="auto" +... ) +``` + +Now that you have the model loaded in one of the suggested ways, let's move on to exploring tasks that you can use IDEFICS for. + +## Image captioning + +Image captioning is the task of predicting a caption for a given image. A common application is to aid visually impaired +people navigate through different situations, for instance, explore image content online. + +To illustrate the task, get an image to be captioned, e.g.: + +
+ Image of a puppy in a flower bed +
+
+Photo by [Hendo Wang](https://unsplash.com/@hendoo).
+
+IDEFICS accepts text and image prompts. However, to caption an image, you do not have to provide a text prompt to the
+model, only the preprocessed input image. Without a text prompt, the model will start generating text from the
+BOS (beginning-of-sequence) token, thus creating a caption.
+
+As image input to the model, you can use either an image object (`PIL.Image`) or a URL from which the image can be retrieved.
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+A puppy in a flower bed
+```
+
+<Tip>
+
+It is a good idea to include `bad_words_ids` in the call to `generate` to avoid errors when increasing
+`max_new_tokens`: without it, the model may try to generate a new `<image>` or `<fake_token_around_image>` token even
+though no image is actually being generated by the model.
+You can set it on the fly as in this guide, or store it in the `GenerationConfig` as described in the [Text generation strategies](../generation_strategies) guide.
+
+</Tip>
+
+## Prompted image captioning
+
+You can extend image captioning by providing a text prompt, which the model will continue given the image. Let's take
+another image to illustrate:
+
+
+ Image of the Eiffel Tower at night +
+
+Photo by [Denys Nevozhai](https://unsplash.com/@dnevozhai).
+
+Textual and image prompts can be passed to the model's processor as a single list to create appropriate inputs.
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...     "This is an image of ",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+This is an image of the Eiffel Tower in Paris, France.
+```
+
+## Few-shot prompting
+
+While IDEFICS demonstrates great zero-shot results, your task may require a certain format of the caption, or come with
+other restrictions or requirements that increase the task's complexity. Few-shot prompting can be used to enable in-context learning.
+By providing examples in the prompt, you can steer the model to generate results that mimic the format of the given examples.
+
+Let's use the previous image of the Eiffel Tower as an example for the model and build a prompt that demonstrates to the model
+that in addition to learning what the object in an image is, we would also like to get some interesting information about it.
+Then, let's see if we can get the same response format for an image of the Statue of Liberty:
+
+
+ Image of the Statue of Liberty +
+ +Photo by [Juan Mayobre](https://unsplash.com/@jmayobres). + +```py +>>> prompt = ["User:", +... "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", +... "Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n", +... "User:", +... "https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3387&q=80", +... "Describe this image.\nAssistant:" +... ] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=30, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +User: Describe this image. +Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building. +User: Describe this image. +Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty is 151 feet tall. +``` + +Notice that just from a single example (i.e., 1-shot) the model has learned how to perform the task. For more complex tasks, +feel free to experiment with a larger number of examples (e.g., 3-shot, 5-shot, etc.). + +## Visual question answering + +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. Similar to image +captioning it can be used in accessibility applications, but also in education (reasoning about visual materials), customer +service (questions about products based on images), and image retrieval. + +Let's get a new image for this task: + +
+ Image of a couple having a picnic +
+ +Photo by [Jarritos Mexican Soda](https://unsplash.com/@jarritos). + +You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions: + +```py +>>> prompt = [ +... "Instruction: Provide an answer to the question. Use the image to answer.\n", +... "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "Question: Where are these people and what's the weather like? Answer:" +... ] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=20, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +Instruction: Provide an answer to the question. Use the image to answer. + Question: Where are these people and what's the weather like? Answer: They're in a park in New York City, and it's a beautiful day. +``` + +## Image classification + +IDEFICS is capable of classifying images into different categories without being explicitly trained on data containing +labeled examples from those specific categories. Given a list of categories and using its image and text understanding +capabilities, the model can infer which category the image likely belongs to. + +Say, we have this image of a vegetable stand: + +
+ Image of a vegetable stand +
+ +Photo by [Peter Wendt](https://unsplash.com/@peterwendt). + +We can instruct the model to classify the image into one of the categories that we have: + +```py +>>> categories = ['animals','vegetables', 'city landscape', 'cars', 'office'] +>>> prompt = [f"Instruction: Classify the following image into a single category from the following list: {categories}.\n", +... "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "Category: " +... ] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=6, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +Instruction: Classify the following image into a single category from the following list: ['animals', 'vegetables', 'city landscape', 'cars', 'office']. +Category: Vegetables +``` + +In the example above we instruct the model to classify the image into a single category, however, you can also prompt the model to do rank classification. + +## Image-guided text generation + +For more creative applications, you can use image-guided text generation to generate text based on an image. This can be +useful to create descriptions of products, ads, descriptions of a scene, etc. + +Let's prompt IDEFICS to write a story based on a simple image of a red door: + +
+ Image of a red door with a pumpkin on the steps +
+ +Photo by [Craig Tidball](https://unsplash.com/@devonshiremedia). + +```py +>>> prompt = ["Instruction: Use the image to write a story. \n", +... "https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=2203&q=80", +... "Story: \n"] + +>>> inputs = processor(prompt, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, num_beams=2, max_new_tokens=200, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> print(generated_text[0]) +Instruction: Use the image to write a story. + Story: +Once upon a time, there was a little girl who lived in a house with a red door. She loved her red door. It was the prettiest door in the whole world. + +One day, the little girl was playing in her yard when she noticed a man standing on her doorstep. He was wearing a long black coat and a top hat. + +The little girl ran inside and told her mother about the man. + +Her mother said, “Don’t worry, honey. He’s just a friendly ghost.” + +The little girl wasn’t sure if she believed her mother, but she went outside anyway. + +When she got to the door, the man was gone. + +The next day, the little girl was playing in her yard again when she noticed the man standing on her doorstep. + +He was wearing a long black coat and a top hat. + +The little girl ran +``` + +Looks like IDEFICS noticed the pumpkin on the doorstep and went with a spooky Halloween story about a ghost. + + + +For longer outputs like this, you will greatly benefit from tweaking the text generation strategy. This can help +you significantly improve the quality of the generated output. Check out [Text generation strategies](../generation_strategies) +to learn more. + + +## Running inference in batch mode + +All of the earlier sections illustrated IDEFICS for a single example. In a very similar fashion, you can run inference +for a batch of examples by passing a list of prompts: + +```py +>>> prompts = [ +... [ "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", +... "This is an image of ", +... ], +... [ "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "This is an image of ", +... ], +... [ "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", +... "This is an image of ", +... ], +... ] + +>>> inputs = processor(prompts, return_tensors="pt").to("cuda") +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> for i,t in enumerate(generated_text): +... print(f"{i}:\n{t}\n") +0: +This is an image of the Eiffel Tower in Paris, France. + +1: +This is an image of a couple on a picnic blanket. + +2: +This is an image of a vegetable stand. 
+``` + +## IDEFICS instruct for conversational use + +For conversational use cases, you can find fine-tuned instructed versions of the model on the 🤗 Hub: +`HuggingFaceM4/idefics-80b-instruct` and `HuggingFaceM4/idefics-9b-instruct`. + +These checkpoints are the result of fine-tuning the respective base models on a mixture of supervised and instruction +fine-tuning datasets, which boosts the downstream performance while making the models more usable in conversational settings. + +The use and prompting for the conversational use is very similar to using the base models: + +```py +>>> import torch +>>> from transformers import IdeficsForVisionText2Text, AutoProcessor + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" + +>>> checkpoint = "HuggingFaceM4/idefics-9b-instruct" +>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) +>>> processor = AutoProcessor.from_pretrained(checkpoint) + +>>> prompts = [ +... [ +... "User: What is in this image?", +... "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG", +... "", + +... "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.", + +... "\nUser:", +... "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052", +... "And who is that?", + +... "\nAssistant:", +... ], +... ] + +>>> # --batched mode +>>> inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device) +>>> # --single sample mode +>>> # inputs = processor(prompts[0], return_tensors="pt").to(device) + +>>> # Generation args +>>> exit_condition = processor.tokenizer("", add_special_tokens=False).input_ids +>>> bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + +>>> generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +>>> for i, t in enumerate(generated_text): +... print(f"{i}:\n{t}\n") +``` diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md new file mode 100644 index 000000000000..71e81b4651bd --- /dev/null +++ b/docs/source/en/tasks/image_captioning.md @@ -0,0 +1,276 @@ + + + +# Image captioning + +[[open-in-colab]] + +Image captioning is the task of predicting a caption for a given image. Common real world applications of it include +aiding visually impaired people that can help them navigate through different situations. Therefore, image captioning +helps to improve content accessibility for people by describing images to them. + +This guide will show you how to: + +* Fine-tune an image captioning model. +* Use the fine-tuned model for inference. + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate -q +pip install jiwer -q +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +## Load the Pokémon BLIP captions dataset + +Use the 🤗 Dataset library to load a dataset that consists of {image-caption} pairs. 
To create your own image captioning dataset +in PyTorch, you can follow [this notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb). + + +```python +from datasets import load_dataset + +ds = load_dataset("lambdalabs/pokemon-blip-captions") +ds +``` +```bash +DatasetDict({ + train: Dataset({ + features: ['image', 'text'], + num_rows: 833 + }) +}) +``` + +The dataset has two features, `image` and `text`. + + + +Many image captioning datasets contain multiple captions per image. In those cases, a common strategy is to randomly sample a caption amongst the available ones during training. + + + +Split the dataset’s train split into a train and test set with the [~datasets.Dataset.train_test_split] method: + + +```python +ds = ds["train"].train_test_split(test_size=0.1) +train_ds = ds["train"] +test_ds = ds["test"] +``` + +Let's visualize a couple of samples from the training set. + + +```python +from textwrap import wrap +import matplotlib.pyplot as plt +import numpy as np + + +def plot_images(images, captions): + plt.figure(figsize=(20, 20)) + for i in range(len(images)): + ax = plt.subplot(1, len(images), i + 1) + caption = captions[i] + caption = "\n".join(wrap(caption, 12)) + plt.title(caption) + plt.imshow(images[i]) + plt.axis("off") + + +sample_images_to_visualize = [np.array(train_ds[i]["image"]) for i in range(5)] +sample_captions = [train_ds[i]["text"] for i in range(5)] +plot_images(sample_images_to_visualize, sample_captions) +``` + +
+ Sample training images +
+ +## Preprocess the dataset + +Since the dataset has two modalities (image and text), the pre-processing pipeline will preprocess images and the captions. + +To do so, load the processor class associated with the model you are about to fine-tune. + +```python +from transformers import AutoProcessor + +checkpoint = "microsoft/git-base" +processor = AutoProcessor.from_pretrained(checkpoint) +``` + +The processor will internally pre-process the image (which includes resizing, and pixel scaling) and tokenize the caption. + +```python +def transforms(example_batch): + images = [x for x in example_batch["image"]] + captions = [x for x in example_batch["text"]] + inputs = processor(images=images, text=captions, padding="max_length") + inputs.update({"labels": inputs["input_ids"]}) + return inputs + + +train_ds.set_transform(transforms) +test_ds.set_transform(transforms) +``` + +With the dataset ready, you can now set up the model for fine-tuning. + +## Load a base model + +Load the ["microsoft/git-base"](https://huggingface.co/microsoft/git-base) into a [`AutoModelForCausalLM`](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM) object. + + +```python +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained(checkpoint) +``` + +## Evaluate + +Image captioning models are typically evaluated with the [Rouge Score](https://huggingface.co/spaces/evaluate-metric/rouge) or [Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer). For this guide, you will use the Word Error Rate (WER). + +We use the 🤗 Evaluate library to do so. For potential limitations and other gotchas of the WER, refer to [this guide](https://huggingface.co/spaces/evaluate-metric/wer). + + +```python +from evaluate import load +import torch + +wer = load("wer") + + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predicted = logits.argmax(-1) + decoded_labels = processor.batch_decode(labels, skip_special_tokens=True) + decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True) + wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels) + return {"wer_score": wer_score} +``` + +## Train! + +Now, you are ready to start fine-tuning the model. You will use the 🤗 [`Trainer`] for this. + +First, define the training arguments using [`TrainingArguments`]. + + +```python +from transformers import TrainingArguments, Trainer + +model_name = checkpoint.split("/")[1] + +training_args = TrainingArguments( + output_dir=f"{model_name}-pokemon", + learning_rate=5e-5, + num_train_epochs=50, + fp16=True, + per_device_train_batch_size=32, + per_device_eval_batch_size=32, + gradient_accumulation_steps=2, + save_total_limit=3, + evaluation_strategy="steps", + eval_steps=50, + save_strategy="steps", + save_steps=50, + logging_steps=50, + remove_unused_columns=False, + push_to_hub=True, + label_names=["labels"], + load_best_model_at_end=True, +) +``` + +Then pass them along with the datasets and the model to 🤗 Trainer. + +```python +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=test_ds, + compute_metrics=compute_metrics, +) +``` + +To start training, simply call [`~Trainer.train`] on the [`Trainer`] object. + +```python +trainer.train() +``` + +You should see the training loss drop smoothly as training progresses. 
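+
+If you'd also like to check the word error rate defined earlier on the held-out split, you can call [`~Trainer.evaluate`]
+once training finishes. This is a minimal sketch that assumes the `trainer` configured above (with `eval_dataset=test_ds`
+and the `compute_metrics` function); the exact reported key may differ slightly depending on your setup.
+
+```python
+# Run evaluation on the eval_dataset passed to the Trainer (test_ds here).
+metrics = trainer.evaluate()
+
+# Trainer prefixes metric names with "eval_", so the score returned by
+# compute_metrics shows up as "eval_wer_score".
+print(metrics["eval_wer_score"])
+```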
+ +Once training is completed, share your model to the Hub with the [`~Trainer.push_to_hub`] method so everyone can use your model: + + +```python +trainer.push_to_hub() +``` + +## Inference + +Take a sample image from `test_ds` to test the model. + + +```python +from PIL import Image +import requests + +url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png" +image = Image.open(requests.get(url, stream=True).raw) +image +``` + +
+ Test image +
+ +Prepare image for the model. + +```python +device = "cuda" if torch.cuda.is_available() else "cpu" + +inputs = processor(images=image, return_tensors="pt").to(device) +pixel_values = inputs.pixel_values +``` + +Call [`generate`] and decode the predictions. + +```python +generated_ids = model.generate(pixel_values=pixel_values, max_length=50) +generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(generated_caption) +``` +```bash +a drawing of a pink and blue pokemon +``` + +Looks like the fine-tuned model generated a pretty good caption! diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md new file mode 100644 index 000000000000..489ec59ddf6a --- /dev/null +++ b/docs/source/en/tasks/image_classification.md @@ -0,0 +1,547 @@ + + +# Image classification + +[[open-in-colab]] + + + +Image classification assigns a label or class to an image. Unlike text or audio classification, the inputs are the +pixel values that comprise an image. There are many applications for image classification, such as detecting damage +after a natural disaster, monitoring crop health, or helping screen medical images for signs of disease. + +This guide illustrates how to: + +1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. +2. Use your fine-tuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load Food-101 dataset + +Start by loading a smaller subset of the Food-101 dataset from the 🤗 Datasets library. This will give you a chance to +experiment and make sure everything works before spending more time training on the full dataset. 
+ +```py +>>> from datasets import load_dataset + +>>> food = load_dataset("food101", split="train[:5000]") +``` + +Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> food = food.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> food["train"][0] +{'image': , + 'label': 79} +``` + +Each example in the dataset has two fields: + +- `image`: a PIL image of the food item +- `label`: the label class of the food item + +To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name +to an integer and vice versa: + +```py +>>> labels = food["train"].features["label"].names +>>> label2id, id2label = dict(), dict() +>>> for i, label in enumerate(labels): +... label2id[label] = str(i) +... id2label[str(i)] = label +``` + +Now you can convert the label id to a label name: + +```py +>>> id2label[str(79)] +'prime_rib' +``` + +## Preprocess + +The next step is to load a ViT image processor to process the image into a tensor: + +```py +>>> from transformers import AutoImageProcessor + +>>> checkpoint = "google/vit-base-patch16-224-in21k" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +``` + + + +Apply some image transformations to the images to make the model more robust against overfitting. Here you'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module, but you can also use any image library you like. + +Crop a random part of the image, resize it, and normalize it with the image mean and standard deviation: + +```py +>>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor + +>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +>>> size = ( +... image_processor.size["shortest_edge"] +... if "shortest_edge" in image_processor.size +... else (image_processor.size["height"], image_processor.size["width"]) +... ) +>>> _transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize]) +``` + +Then create a preprocessing function to apply the transforms and return the `pixel_values` - the inputs to the model - of the image: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] +... del examples["image"] +... return examples +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.with_transform`] method. The transforms are applied on the fly when you load an element of the dataset: + +```py +>>> food = food.with_transform(transforms) +``` + +Now create a batch of examples using [`DefaultDataCollator`]. Unlike other data collators in 🤗 Transformers, the `DefaultDataCollator` does not apply additional preprocessing such as padding. + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + + + + + + +To avoid overfitting and to make the model more robust, add some data augmentation to the training part of the dataset. +Here we use Keras preprocessing layers to define the transformations for the training data (includes data augmentation), +and transformations for the validation data (only center cropping, resizing and normalizing). You can use `tf.image`or +any other library you prefer. 
+ +```py +>>> from tensorflow import keras +>>> from tensorflow.keras import layers + +>>> size = (image_processor.size["height"], image_processor.size["width"]) + +>>> train_data_augmentation = keras.Sequential( +... [ +... layers.RandomCrop(size[0], size[1]), +... layers.Rescaling(scale=1.0 / 127.5, offset=-1), +... layers.RandomFlip("horizontal"), +... layers.RandomRotation(factor=0.02), +... layers.RandomZoom(height_factor=0.2, width_factor=0.2), +... ], +... name="train_data_augmentation", +... ) + +>>> val_data_augmentation = keras.Sequential( +... [ +... layers.CenterCrop(size[0], size[1]), +... layers.Rescaling(scale=1.0 / 127.5, offset=-1), +... ], +... name="val_data_augmentation", +... ) +``` + +Next, create functions to apply appropriate transformations to a batch of images, instead of one image at a time. + +```py +>>> import numpy as np +>>> import tensorflow as tf +>>> from PIL import Image + + +>>> def convert_to_tf_tensor(image: Image): +... np_image = np.array(image) +... tf_image = tf.convert_to_tensor(np_image) +... # `expand_dims()` is used to add a batch dimension since +... # the TF augmentation layers operates on batched inputs. +... return tf.expand_dims(tf_image, 0) + + +>>> def preprocess_train(example_batch): +... """Apply train_transforms across a batch.""" +... images = [ +... train_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"] +... ] +... example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images] +... return example_batch + + +... def preprocess_val(example_batch): +... """Apply val_transforms across a batch.""" +... images = [ +... val_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"] +... ] +... example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images] +... return example_batch +``` + +Use 🤗 Datasets [`~datasets.Dataset.set_transform`] to apply the transformations on the fly: + +```py +food["train"].set_transform(preprocess_train) +food["test"].set_transform(preprocess_val) +``` + +As a final preprocessing step, create a batch of examples using `DefaultDataCollator`. Unlike other data collators in 🤗 Transformers, the +`DefaultDataCollator` does not apply additional preprocessing, such as padding. + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an +evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load +the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training. 
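+
+If you'd like to sanity-check the function before training, you can call it on a couple of dummy values. The logits and
+labels below are made up purely for illustration and are not part of the Food-101 dataset.
+
+```py
+>>> dummy_logits = np.array([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1]])  # two samples, three hypothetical classes
+>>> dummy_labels = np.array([1, 2])  # the second prediction (argmax = 0) is wrong
+>>> compute_metrics((dummy_logits, dummy_labels))
+{'accuracy': 0.5}
+```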
+ +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load ViT with [`AutoModelForImageClassification`]. Specify the number of labels along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer + +>>> model = AutoModelForImageClassification.from_pretrained( +... checkpoint, +... num_labels=len(labels), +... id2label=id2label, +... label2id=label2id, +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because that'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_food_model", +... remove_unused_columns=False, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... gradient_accumulation_steps=4, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... warmup_ratio=0.1, +... logging_steps=10, +... load_best_model_at_end=True, +... metric_for_best_model="accuracy", +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=food["train"], +... eval_dataset=food["test"], +... tokenizer=image_processor, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + + + + +If you are unfamiliar with fine-tuning a model with Keras, check out the [basic tutorial](./training#train-a-tensorflow-model-with-keras) first! + + + +To fine-tune a model in TensorFlow, follow these steps: +1. Define the training hyperparameters, and set up an optimizer and a learning rate schedule. +2. Instantiate a pre-trained model. +3. Convert a 🤗 Dataset to a `tf.data.Dataset`. +4. Compile your model. +5. Add callbacks and use the `fit()` method to run the training. +6. Upload your model to 🤗 Hub to share with the community. + +Start by defining the hyperparameters, optimizer and learning rate schedule: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 5 +>>> num_train_steps = len(food["train"]) * num_epochs +>>> learning_rate = 3e-5 +>>> weight_decay_rate = 0.01 + +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=learning_rate, +... num_train_steps=num_train_steps, +... weight_decay_rate=weight_decay_rate, +... num_warmup_steps=0, +... 
) +``` + +Then, load ViT with [`TFAutoModelForImageClassification`] along with the label mappings: + +```py +>>> from transformers import TFAutoModelForImageClassification + +>>> model = TFAutoModelForImageClassification.from_pretrained( +... checkpoint, +... id2label=id2label, +... label2id=label2id, +... ) +``` + +Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Dataset.to_tf_dataset`] and your `data_collator`: + +```py +>>> # converting our train dataset to tf.data.Dataset +>>> tf_train_dataset = food["train"].to_tf_dataset( +... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator +... ) + +>>> # converting our test dataset to tf.data.Dataset +>>> tf_eval_dataset = food["test"].to_tf_dataset( +... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator +... ) +``` + +Configure the model for training with `compile()`: + +```py +>>> from tensorflow.keras.losses import SparseCategoricalCrossentropy + +>>> loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) +>>> model.compile(optimizer=optimizer, loss=loss) +``` + +To compute the accuracy from the predictions and push your model to the 🤗 Hub, use [Keras callbacks](../main_classes/keras_callbacks). +Pass your `compute_metrics` function to [KerasMetricCallback](../main_classes/keras_callbacks#transformers.KerasMetricCallback), +and use the [PushToHubCallback](../main_classes/keras_callbacks#transformers.PushToHubCallback) to upload the model: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="food_classifier", +... tokenizer=image_processor, +... save_strategy="no", +... ) +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you are ready to train your model! Call `fit()` with your training and validation datasets, the number of epochs, +and your callbacks to fine-tune the model: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks) +Epoch 1/5 +250/250 [==============================] - 313s 1s/step - loss: 2.5623 - val_loss: 1.4161 - accuracy: 0.9290 +Epoch 2/5 +250/250 [==============================] - 265s 1s/step - loss: 0.9181 - val_loss: 0.6808 - accuracy: 0.9690 +Epoch 3/5 +250/250 [==============================] - 252s 1s/step - loss: 0.3910 - val_loss: 0.4303 - accuracy: 0.9820 +Epoch 4/5 +250/250 [==============================] - 251s 1s/step - loss: 0.2028 - val_loss: 0.3191 - accuracy: 0.9900 +Epoch 5/5 +250/250 [==============================] - 238s 949ms/step - loss: 0.1232 - val_loss: 0.3259 - accuracy: 0.9890 +``` + +Congratulations! You have fine-tuned your model and shared it on the 🤗 Hub. You can now use it for inference! + + + + + + +For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). + + + +## Inference + +Great, now that you've fine-tuned a model, you can use it for inference! + +Load an image you'd like to run inference on: + +```py +>>> ds = load_dataset("food101", split="validation[:10]") +>>> image = ds["image"][0] +``` + +
+ image of beignets +
+ +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image classification with your model, and pass your image to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("image-classification", model="my_awesome_food_model") +>>> classifier(image) +[{'score': 0.31856709718704224, 'label': 'beignets'}, + {'score': 0.015232225880026817, 'label': 'bruschetta'}, + {'score': 0.01519392803311348, 'label': 'chicken_wings'}, + {'score': 0.013022331520915031, 'label': 'pork_chop'}, + {'score': 0.012728818692266941, 'label': 'prime_rib'}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Load an image processor to preprocess the image and return the `input` as PyTorch tensors: + +```py +>>> from transformers import AutoImageProcessor +>>> import torch + +>>> image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model") +>>> inputs = image_processor(image, return_tensors="pt") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import AutoModelForImageClassification + +>>> model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the predicted label with the highest probability, and use the model's `id2label` mapping to convert it to a label: + +```py +>>> predicted_label = logits.argmax(-1).item() +>>> model.config.id2label[predicted_label] +'beignets' +``` + + + + + +Load an image processor to preprocess the image and return the `input` as TensorFlow tensors: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/food_classifier") +>>> inputs = image_processor(image, return_tensors="tf") +``` + +Pass your inputs to the model and return the logits: + +```py +>>> from transformers import TFAutoModelForImageClassification + +>>> model = TFAutoModelForImageClassification.from_pretrained("MariaK/food_classifier") +>>> logits = model(**inputs).logits +``` + +Get the predicted label with the highest probability, and use the model's `id2label` mapping to convert it to a label: + +```py +>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +>>> model.config.id2label[predicted_class_id] +'beignets' +``` + + + diff --git a/docs/source/en/tasks/image_classification.mdx b/docs/source/en/tasks/image_classification.mdx deleted file mode 100644 index 2543db6d2877..000000000000 --- a/docs/source/en/tasks/image_classification.mdx +++ /dev/null @@ -1,297 +0,0 @@ - - -# Image classification - -[[open-in-colab]] - - - -Image classification assigns a label or class to an image. Unlike text or audio classification, the inputs are the pixel values that comprise an image. There are many applications for image classification such as detecting damage after a natural disaster, monitoring crop health, or helping screen medical images for signs of disease. - -This guide will show you how to: - -1. Finetune [ViT](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. -2. Use your finetuned model for inference. - - - -See the image classification [task page](https://huggingface.co/tasks/image-classification) for more information about its associated models, datasets, and metrics. 
- - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load Food-101 dataset - -Start by loading a smaller subset of the Food-101 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> food = load_dataset("food101", split="train[:5000]") -``` - -Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> food = food.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> food["train"][0] -{'image': , - 'label': 79} -``` - -There are two fields: - -- `image`: a PIL image of the food item. -- `label`: the label class of the food item. - -To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa: - -```py ->>> labels = food["train"].features["label"].names ->>> label2id, id2label = dict(), dict() ->>> for i, label in enumerate(labels): -... label2id[label] = str(i) -... id2label[str(i)] = label -``` - -Now you can convert the label id to a label name: - -```py ->>> id2label[str(79)] -'prime_rib' -``` - -## Preprocess - -The next step is to load a ViT image processor to process the image into a tensor: - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") -``` - -Apply some image transformations to the images to make the model more robust against overfitting. Here you'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module, but you can also use any image library you like. - -Crop a random part of the image, resize it, and normalize it with the image mean and standard deviation: - -```py ->>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor - ->>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) ->>> size = ( -... image_processor.size["shortest_edge"] -... if "shortest_edge" in image_processor.size -... else (image_processor.size["height"], image_processor.size["width"]) -... ) ->>> _transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize]) -``` - -Then create a preprocessing function to apply the transforms and return the `pixel_values` - the inputs to the model - of the image: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] -... del examples["image"] -... return examples -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.with_transform`] method. The transforms are applied on the fly when you load an element of the dataset: - -```py ->>> food = food.with_transform(transforms) -``` - -Now create a batch of examples using [`DataCollatorWithPadding`]. Unlike other data collators in 🤗 Transformers, the `DefaultDataCollator` does not apply additional preprocessing such as padding. 
- -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions = np.argmax(eval_pred.predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=eval_pred.label_ids) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load ViT with [`AutoModelForImageClassification`]. Specify the number of labels along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer - ->>> model = AutoModelForImageClassification.from_pretrained( -... "google/vit-base-patch16-224-in21k", -... num_labels=len(labels), -... id2label=id2label, -... label2id=label2id, -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because this'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_food_model", -... remove_unused_columns=False, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... learning_rate=5e-5, -... per_device_train_batch_size=16, -... gradient_accumulation_steps=4, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... warmup_ratio=0.1, -... logging_steps=10, -... load_best_model_at_end=True, -... metric_for_best_model="accuracy", -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... data_collator=data_collator, -... train_dataset=food["train"], -... eval_dataset=food["test"], -... tokenizer=image_processor, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - - -For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an image you'd like to run inference on: - -```py ->>> ds = load_dataset("food101", split="validation[:10]") ->>> image = ds["image"][0] -``` - -
- image of beignets -
- -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image classification with your model, and pass your image to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("image-classification", model="my_awesome_food_model") ->>> classifier(image) -[{'score': 0.35574808716773987, 'label': 'beignets'}, - {'score': 0.018057454377412796, 'label': 'chicken_wings'}, - {'score': 0.017733804881572723, 'label': 'prime_rib'}, - {'score': 0.016335085034370422, 'label': 'bruschetta'}, - {'score': 0.0160061065107584, 'label': 'ramen'}] -``` -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Load an image processor to preprocess the image and return the `input` as PyTorch tensors: - -```py ->>> from transformers import AutoImageProcessor ->>> import torch - ->>> image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model") ->>> inputs = image_processor(image, return_tensors="pt") -``` - -Pass your inputs to the model and return the logits: - -```py ->>> from transformers import AutoModelForImageClassification - ->>> model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the predicted label with the highest probability, and use the model's `id2label` mapping to convert it to a label: - -```py ->>> predicted_label = logits.argmax(-1).item() ->>> model.config.id2label[predicted_label] -'beignets' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md new file mode 100644 index 000000000000..42406788214c --- /dev/null +++ b/docs/source/en/tasks/language_modeling.md @@ -0,0 +1,425 @@ + + +# Causal language modeling + +[[open-in-colab]] + +There are two types of language modeling, causal and masked. This guide illustrates causal language modeling. +Causal language models are frequently used for text generation. You can use these models for creative applications like +choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot. + + + +Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on +the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model. + +This guide will show you how to: + +1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset. +2. Use your finetuned model for inference. + + +You can finetune other architectures for causal language modeling following the same steps in this guide. 
+Choose one of the following architectures: + + +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) + + + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load ELI5 dataset + +Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. + This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5", split="train_asks[:5000]") +``` + +Split the dataset's `train_asks` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> eli5 = eli5.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> eli5["train"][0] +{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], + 'score': [6, 3], + 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. 
That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, + 'answers_urls': {'url': []}, + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls': {'url': []}} +``` + +While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling +tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label. + +## Preprocess + + + +The next step is to load a DistilGPT2 tokenizer to process the `text` subfield: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +``` + +You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to +extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method: + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'answers.a_id': ['c3d1aib', 'c3d4lya'], + 'answers.score': [6, 3], + 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], + 'answers_urls.url': [], + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls.url': []} +``` + +Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. 
Instead +of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. + +Here is a first preprocessing function to join the list of strings for each example and tokenize the result: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]]) +``` + +To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. + +You can now use a second preprocessing function to +- concatenate all the sequences +- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM. + +```py +>>> block_size = 128 + + +>>> def group_texts(examples): +... # Concatenate all texts. +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can +... # customize this part to your needs. +... if total_length >= block_size: +... total_length = (total_length // block_size) * block_size +... # Split by chunks of block_size. +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... result["labels"] = result["input_ids"].copy() +... return result +``` + +Apply the `group_texts` function over the entire dataset: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +Now create a batch of examples using [`DataCollatorForLanguageModeling`]. It's more efficient to *dynamically pad* the +sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + +Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) +``` + + + +Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") +``` + + + + + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the [basic tutorial](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! 
Load DistilGPT2 with [`AutoModelForCausalLM`]: + +```py +>>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). +2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_eli5_clm-model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + +Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: + +```py +>>> import math + +>>> eval_results = trainer.evaluate() +>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 49.61 +``` + +Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the [basic tutorial](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load DistilGPT2 with [`TFAutoModelForCausalLM`]: + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... lm_dataset["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... lm_dataset["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_eli5_clm-model", +... tokenizer=tokenizer, +... ) +``` + +Finally, you're ready to start training your model! 
Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with a prompt you'd like to generate text from: + +```py +>>> prompt = "Somatic hypermutation allows the immune system to" +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for text generation with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline("text-generation", model="my_awesome_eli5_clm-model") +>>> generator(prompt) +[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}] +``` + + + +Tokenize the text and return the `input_ids` as PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") +>>> inputs = tokenizer(prompt, return_tensors="pt").input_ids +``` + +Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to generate text. +For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](../generation_strategies) page. + +```py +>>> from transformers import AutoModelForCausalLM + +>>> model = AutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"] +``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") +>>> inputs = tokenizer(prompt, return_tensors="tf").input_ids +``` + +Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to generate text.
For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](../generation_strategies) page. + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") +>>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for'] +``` + + diff --git a/docs/source/en/tasks/language_modeling.mdx b/docs/source/en/tasks/language_modeling.mdx deleted file mode 100644 index eaf8fdc947f1..000000000000 --- a/docs/source/en/tasks/language_modeling.mdx +++ /dev/null @@ -1,669 +0,0 @@ - - -# Language modeling - -Language modeling tasks predicts words in a sentence, making these types of models great at generating text. You can use these models for creative applications like choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot. There are two types of language modeling, causal and masked. - - - -Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model. - - - -Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This means the model has full access to the tokens on the left and right. BERT is an example of a masked language model. - -This guide will show you how to: - -1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) for causal language modeling and [DistilRoBERTa](https://huggingface.co/distilroberta-base) for masked language modeling on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset. -2. Use your finetuned model for inference. - - - -You can finetune other architectures for language modeling such as [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B), and [BERT](https://huggingface.co/bert-base-uncased), following the same steps in this guide! See the text generation [task page](https://huggingface.co/tasks/text-generation) and fill mask [task page](https://huggingface.co/tasks/fill-mask) for more information about their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load ELI5 dataset - -Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> eli5 = load_dataset("eli5", split="train_asks[:5000]") -``` - -Split the dataset's `train_asks` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> eli5 = eli5.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> eli5["train"][0] -{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], - 'score': [6, 3], - 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, - 'answers_urls': {'url': []}, - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls': {'url': []}} -``` - -While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label. - -## Preprocess - - - -For causal language modeling, the next step is to load a DistilGPT2 tokenizer to process the `text` subfield: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") -``` - - - -For masked language modeling, the next step is to load a DistilRoBERTa tokenizer to process the `text` subfield: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") -``` - -You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method: - -```py ->>> eli5 = eli5.flatten() ->>> eli5["train"][0] -{'answers.a_id': ['c3d1aib', 'c3d4lya'], - 'answers.score': [6, 3], - 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. 
I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], - 'answers_urls.url': [], - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls.url': []} -``` - -Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. - -Here is how you can create a preprocessing function to convert the list to a string, and truncate sequences to be no longer than DistilGPT2's maximum input length: - -```py ->>> def preprocess_function(examples): -... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.with_transform`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need: - -```py ->>> tokenized_eli5 = eli5.map( -... preprocess_function, -... batched=True, -... num_proc=4, -... remove_columns=eli5["train"].column_names, -... ) -``` - -Now you'll need a second preprocessing function to capture text truncated from the lengthier examples to avoid losing any information. This preprocessing function should: - -- Concatenate all the text. -- Split the concatenated text into smaller chunks defined by `block_size`. - -```py ->>> block_size = 128 - - ->>> def group_texts(examples): -... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} -... total_length = len(concatenated_examples[list(examples.keys())[0]]) -... total_length = (total_length // block_size) * block_size -... result = { -... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] -... for k, t in concatenated_examples.items() -... } -... result["labels"] = result["input_ids"].copy() -... return result -``` - -Apply the `group_texts` function over the entire dataset: - -```py ->>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) -``` - -Now create a batch of examples using [`DataCollatorForLanguageModeling`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -For causal language modeling, use the end-of-sequence token as the padding token and set `mlm=False`. 
This will use the inputs as labels shifted to the right by one element: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) -``` - -For masked language modeling, use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) -``` - - -For causal language modeling, use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") -``` - -For masked language modeling, use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") -``` - - - -## Causal language modeling - -Causal language models are frequently used for text generation. This section shows you how to finetune [DistilGPT2](https://huggingface.co/distilgpt2) to generate new text. - -### Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilGPT2 with [`AutoModelForCausalLM`]: - -```py ->>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer - ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). -2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_eli5_clm-model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... 
) - ->>> trainer.train() -``` - -Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: - -```py ->>> import math - ->>> eval_results = trainer.evaluate() ->>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") -Perplexity: 49.61 -``` - -Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load DistilGPT2 with [`TFAutoModelForCausalLM`]: - -```py ->>> from transformers import TFAutoModelForCausalLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... lm_dataset["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = model.prepare_tf_dataset( -... lm_dataset["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> callback = PushToHubCallback( -... output_dir="my_awesome_eli5_clm-model", -... tokenizer=tokenizer, -... ) -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). - - - -### Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with a prompt you'd like to generate text from: - -```py ->>> prompt = "Somatic hypermutation allows the immune system to" -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for text generation with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline("text-generation", model="my_awesome_eli5_clm-model") ->>> generator(prompt) -[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}] -``` - - - -Tokenize the text and return the `input_ids` as PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") ->>> inputs = tokenizer(prompt, return_tensors="pt").input_ids -``` - -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to generate text. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import AutoModelForCausalLM - ->>> model = AutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") ->>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"] -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model") ->>> inputs = tokenizer(prompt, return_tensors="tf").input_ids -``` - -Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import TFAutoModelForCausalLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model") ->>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. 
A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for'] -``` - - - -## Masked language modeling - -Masked language modeling are good for tasks that require a good contextual understanding of an entire sequence. This section shows you how to finetune [DistilRoBERTa](https://huggingface.co/distilroberta-base) to predict a masked word. - -### Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilRoBERTa with [`AutoModelForMaskedLM`]: - -```py ->>> from transformers import AutoModelForMaskedLM - ->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). -2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_eli5_mlm_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... num_train_epochs=3, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - -Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: - -```py ->>> import math - ->>> eval_results = trainer.evaluate() ->>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") -Perplexity: 8.76 -``` - -Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load DistilRoBERTa with [`TFAutoModelForMaskedLM`]: - -```py ->>> from transformers import TFAutoModelForMaskedLM - ->>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... lm_dataset["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = model.prepare_tf_dataset( -... lm_dataset["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... 
) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> callback = PushToHubCallback( -... output_dir="my_awesome_eli5_mlm_model", -... tokenizer=tokenizer, -... ) -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for masked language modeling, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). - - - -### Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text you'd like the model to fill in the blank with, and use the special `` token to indicate the blank: - -```py ->>> text = "The Milky Way is a galaxy." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for fill-mask with your model, and pass your text to it. If you like, you can use the `top_k` parameter to specify how many predictions to return: - -```py ->>> from transformers import pipeline - ->>> mask_filler = pipeline("fill-mask", "stevhliu/my_awesome_eli5_mlm_model") ->>> mask_filler(text, top_k=3) -[{'score': 0.5150994658470154, - 'token': 21300, - 'token_str': ' spiral', - 'sequence': 'The Milky Way is a spiral galaxy.'}, - {'score': 0.07087188959121704, - 'token': 2232, - 'token_str': ' massive', - 'sequence': 'The Milky Way is a massive galaxy.'}, - {'score': 0.06434620916843414, - 'token': 650, - 'token_str': ' small', - 'sequence': 'The Milky Way is a small galaxy.'}] -``` - - - -Tokenize the text and return the `input_ids` as PyTorch tensors. You'll also need to specify the position of the `` token: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_mlm_model") ->>> inputs = tokenizer(text, return_tensors="pt") ->>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] -``` - -Pass your inputs to the model and return the `logits` of the masked token: - -```py ->>> from transformers import AutoModelForMaskedLM - ->>> model = AutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") ->>> logits = model(**inputs).logits ->>> mask_token_logits = logits[0, mask_token_index, :] -``` - -Then return the three masked tokens with the highest probability and print them out: - -```py ->>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist() - ->>> for token in top_3_tokens: -... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) -The Milky Way is a spiral galaxy. -The Milky Way is a massive galaxy. 
-The Milky Way is a small galaxy. -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors. You'll also need to specify the position of the `` token: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_mlm_model") ->>> inputs = tokenizer(text, return_tensors="tf") ->>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] -``` - -Pass your inputs to the model and return the `logits` of the masked token: - -```py ->>> from transformers import TFAutoModelForMaskedLM - ->>> model = TFAutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") ->>> logits = model(**inputs).logits ->>> mask_token_logits = logits[0, mask_token_index, :] -``` - -Then return the three masked tokens with the highest probability and print them out: - -```py ->>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy() - ->>> for token in top_3_tokens: -... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) -The Milky Way is a spiral galaxy. -The Milky Way is a massive galaxy. -The Milky Way is a small galaxy. -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md new file mode 100644 index 000000000000..ba1e9e50dbe8 --- /dev/null +++ b/docs/source/en/tasks/masked_language_modeling.md @@ -0,0 +1,442 @@ + + +# Masked language modeling + +[[open-in-colab]] + + + +Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This +means the model has full access to the tokens on the left and right. Masked language modeling is great for tasks that +require a good contextual understanding of an entire sequence. BERT is an example of a masked language model. + +This guide will show you how to: + +1. Finetune [DistilRoBERTa](https://huggingface.co/distilroberta-base) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset. +2. Use your finetuned model for inference. + + +You can finetune other architectures for masked language modeling following the same steps in this guide. 
+Choose one of the following architectures: + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Perceiver](../model_doc/perceiver), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Wav2Vec2](../model_doc/wav2vec2), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load ELI5 dataset + +Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. This'll +give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5", split="train_asks[:5000]") +``` + +Split the dataset's `train_asks` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> eli5 = eli5.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> eli5["train"][0] +{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], + 'score': [6, 3], + 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, + 'answers_urls': {'url': []}, + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. 
Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls': {'url': []}} +``` + +While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label. + +## Preprocess + + + +For masked language modeling, the next step is to load a DistilRoBERTa tokenizer to process the `text` subfield: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") +``` + +You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to e +xtract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method: + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'answers.a_id': ['c3d1aib', 'c3d4lya'], + 'answers.score': [6, 3], + 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], + 'answers_urls.url': [], + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls.url': []} +``` + +Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead +of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. + +Here is a first preprocessing function to join the list of strings for each example and tokenize the result: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]]) +``` + +To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method. 
You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. + +You can now use a second preprocessing function to +- concatenate all the sequences +- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM. + +```py +>>> block_size = 128 + + +>>> def group_texts(examples): +... # Concatenate all texts. +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can +... # customize this part to your needs. +... if total_length >= block_size: +... total_length = (total_length // block_size) * block_size +... # Split by chunks of block_size. +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... return result +``` + +Apply the `group_texts` function over the entire dataset: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +Now create a batch of examples using [`DataCollatorForLanguageModeling`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + + +Use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) +``` + + + +Use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") +``` + + + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilRoBERTa with [`AutoModelForMaskedLM`]: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). +2. Pass the training arguments to [`Trainer`] along with the model, datasets, and data collator. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... 
output_dir="my_awesome_eli5_mlm_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + +Once training is completed, use the [`~transformers.Trainer.evaluate`] method to evaluate your model and get its perplexity: + +```py +>>> import math + +>>> eval_results = trainer.evaluate() +>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 8.76 +``` + +Then share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load DistilRoBERTa with [`TFAutoModelForMaskedLM`]: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... lm_dataset["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... lm_dataset["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_eli5_mlm_model", +... tokenizer=tokenizer, +... ) +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for masked language modeling, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! 
+ +Come up with some text you'd like the model to fill in the blank with, and use the special `<mask>` token to indicate the blank: + +```py +>>> text = "The Milky Way is a <mask> galaxy." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for fill-mask with your model, and pass your text to it. If you like, you can use the `top_k` parameter to specify how many predictions to return: + +```py +>>> from transformers import pipeline + +>>> mask_filler = pipeline("fill-mask", "stevhliu/my_awesome_eli5_mlm_model") +>>> mask_filler(text, top_k=3) +[{'score': 0.5150994658470154, + 'token': 21300, + 'token_str': ' spiral', + 'sequence': 'The Milky Way is a spiral galaxy.'}, + {'score': 0.07087188959121704, + 'token': 2232, + 'token_str': ' massive', + 'sequence': 'The Milky Way is a massive galaxy.'}, + {'score': 0.06434620916843414, + 'token': 650, + 'token_str': ' small', + 'sequence': 'The Milky Way is a small galaxy.'}] +``` + + + +Tokenize the text and return the `input_ids` as PyTorch tensors. You'll also need to specify the position of the `<mask>` token: + +```py +>>> import torch +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> inputs = tokenizer(text, return_tensors="pt") +>>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] +``` + +Pass your inputs to the model and return the `logits` of the masked token: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> logits = model(**inputs).logits +>>> mask_token_logits = logits[0, mask_token_index, :] +``` + +Then return the three masked tokens with the highest probability and print them out: + +```py +>>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist() + +>>> for token in top_3_tokens: +... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. +The Milky Way is a massive galaxy. +The Milky Way is a small galaxy. +``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors. You'll also need to specify the position of the `<mask>` token: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> inputs = tokenizer(text, return_tensors="tf") +>>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] +``` + +Pass your inputs to the model and return the `logits` of the masked token: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model") +>>> logits = model(**inputs).logits +>>> mask_token_logits = logits[0, mask_token_index, :] +``` + +Then return the three masked tokens with the highest probability and print them out: + +```py +>>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy() + +>>> for token in top_3_tokens: +... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. +The Milky Way is a massive galaxy. +The Milky Way is a small galaxy.
+``` + + diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md new file mode 100644 index 000000000000..fa59771cbb02 --- /dev/null +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -0,0 +1,151 @@ + + +# Monocular depth estimation + +Monocular depth estimation is a computer vision task that involves predicting the depth information of a scene from a +single image. In other words, it is the process of estimating the distance of objects in a scene from +a single camera viewpoint. + +Monocular depth estimation has various applications, including 3D reconstruction, augmented reality, autonomous driving, +and robotics. It is a challenging task as it requires the model to understand the complex relationships between objects +in the scene and the corresponding depth information, which can be affected by factors such as lighting conditions, +occlusion, and texture. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[DPT](../model_doc/dpt), [GLPN](../model_doc/glpn) + + + + + +In this guide you'll learn how to: + +* create a depth estimation pipeline +* run depth estimation inference by hand + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q transformers +``` + +## Depth estimation pipeline + +The simplest way to try out inference with a model supporting depth estimation is to use the corresponding [`pipeline`]. +Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=depth-estimation&sort=downloads): + +```py +>>> from transformers import pipeline + +>>> checkpoint = "vinvino02/glpn-nyu" +>>> depth_estimator = pipeline("depth-estimation", model=checkpoint) +``` + +Next, choose an image to analyze: + +```py +>>> from PIL import Image +>>> import requests + +>>> url = "https://unsplash.com/photos/HwBAsSbPBDU/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MzR8fGNhciUyMGluJTIwdGhlJTIwc3RyZWV0fGVufDB8MHx8fDE2Nzg5MDEwODg&force=true&w=640" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> image +``` + +
+ Photo of a busy street +
+ +Pass the image to the pipeline. + +```py +>>> predictions = depth_estimator(image) +``` + +The pipeline returns a dictionary with two entries. The first one, called `predicted_depth`, is a tensor with the values +being the depth expressed in meters for each pixel. +The second one, `depth`, is a PIL image that visualizes the depth estimation result. + +Let's take a look at the visualized result: + +```py +>>> predictions["depth"] +``` + +
+ Depth estimation visualization +
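+
+If you'd rather work with the raw values than the visualization, you can inspect the `predicted_depth` tensor from the same output dictionary. The snippet below is a minimal sketch; the exact tensor shape and value range depend on the checkpoint:
+
+```py
+>>> # peek at the raw per-pixel depth values (in meters) returned by the pipeline
+>>> predicted_depth = predictions["predicted_depth"]
+>>> print(predicted_depth.shape)  # exact shape depends on the checkpoint's processing resolution
+>>> print(predicted_depth.min().item(), predicted_depth.max().item())  # rough depth range in meters
+```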
+ +## Depth estimation inference by hand + +Now that you've seen how to use the depth estimation pipeline, let's see how we can replicate the same result by hand. + +Start by loading the model and associated processor from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=depth-estimation&sort=downloads). +Here we'll use the same checkpoint as before: + +```py +>>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation + +>>> checkpoint = "vinvino02/glpn-nyu" + +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +>>> model = AutoModelForDepthEstimation.from_pretrained(checkpoint) +``` + +Prepare the image input for the model using the `image_processor` that will take care of the necessary image transformations +such as resizing and normalization: + +```py +>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values +``` + +Pass the prepared inputs through the model: + +```py +>>> import torch + +>>> with torch.no_grad(): +... outputs = model(pixel_values) +... predicted_depth = outputs.predicted_depth +``` + +Visualize the results: + +```py +>>> import numpy as np + +>>> # interpolate to original size +>>> prediction = torch.nn.functional.interpolate( +... predicted_depth.unsqueeze(1), +... size=image.size[::-1], +... mode="bicubic", +... align_corners=False, +... ).squeeze() +>>> output = prediction.numpy() + +>>> formatted = (output * 255 / np.max(output)).astype("uint8") +>>> depth = Image.fromarray(formatted) +>>> depth +``` + +
+ Depth estimation visualization +
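+
+If you want to keep the results around, you can also write them to disk. A small sketch, assuming PNG for the visualization and NumPy's `.npy` format for the raw values (the file names here are arbitrary):
+
+```py
+>>> # save the visualization and the raw depth map produced above
+>>> depth.save("depth_visualization.png")
+>>> np.save("predicted_depth.npy", output)
+```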
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md new file mode 100644 index 000000000000..938d3ba461bb --- /dev/null +++ b/docs/source/en/tasks/multiple_choice.md @@ -0,0 +1,465 @@ + + +# Multiple choice + +[[open-in-colab]] + +A multiple choice task is similar to question answering, except several candidate answers are provided along with a context and the model is trained to select the correct answer. + +This guide will show you how to: + +1. Finetune [BERT](https://huggingface.co/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load SWAG dataset + +Start by loading the `regular` configuration of the SWAG dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +Then take a look at an example: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. 
A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +While it looks like there are a lot of fields here, it is actually pretty straightforward: + +- `sent1` and `sent2`: these fields show how a sentence starts, and if you put the two together, you get the `startphrase` field. +- `ending`: suggests a possible ending for how a sentence can end, but only one of them is correct. +- `label`: identifies the correct sentence ending. + +## Preprocess + +The next step is to load a BERT tokenizer to process the sentence starts and the four possible endings: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +The preprocessing function you want to create needs to: + +1. Make four copies of the `sent1` field and combine each of them with `sent2` to recreate how a sentence starts. +2. Combine `sent2` with each of the four possible sentence endings. +3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding `input_ids`, `attention_mask`, and `labels` field. + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + +`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results: + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... 
flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... return batch +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load BERT with [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... tokenizer=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +Then you can load BERT with [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). 
+ +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + + +For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with some text and two candidate answers: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +Tokenize each prompt and candidate answer pair and return PyTorch tensors. 
You should also create some `labels`:
+
+```py
+>>> import torch
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
+>>> labels = torch.tensor(0).unsqueeze(0)
+```
+
+Pass your inputs and labels to the model and return the `logits`:
+
+```py
+>>> from transformers import AutoModelForMultipleChoice
+
+>>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
+>>> logits = outputs.logits
+```
+
+Get the class with the highest probability:
+
+```py
+>>> predicted_class = logits.argmax().item()
+>>> predicted_class
+0
+```
+
+
+Tokenize each prompt and candidate answer pair and return TensorFlow tensors:
+
+```py
+>>> import tensorflow as tf
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
+```
+
+Pass your inputs to the model and return the `logits`:
+
+```py
+>>> from transformers import TFAutoModelForMultipleChoice
+
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
+>>> outputs = model(inputs)
+>>> logits = outputs.logits
+```
+
+Get the class with the highest probability:
+
+```py
+>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
+>>> predicted_class
+0
+```
+
+
diff --git a/docs/source/en/tasks/multiple_choice.mdx b/docs/source/en/tasks/multiple_choice.mdx
deleted file mode 100644
index 1a1a517df7da..000000000000
--- a/docs/source/en/tasks/multiple_choice.mdx
+++ /dev/null
@@ -1,447 +0,0 @@
-
-
-# Multiple choice
-
-A multiple choice task is similar to question answering, except several candidate answers are provided along with a context and the model is trained to select the correct answer.
-
-This guide will show you how to:
-
-1. Finetune [BERT](https://huggingface.co/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context.
-2. Use your finetuned model for inference.
-
-Before you begin, make sure you have all the necessary libraries installed:
-
-```bash
-pip install transformers datasets evaluate
-```
-
-We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login:
-
-```py
->>> from huggingface_hub import notebook_login
-
->>> notebook_login()
-```
-
-## Load SWAG dataset
-
-Start by loading the `regular` configuration of the SWAG dataset from the 🤗 Datasets library:
-
-```py
->>> from datasets import load_dataset
-
->>> swag = load_dataset("swag", "regular")
-```
-
-Then take a look at an example:
-
-```py
->>> swag["train"][0]
-{'ending0': 'passes by walking down the street playing their instruments.',
- 'ending1': 'has heard approaching them.',
- 'ending2': "arrives and they're outside dancing and asleep.",
- 'ending3': 'turns the lead singer watches the performance.',
- 'fold-ind': '3416',
- 'gold-source': 'gold',
- 'label': 0,
- 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
- 'sent2': 'A drum line',
- 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. 
A drum line', - 'video-id': 'anetv_jkn6uvmqwh4'} -``` - -While it looks like there are a lot of fields here, it is actually pretty straightforward: - -- `sent1` and `sent2`: these fields show how a sentence starts, and if you put the two together, you get the `startphrase` field. -- `ending`: suggests a possible ending for how a sentence can end, but only one of them is correct. -- `label`: identifies the correct sentence ending. - -## Preprocess - -The next step is to load a BERT tokenizer to process the sentence starts and the four possible endings: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -The preprocessing function you want to create needs to: - -1. Make four copies of the `sent1` field and combine each of them with `sent2` to recreate how a sentence starts. -2. Combine `sent2` with each of the four possible sentence endings. -3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding `input_ids`, `attention_mask`, and `labels` field. - -```py ->>> ending_names = ["ending0", "ending1", "ending2", "ending3"] - - ->>> def preprocess_function(examples): -... first_sentences = [[context] * 4 for context in examples["sent1"]] -... question_headers = examples["sent2"] -... second_sentences = [ -... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) -... ] - -... first_sentences = sum(first_sentences, []) -... second_sentences = sum(second_sentences, []) - -... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) -... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py -tokenized_swag = swag.map(preprocess_function, batched=True) -``` - -🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - -`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results: - - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import torch - - ->>> @dataclass -... class DataCollatorForMultipleChoice: -... """ -... Data collator that will dynamically pad the inputs for multiple choice received. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... 
flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="pt", -... ) - -... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} -... batch["labels"] = torch.tensor(labels, dtype=torch.int64) -... return batch -``` - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import tensorflow as tf - - ->>> @dataclass -... class DataCollatorForMultipleChoice: -... """ -... Data collator that will dynamically pad the inputs for multiple choice received. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="tf", -... ) - -... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} -... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) -... return batch -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions, labels = eval_pred -... predictions = np.argmax(predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=labels) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load BERT with [`AutoModelForMultipleChoice`]: - -```py ->>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer - ->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_swag_model", -... evaluation_strategy="epoch", -... save_strategy="epoch", -... load_best_model_at_end=True, -... learning_rate=5e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_swag["train"], -... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, -... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_train_epochs = 2 ->>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs ->>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) -``` - -Then you can load BERT with [`TFAutoModelForMultipleChoice`]: - -```py ->>> from transformers import TFAutoModelForMultipleChoice - ->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_swag["train"], -... shuffle=True, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_swag["validation"], -... shuffle=False, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_model", -... tokenizer=tokenizer, -... 
) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - - -For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text and two candidate answers: - -```py ->>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." ->>> candidate1 = "The law does not apply to croissants and brioche." ->>> candidate2 = "The law applies to baguettes." -``` - - - -Tokenize each prompt and candidate answer pair and return PyTorch tensors. You should also create some `labels`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") ->>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) ->>> labels = torch.tensor(0).unsqueeze(0) -``` - -Pass your inputs and labels to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForMultipleChoice - ->>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") ->>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) ->>> logits = outputs.logits -``` - -Get the class with the highest probability: - -```py ->>> predicted_class = logits.argmax().item() ->>> predicted_class -'0' -``` - - -Tokenize each prompt and candidate answer pair and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") ->>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForMultipleChoice - ->>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") ->>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} ->>> outputs = model(inputs) ->>> logits = outputs.logits -``` - -Get the class with the highest probability: - -```py ->>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) ->>> predicted_class -'0' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md new file mode 100644 index 000000000000..7511ee66dd0b --- /dev/null +++ b/docs/source/en/tasks/object_detection.md @@ -0,0 +1,593 @@ + + +# Object detection + +[[open-in-colab]] + +Object detection is the computer vision task of detecting instances (such as humans, buildings, or cars) in an image. 
Object detection models receive an image as input and output +coordinates of the bounding boxes and associated labels of the detected objects. An image can contain multiple objects, +each with its own bounding box and a label (e.g. it can have a car and a building), and each object can +be present in different parts of an image (e.g. the image can have several cars). +This task is commonly used in autonomous driving for detecting things like pedestrians, road signs, and traffic lights. +Other applications include counting objects in images, image search, and more. + +In this guide, you will learn how to: + + 1. Finetune [DETR](https://huggingface.co/docs/transformers/model_doc/detr), a model that combines a convolutional + backbone with an encoder-decoder Transformer, on the [CPPE-5](https://huggingface.co/datasets/cppe-5) + dataset. + 2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q datasets transformers evaluate timm albumentations +``` + +You'll use 🤗 Datasets to load a dataset from the Hugging Face Hub, 🤗 Transformers to train your model, +and `albumentations` to augment the data. `timm` is currently required to load a convolutional backbone for the DETR model. + +We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the Hub. +When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load the CPPE-5 dataset + +The [CPPE-5 dataset](https://huggingface.co/datasets/cppe-5) contains images with +annotations identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic. + +Start by loading the dataset: + +```py +>>> from datasets import load_dataset + +>>> cppe5 = load_dataset("cppe-5") +>>> cppe5 +DatasetDict({ + train: Dataset({ + features: ['image_id', 'image', 'width', 'height', 'objects'], + num_rows: 1000 + }) + test: Dataset({ + features: ['image_id', 'image', 'width', 'height', 'objects'], + num_rows: 29 + }) +}) +``` + +You'll see that this dataset already comes with a training set containing 1000 images and a test set with 29 images. + +To get familiar with the data, explore what the examples look like. 
+ +```py +>>> cppe5["train"][0] +{'image_id': 15, + 'image': , + 'width': 943, + 'height': 663, + 'objects': {'id': [114, 115, 116, 117], + 'area': [3796, 1596, 152768, 81002], + 'bbox': [[302.0, 109.0, 73.0, 52.0], + [810.0, 100.0, 57.0, 28.0], + [160.0, 31.0, 248.0, 616.0], + [741.0, 68.0, 202.0, 401.0]], + 'category': [4, 4, 0, 0]}} +``` + +The examples in the dataset have the following fields: +- `image_id`: the example image id +- `image`: a `PIL.Image.Image` object containing the image +- `width`: width of the image +- `height`: height of the image +- `objects`: a dictionary containing bounding box metadata for the objects in the image: + - `id`: the annotation id + - `area`: the area of the bounding box + - `bbox`: the object's bounding box (in the [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco) ) + - `category`: the object's category, with possible values including `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` and `Mask (4)` + +You may notice that the `bbox` field follows the COCO format, which is the format that the DETR model expects. +However, the grouping of the fields inside `objects` differs from the annotation format DETR requires. You will +need to apply some preprocessing transformations before using this data for training. + +To get an even better understanding of the data, visualize an example in the dataset. + +```py +>>> import numpy as np +>>> import os +>>> from PIL import Image, ImageDraw + +>>> image = cppe5["train"][0]["image"] +>>> annotations = cppe5["train"][0]["objects"] +>>> draw = ImageDraw.Draw(image) + +>>> categories = cppe5["train"].features["objects"].feature["category"].names + +>>> id2label = {index: x for index, x in enumerate(categories, start=0)} +>>> label2id = {v: k for k, v in id2label.items()} + +>>> for i in range(len(annotations["id"])): +... box = annotations["bbox"][i] +... class_idx = annotations["category"][i] +... x, y, w, h = tuple(box) +... draw.rectangle((x, y, x + w, y + h), outline="red", width=1) +... draw.text((x, y), id2label[class_idx], fill="white") + +>>> image +``` + +
+ CPPE-5 Image Example +
+ +To visualize the bounding boxes with associated labels, you can get the labels from the dataset's metadata, specifically +the `category` field. +You'll also want to create dictionaries that map a label id to a label class (`id2label`) and the other way around (`label2id`). +You can use them later when setting up the model. Including these maps will make your model reusable by others if you share +it on the Hugging Face Hub. + +As a final step of getting familiar with the data, explore it for potential issues. One common problem with datasets for +object detection is bounding boxes that "stretch" beyond the edge of the image. Such "runaway" bounding boxes can raise +errors during training and should be addressed at this stage. There are a few examples with this issue in this dataset. +To keep things simple in this guide, we remove these images from the data. + +```py +>>> remove_idx = [590, 821, 822, 875, 876, 878, 879] +>>> keep = [i for i in range(len(cppe5["train"])) if i not in remove_idx] +>>> cppe5["train"] = cppe5["train"].select(keep) +``` + +## Preprocess the data + +To finetune a model, you must preprocess the data you plan to use to match precisely the approach used for the pre-trained model. +[`AutoImageProcessor`] takes care of processing image data to create `pixel_values`, `pixel_mask`, and +`labels` that a DETR model can train with. The image processor has some attributes that you won't have to worry about: + +- `image_mean = [0.485, 0.456, 0.406 ]` +- `image_std = [0.229, 0.224, 0.225]` + +These are the mean and standard deviation used to normalize images during the model pre-training. These values are crucial +to replicate when doing inference or finetuning a pre-trained image model. + +Instantiate the image processor from the same checkpoint as the model you want to finetune. + +```py +>>> from transformers import AutoImageProcessor + +>>> checkpoint = "facebook/detr-resnet-50" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +``` + +Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset: +- Augmenting images +- Reformatting annotations to meet DETR expectations + +First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ... +This library ensures that transformations affect the image and update the bounding boxes accordingly. +The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection), +and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480), +flip it horizontally, and brighten it: + +```py +>>> import albumentations +>>> import numpy as np +>>> import torch + +>>> transform = albumentations.Compose( +... [ +... albumentations.Resize(480, 480), +... albumentations.HorizontalFlip(p=1.0), +... albumentations.RandomBrightnessContrast(p=1.0), +... ], +... bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]), +... ) +``` + +The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`, + where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example: + +```py +>>> def formatted_anns(image_id, category, area, bbox): +... annotations = [] +... 
for i in range(0, len(category)): +... new_ann = { +... "image_id": image_id, +... "category_id": category[i], +... "isCrowd": 0, +... "area": area[i], +... "bbox": list(bbox[i]), +... } +... annotations.append(new_ann) + +... return annotations +``` + +Now you can combine the image and annotation transformations to use on a batch of examples: + +```py +>>> # transforming a batch +>>> def transform_aug_ann(examples): +... image_ids = examples["image_id"] +... images, bboxes, area, categories = [], [], [], [] +... for image, objects in zip(examples["image"], examples["objects"]): +... image = np.array(image.convert("RGB"))[:, :, ::-1] +... out = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + +... area.append(objects["area"]) +... images.append(out["image"]) +... bboxes.append(out["bboxes"]) +... categories.append(out["category"]) + +... targets = [ +... {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)} +... for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes) +... ] + +... return image_processor(images=images, annotations=targets, return_tensors="pt") +``` + +Apply this preprocessing function to the entire dataset using 🤗 Datasets [`~datasets.Dataset.with_transform`] method. This method applies +transformations on the fly when you load an element of the dataset. + +At this point, you can check what an example from the dataset looks like after the transformations. You should see a tensor +with `pixel_values`, a tensor with `pixel_mask`, and `labels`. + +```py +>>> cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann) +>>> cppe5["train"][15] +{'pixel_values': tensor([[[ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], + [ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], + [ 0.9132, 0.9132, 0.9132, ..., -1.9638, -1.9638, -1.9638], + ..., + [-1.5699, -1.5699, -1.5699, ..., -1.9980, -1.9980, -1.9980], + [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809], + [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809]], + + [[ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], + [ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], + [ 1.3081, 1.3081, 1.3081, ..., -1.8256, -1.8256, -1.8256], + ..., + [-1.3179, -1.3179, -1.3179, ..., -1.8606, -1.8606, -1.8606], + [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431], + [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431]], + + [[ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], + [ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], + [ 1.4200, 1.4200, 1.4200, ..., -1.6302, -1.6302, -1.6302], + ..., + [-1.0201, -1.0201, -1.0201, ..., -1.5604, -1.5604, -1.5604], + [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430], + [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430]]]), + 'pixel_mask': tensor([[1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1], + ..., + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1]]), + 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}} +``` + +You have successfully augmented the individual images and prepared their annotations. However, preprocessing isn't +complete yet. In the final step, create a custom `collate_fn` to batch images together. 
+Pad images (which are now `pixel_values`) to the largest image in a batch, and create a corresponding `pixel_mask` +to indicate which pixels are real (1) and which are padding (0). + +```py +>>> def collate_fn(batch): +... pixel_values = [item["pixel_values"] for item in batch] +... encoding = image_processor.pad(pixel_values, return_tensors="pt") +... labels = [item["labels"] for item in batch] +... batch = {} +... batch["pixel_values"] = encoding["pixel_values"] +... batch["pixel_mask"] = encoding["pixel_mask"] +... batch["labels"] = labels +... return batch +``` + +## Training the DETR model +You have done most of the heavy lifting in the previous sections, so now you are ready to train your model! +The images in this dataset are still quite large, even after resizing. This means that finetuning this model will +require at least one GPU. + +Training involves the following steps: +1. Load the model with [`AutoModelForObjectDetection`] using the same checkpoint as in the preprocessing. +2. Define your training hyperparameters in [`TrainingArguments`]. +3. Pass the training arguments to [`Trainer`] along with the model, dataset, image processor, and data collator. +4. Call [`~Trainer.train`] to finetune your model. + +When loading the model from the same checkpoint that you used for the preprocessing, remember to pass the `label2id` +and `id2label` maps that you created earlier from the dataset's metadata. Additionally, we specify `ignore_mismatched_sizes=True` to replace the existing classification head with a new one. + +```py +>>> from transformers import AutoModelForObjectDetection + +>>> model = AutoModelForObjectDetection.from_pretrained( +... checkpoint, +... id2label=id2label, +... label2id=label2id, +... ignore_mismatched_sizes=True, +... ) +``` + +In the [`TrainingArguments`] use `output_dir` to specify where to save your model, then configure hyperparameters as you see fit. +It is important you do not remove unused columns because this will drop the image column. Without the image column, you +can't create `pixel_values`. For this reason, set `remove_unused_columns` to `False`. +If you wish to share your model by pushing to the Hub, set `push_to_hub` to `True` (you must be signed in to Hugging +Face to upload your model). + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments( +... output_dir="detr-resnet-50_finetuned_cppe5", +... per_device_train_batch_size=8, +... num_train_epochs=10, +... fp16=True, +... save_steps=200, +... logging_steps=50, +... learning_rate=1e-5, +... weight_decay=1e-4, +... save_total_limit=2, +... remove_unused_columns=False, +... push_to_hub=True, +... ) +``` + +Finally, bring everything together, and call [`~transformers.Trainer.train`]: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=collate_fn, +... train_dataset=cppe5["train"], +... tokenizer=image_processor, +... ) + +>>> trainer.train() +``` + +If you have set `push_to_hub` to `True` in the `training_args`, the training checkpoints are pushed to the +Hugging Face Hub. Upon training completion, push the final model to the Hub as well by calling the [`~transformers.Trainer.push_to_hub`] method. + +```py +>>> trainer.push_to_hub() +``` + +## Evaluate +Object detection models are commonly evaluated with a set of COCO-style metrics. 
+You can use one of the existing metrics implementations, but here you'll use the one from `torchvision` to evaluate the final +model that you pushed to the Hub. + +To use the `torchvision` evaluator, you'll need to prepare a ground truth COCO dataset. The API to build a COCO dataset +requires the data to be stored in a certain format, so you'll need to save images and annotations to disk first. Just like +when you prepared your data for training, the annotations from the `cppe5["test"]` need to be formatted. However, images +should stay as they are. + +The evaluation step requires a bit of work, but it can be split in three major steps. +First, prepare the `cppe5["test"]` set: format the annotations and save the data to disk. + +```py +>>> import json + + +>>> # format annotations the same as for training, no need for data augmentation +>>> def val_formatted_anns(image_id, objects): +... annotations = [] +... for i in range(0, len(objects["id"])): +... new_ann = { +... "id": objects["id"][i], +... "category_id": objects["category"][i], +... "iscrowd": 0, +... "image_id": image_id, +... "area": objects["area"][i], +... "bbox": objects["bbox"][i], +... } +... annotations.append(new_ann) + +... return annotations + + +>>> # Save images and annotations into the files torchvision.datasets.CocoDetection expects +>>> def save_cppe5_annotation_file_images(cppe5): +... output_json = {} +... path_output_cppe5 = f"{os.getcwd()}/cppe5/" + +... if not os.path.exists(path_output_cppe5): +... os.makedirs(path_output_cppe5) + +... path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json") +... categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label] +... output_json["images"] = [] +... output_json["annotations"] = [] +... for example in cppe5: +... ann = val_formatted_anns(example["image_id"], example["objects"]) +... output_json["images"].append( +... { +... "id": example["image_id"], +... "width": example["image"].width, +... "height": example["image"].height, +... "file_name": f"{example['image_id']}.png", +... } +... ) +... output_json["annotations"].extend(ann) +... output_json["categories"] = categories_json + +... with open(path_anno, "w") as file: +... json.dump(output_json, file, ensure_ascii=False, indent=4) + +... for im, img_id in zip(cppe5["image"], cppe5["image_id"]): +... path_img = os.path.join(path_output_cppe5, f"{img_id}.png") +... im.save(path_img) + +... return path_output_cppe5, path_anno +``` + +Next, prepare an instance of a `CocoDetection` class that can be used with `cocoevaluator`. + +```py +>>> import torchvision + + +>>> class CocoDetection(torchvision.datasets.CocoDetection): +... def __init__(self, img_folder, image_processor, ann_file): +... super().__init__(img_folder, ann_file) +... self.image_processor = image_processor + +... def __getitem__(self, idx): +... # read in PIL image and target in COCO format +... img, target = super(CocoDetection, self).__getitem__(idx) + +... # preprocess image and target: converting target to DETR format, +... # resizing + normalization of both image and target) +... image_id = self.ids[idx] +... target = {"image_id": image_id, "annotations": target} +... encoding = self.image_processor(images=img, annotations=target, return_tensors="pt") +... pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension +... target = encoding["labels"][0] # remove batch dimension + +... 
return {"pixel_values": pixel_values, "labels": target} + + +>>> im_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") + +>>> path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"]) +>>> test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno) +``` + +Finally, load the metrics and run the evaluation. + +```py +>>> import evaluate +>>> from tqdm import tqdm + +>>> model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +>>> module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco) +>>> val_dataloader = torch.utils.data.DataLoader( +... test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn +... ) + +>>> with torch.no_grad(): +... for idx, batch in enumerate(tqdm(val_dataloader)): +... pixel_values = batch["pixel_values"] +... pixel_mask = batch["pixel_mask"] + +... labels = [ +... {k: v for k, v in t.items()} for t in batch["labels"] +... ] # these are in DETR format, resized + normalized + +... # forward pass +... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + +... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) +... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api + +... module.add(prediction=results, reference=labels) +... del batch + +>>> results = module.compute() +>>> print(results) +Accumulating evaluation results... +DONE (t=0.08s). +IoU metric: bbox + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.681 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.292 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.168 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.208 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.429 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.274 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.484 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.501 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.323 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.590 +``` +These results can be further improved by adjusting the hyperparameters in [`~transformers.TrainingArguments`]. Give it a go! + +## Inference +Now that you have finetuned a DETR model, evaluated it, and uploaded it to the Hugging Face Hub, you can use it for inference. +The simplest way to try out your finetuned model for inference is to use it in a [`Pipeline`]. 
Instantiate a pipeline +for object detection with your model, and pass an image to it: + +```py +>>> from transformers import pipeline +>>> import requests + +>>> url = "https://i.imgur.com/2lnWoly.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> obj_detector = pipeline("object-detection", model="devonho/detr-resnet-50_finetuned_cppe5") +>>> obj_detector(image) +``` + +You can also manually replicate the results of the pipeline if you'd like: + +```py +>>> image_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +>>> model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") + +>>> with torch.no_grad(): +... inputs = image_processor(images=image, return_tensors="pt") +... outputs = model(**inputs) +... target_sizes = torch.tensor([image.size[::-1]]) +... results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] + +>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): +... box = [round(i, 2) for i in box.tolist()] +... print( +... f"Detected {model.config.id2label[label.item()]} with confidence " +... f"{round(score.item(), 3)} at location {box}" +... ) +Detected Coverall with confidence 0.566 at location [1215.32, 147.38, 4401.81, 3227.08] +Detected Mask with confidence 0.584 at location [2449.06, 823.19, 3256.43, 1413.9] +``` + +Let's plot the result: +```py +>>> draw = ImageDraw.Draw(image) + +>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): +... box = [round(i, 2) for i in box.tolist()] +... x, y, x2, y2 = tuple(box) +... draw.rectangle((x, y, x2, y2), outline="red", width=1) +... draw.text((x, y), model.config.id2label[label.item()], fill="white") + +>>> image +``` + +
+ [Image: Object detection result on a new image]
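The `threshold` argument of `post_process_object_detection` controls how confident a detection has to be to be kept. If you want to inspect more (but less certain) candidate boxes, you can rerun the post-processing with a lower value. The snippet below is only an optional sketch that reuses `outputs`, `image_processor`, `results`, and `target_sizes` from the cells above; the variable names are illustrative:

```py
>>> results_low = image_processor.post_process_object_detection(
...     outputs, threshold=0.1, target_sizes=target_sizes
... )[0]

>>> # a lower threshold keeps more candidate detections than the 0.5 used above
>>> num_low, num_default = len(results_low["scores"]), len(results["scores"])
```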
diff --git a/docs/source/en/tasks/object_detection.mdx b/docs/source/en/tasks/object_detection.mdx deleted file mode 100644 index a2b8a12fb60d..000000000000 --- a/docs/source/en/tasks/object_detection.mdx +++ /dev/null @@ -1,584 +0,0 @@ - - -# Object detection - -[[open-in-colab]] - -Object detection is the computer vision task of detecting instances (such as humans, buildings, or cars) in an image. Object detection models receive an image as input and output -coordinates of the bounding boxes and associated labels of the detected objects. An image can contain multiple objects, -each with its own bounding box and a label (e.g. it can have a car and a building), and each object can -be present in different parts of an image (e.g. the image can have several cars). -This task is commonly used in autonomous driving for detecting things like pedestrians, road signs, and traffic lights. -Other applications include counting objects in images, image search, and more. - - -Check out the object detection task page to learn about use cases, -models, metrics, and datasets associated with this task. - - -In this guide, you will learn how to: - - 1. Finetune [DETR](https://huggingface.co/docs/transformers/model_doc/detr), a model that combines a convolutional - backbone with an encoder-decoder Transformer, on the [CPPE-5](https://huggingface.co/datasets/cppe-5) - dataset. - 2. Use your finetuned model for inference. - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install -q datasets transformers evaluate timm albumentations -``` - -You'll use 🤗 Datasets to load a dataset from the Hugging Face Hub, 🤗 Transformers to train your model, -and `albumentations` to augment the data. `timm` is currently required to load a convolutional backbone for the DETR model. - -We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the Hub. -When prompted, enter your token to log in: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load the CPPE-5 dataset - -The [CPPE-5 dataset](https://huggingface.co/datasets/cppe-5) contains images with -annotations identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic. - -Start by loading the dataset: - -```py ->>> from datasets import load_dataset - ->>> cppe5 = load_dataset("cppe-5") ->>> cppe5 -DatasetDict({ - train: Dataset({ - features: ['image_id', 'image', 'width', 'height', 'objects'], - num_rows: 1000 - }) - test: Dataset({ - features: ['image_id', 'image', 'width', 'height', 'objects'], - num_rows: 29 - }) -}) -``` - -You'll see that this dataset already comes with a training set containing 1000 images and a test set with 29 images. - -To get familiar with the data, explore what the examples look like. 
- -```py ->>> cppe5["train"][0] -{'image_id': 15, - 'image': , - 'width': 943, - 'height': 663, - 'objects': {'id': [114, 115, 116, 117], - 'area': [3796, 1596, 152768, 81002], - 'bbox': [[302.0, 109.0, 73.0, 52.0], - [810.0, 100.0, 57.0, 28.0], - [160.0, 31.0, 248.0, 616.0], - [741.0, 68.0, 202.0, 401.0]], - 'category': [4, 4, 0, 0]}} -``` - -The examples in the dataset have the following fields: -- `image_id`: the example image id -- `image`: a `PIL.Image.Image` object containing the image -- `width`: width of the image -- `height`: height of the image -- `objects`: a dictionary containing bounding box metadata for the objects in the image: - - `id`: the annotation id - - `area`: the area of the bounding box - - `bbox`: the object's bounding box (in the [COCO format](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco) ) - - `category`: the object's category, with possible values including `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` and `Mask (4)` - -You may notice that the `bbox` field follows the COCO format, which is the format that the DETR model expects. -However, the grouping of the fields inside `objects` differs from the annotation format DETR requires. You will -need to apply some preprocessing transformations before using this data for training. - -To get an even better understanding of the data, visualize an example in the dataset. - -```py ->>> import numpy as np ->>> import os ->>> from PIL import Image, ImageDraw - ->>> image = cppe5["train"][0]["image"] ->>> annotations = cppe5["train"][0]["objects"] ->>> draw = ImageDraw.Draw(image) - ->>> categories = cppe5["train"].features["objects"].feature["category"].names - ->>> id2label = {index: x for index, x in enumerate(categories, start=0)} ->>> label2id = {v: k for k, v in id2label.items()} - ->>> for i in range(len(annotations["id"])): -... box = annotations["bbox"][i - 1] -... class_idx = annotations["category"][i - 1] -... x, y, w, h = tuple(box) -... draw.rectangle((x, y, x + w, y + h), outline="red", width=1) -... draw.text((x, y), id2label[class_idx], fill="white") - ->>> image -``` - -
- [Image: CPPE-5 Image Example]
- -To visualize the bounding boxes with associated labels, you can get the labels from the dataset's metadata, specifically -the `category` field. -You'll also want to create dictionaries that map a label id to a label class (`id2label`) and the other way around (`label2id`). -You can use them later when setting up the model. Including these maps will make your model reusable by others if you share -it on the Hugging Face Hub. - -As a final step of getting familiar with the data, explore it for potential issues. One common problem with datasets for -object detection is bounding boxes that "stretch" beyond the edge of the image. Such "runaway" bounding boxes can raise -errors during training and should be addressed at this stage. There are a few examples with this issue in this dataset. -To keep things simple in this guide, we remove these images from the data. - -```py ->>> remove_idx = [590, 821, 822, 875, 876, 878, 879] ->>> keep = [i for i in range(len(cppe5["train"])) if i not in remove_idx] ->>> cppe5["train"] = cppe5["train"].select(keep) -``` - -## Preprocess the data - -To finetune a model, you must preprocess the data you plan to use to match precisely the approach used for the pre-trained model. -[`AutoImageProcessor`] takes care of processing image data to create `pixel_values`, `pixel_mask`, and -`labels` that a DETR model can train with. The image processor has some attributes that you won't have to worry about: - -- `image_mean = [0.485, 0.456, 0.406 ]` -- `image_std = [0.229, 0.224, 0.225]` - -These are the mean and standard deviation used to normalize images during the model pre-training. These values are crucial -to replicate when doing inference or finetuning a pre-trained image model. - -Instantiate the image processor from the same checkpoint as the model you want to finetune. - -```py ->>> from transformers import AutoImageProcessor - ->>> checkpoint = "facebook/detr-resnet-50" ->>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) -``` - -Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset: -- Augmenting images -- Reformatting annotations to meet DETR expectations - -First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ... -This library ensures that transformations affect the image and update the bounding boxes accordingly. -The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection), -and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480), -flip it horizontally, and brighten it: - -```py ->>> import albumentations ->>> import numpy as np ->>> import torch - ->>> transform = albumentations.Compose( -... [ -... albumentations.Resize(480, 480), -... albumentations.HorizontalFlip(p=1.0), -... albumentations.RandomBrightnessContrast(p=1.0), -... ], -... bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]), -... ) -``` - -The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`, - where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example: - -```py ->>> def formatted_anns(image_id, category, area, bbox): - -... annotations = [] -... 
for i in range(0, len(category)): -... new_ann = { -... "image_id": image_id, -... "category_id": category[i], -... "isCrowd": 0, -... "area": area[i], -... "bbox": list(bbox[i]), -... } -... annotations.append(new_ann) - -... return annotations -``` - -Now you can combine the image and annotation transformations to use on a batch of examples: - -```py ->>> # transforming a batch ->>> def transform_aug_ann(examples): -... image_ids = examples["image_id"] -... images, bboxes, area, categories = [], [], [], [] -... for image, objects in zip(examples["image"], examples["objects"]): -... image = np.array(image.convert("RGB"))[:, :, ::-1] -... out = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) - -... area.append(objects["area"]) -... images.append(out["image"]) -... bboxes.append(out["bboxes"]) -... categories.append(out["category"]) - -... targets = [ -... {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)} -... for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes) -... ] - -... return image_processor(images=images, annotations=targets, return_tensors="pt") -``` - -Apply this preprocessing function to the entire dataset using 🤗 Datasets [`~datasets.Dataset.with_transform`] method. This method applies -transformations on the fly when you load an element of the dataset. - -At this point, you can check what an example from the dataset looks like after the transformations. You should see a tensor -with `pixel_values`, a tensor with `pixel_mask`, and `labels`. - -```py ->>> cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann) ->>> cppe5["train"][15] -{'pixel_values': tensor([[[ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], - [ 0.9132, 0.9132, 0.9132, ..., -1.9809, -1.9809, -1.9809], - [ 0.9132, 0.9132, 0.9132, ..., -1.9638, -1.9638, -1.9638], - ..., - [-1.5699, -1.5699, -1.5699, ..., -1.9980, -1.9980, -1.9980], - [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809], - [-1.5528, -1.5528, -1.5528, ..., -1.9980, -1.9809, -1.9809]], - - [[ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], - [ 1.3081, 1.3081, 1.3081, ..., -1.8431, -1.8431, -1.8431], - [ 1.3081, 1.3081, 1.3081, ..., -1.8256, -1.8256, -1.8256], - ..., - [-1.3179, -1.3179, -1.3179, ..., -1.8606, -1.8606, -1.8606], - [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431], - [-1.3004, -1.3004, -1.3004, ..., -1.8606, -1.8431, -1.8431]], - - [[ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], - [ 1.4200, 1.4200, 1.4200, ..., -1.6476, -1.6476, -1.6476], - [ 1.4200, 1.4200, 1.4200, ..., -1.6302, -1.6302, -1.6302], - ..., - [-1.0201, -1.0201, -1.0201, ..., -1.5604, -1.5604, -1.5604], - [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430], - [-1.0027, -1.0027, -1.0027, ..., -1.5604, -1.5430, -1.5430]]]), - 'pixel_mask': tensor([[1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1], - ..., - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1]]), - 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}} -``` - -You have successfully augmented the individual images and prepared their annotations. However, preprocessing isn't -complete yet. In the final step, create a custom `collate_fn` to batch images together. 
-Pad images (which are now `pixel_values`) to the largest image in a batch, and create a corresponding `pixel_mask` -to indicate which pixels are real (1) and which are padding (0). - -```py ->>> def collate_fn(batch): -... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") -... labels = [item["labels"] for item in batch] -... batch = {} -... batch["pixel_values"] = encoding["pixel_values"] -... batch["pixel_mask"] = encoding["pixel_mask"] -... batch["labels"] = labels -... return batch -``` - -## Training the DETR model -You have done most of the heavy lifting in the previous sections, so now you are ready to train your model! -The images in this dataset are still quite large, even after resizing. This means that finetuning this model will -require at least one GPU. - -Training involves the following steps: -1. Load the model with [`AutoModelForObjectDetection`] using the same checkpoint as in the preprocessing. -2. Define your training hyperparameters in [`TrainingArguments`]. -3. Pass the training arguments to [`Trainer`] along with the model, dataset, image processor, and data collator. -4. Call [`~Trainer.train`] to finetune your model. - -When loading the model from the same checkpoint that you used for the preprocessing, remember to pass the `label2id` -and `id2label` maps that you created earlier from the dataset's metadata. Additionally, we specify `ignore_mismatched_sizes=True` to replace the existing classification head with a new one. - -```py ->>> from transformers import AutoModelForObjectDetection - ->>> model = AutoModelForObjectDetection.from_pretrained( -... checkpoint, -... id2label=id2label, -... label2id=label2id, -... ignore_mismatched_sizes=True, -... ) -``` - -In the [`TrainingArguments`] use `output_dir` to specify where to save your model, then configure hyperparameters as you see fit. -It is important you do not remove unused columns because this will drop the image column. Without the image column, you -can't create `pixel_values`. For this reason, set `remove_unused_columns` to `False`. -If you wish to share your model by pushing to the Hub, set `push_to_hub` to `True` (you must be signed in to Hugging -Face to upload your model). - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments( -... output_dir="detr-resnet-50_finetuned_cppe5", -... per_device_train_batch_size=8, -... num_train_epochs=10, -... fp16=True, -... save_steps=200, -... logging_steps=50, -... learning_rate=1e-5, -... weight_decay=1e-4, -... save_total_limit=2, -... remove_unused_columns=False, -... push_to_hub=True, -... ) -``` - -Finally, bring everything together, and call [`~transformers.Trainer.train`]: - -```py ->>> from transformers import Trainer - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... data_collator=collate_fn, -... train_dataset=cppe5["train"], -... tokenizer=image_processor, -... ) - ->>> trainer.train() -``` - -If you have set `push_to_hub` to `True` in the `training_args`, the training checkpoints are pushed to the -Hugging Face Hub. Upon training completion, push the final model to the Hub as well by calling the [`~transformers.Trainer.push_to_hub`] method. - -```py ->>> trainer.push_to_hub() -``` - -## Evaluate -Object detection models are commonly evaluated with a set of COCO-style metrics. 
-You can use one of the existing metrics implementations, but here you'll use the one from `torchvision` to evaluate the final -model that you pushed to the Hub. - -To use the `torchvision` evaluator, you'll need to prepare a ground truth COCO dataset. The API to build a COCO dataset -requires the data to be stored in a certain format, so you'll need to save images and annotations to disk first. Just like -when you prepared your data for training, the annotations from the `cppe5["test"]` need to be formatted. However, images -should stay as they are. - -The evaluation step requires a bit of work, but it can be split in three major steps. -First, prepare the `cppe5["test"]` set: format the annotations and save the data to disk. - -```py ->>> import json - ->>> # format annotations the same as for training, no need for data augmentation ->>> def val_formatted_anns(image_id, objects): -... annotations = [] -... for i in range(0, len(objects["id"])): -... new_ann = { -... "id": objects["id"][i], -... "category_id": objects["category"][i], -... "iscrowd": 0, -... "image_id": image_id, -... "area": objects["area"][i], -... "bbox": objects["bbox"][i], -... } -... annotations.append(new_ann) - -... return annotations - - ->>> # Save images and annotations into the files torchvision.datasets.CocoDetection expects ->>> def save_cppe5_annotation_file_images(cppe5): -... output_json = {} -... path_output_cppe5 = f"{os.getcwd()}/cppe5/" - -... if not os.path.exists(path_output_cppe5): -... os.makedirs(path_output_cppe5) - -... path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json") -... categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label] -... output_json["images"] = [] -... output_json["annotations"] = [] -... for example in cppe5: -... ann = val_formatted_anns(example["image_id"], example["objects"]) -... output_json["images"].append( -... { -... "id": example["image_id"], -... "width": example["image"].width, -... "height": example["image"].height, -... "file_name": f"{example['image_id']}.png", -... } -... ) -... output_json["annotations"].extend(ann) -... output_json["categories"] = categories_json - -... with open(path_anno, "w") as file: -... json.dump(output_json, file, ensure_ascii=False, indent=4) - -... for im, img_id in zip(cppe5["image"], cppe5["image_id"]): -... path_img = os.path.join(path_output_cppe5, f"{img_id}.png") -... im.save(path_img) - -... return path_output_cppe5, path_anno -``` - -Next, prepare an instance of a `CocoDetection` class that can be used with `cocoevaluator`. - -```py ->>> import torchvision - - ->>> class CocoDetection(torchvision.datasets.CocoDetection): -... def __init__(self, img_folder, feature_extractor, ann_file): -... super().__init__(img_folder, ann_file) -... self.feature_extractor = feature_extractor - -... def __getitem__(self, idx): -... # read in PIL image and target in COCO format -... img, target = super(CocoDetection, self).__getitem__(idx) - -... # preprocess image and target: converting target to DETR format, -... # resizing + normalization of both image and target) -... image_id = self.ids[idx] -... target = {"image_id": image_id, "annotations": target} -... encoding = self.feature_extractor(images=img, annotations=target, return_tensors="pt") -... pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension -... target = encoding["labels"][0] # remove batch dimension - -... 
return {"pixel_values": pixel_values, "labels": target} - - ->>> im_processor = AutoImageProcessor.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") - ->>> path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"]) ->>> test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno) -``` - -Finally, load the metrics and run the evaluation. - -```py ->>> import evaluate ->>> from tqdm import tqdm - ->>> model = AutoModelForObjectDetection.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") ->>> module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco) ->>> val_dataloader = torch.utils.data.DataLoader( -... test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn -... ) - ->>> with torch.no_grad(): -... for idx, batch in enumerate(tqdm(val_dataloader)): -... pixel_values = batch["pixel_values"] -... pixel_mask = batch["pixel_mask"] - -... labels = [ -... {k: v for k, v in t.items()} for t in batch["labels"] -... ] # these are in DETR format, resized + normalized - -... # forward pass -... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - -... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) -... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api - -... module.add(prediction=results, reference=labels) -... del batch - ->>> results = module.compute() ->>> print(results) -Accumulating evaluation results... -DONE (t=0.08s). -IoU metric: bbox - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.150 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.280 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.130 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.036 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.182 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.166 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.317 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.335 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.104 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.146 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.382 -``` -These results can be further improved by adjusting the hyperparameters in [`~transformers.TrainingArguments`]. Give it a go! - -## Inference -Now that you have finetuned a DETR model, evaluated it, and uploaded it to the Hugging Face Hub, you can use it for inference. -The simplest way to try out your finetuned model for inference is to use it in a [`Pipeline`]. 
Instantiate a pipeline -for object detection with your model, and pass an image to it: - -```py ->>> from transformers import pipeline ->>> import requests - ->>> url = "https://i.imgur.com/2lnWoly.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) - ->>> obj_detector = pipeline("object-detection", model="MariaK/detr-resnet-50_finetuned_cppe5") ->>> obj_detector(image) -``` - -You can also manually replicate the results of the pipeline if you'd like: - -```py ->>> image_processor = AutoImageProcessor.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") ->>> model = AutoModelForObjectDetection.from_pretrained("MariaK/detr-resnet-50_finetuned_cppe5") - ->>> with torch.no_grad(): -... inputs = image_processor(images=image, return_tensors="pt") -... outputs = model(**inputs) -... target_sizes = torch.tensor([image.size[::-1]]) -... results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] - ->>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): -... box = [round(i, 2) for i in box.tolist()] -... print( -... f"Detected {model.config.id2label[label.item()]} with confidence " -... f"{round(score.item(), 3)} at location {box}" -... ) -Detected Coverall with confidence 0.566 at location [1215.32, 147.38, 4401.81, 3227.08] -Detected Mask with confidence 0.584 at location [2449.06, 823.19, 3256.43, 1413.9] -``` - -Let's plot the result: -```py ->>> draw = ImageDraw.Draw(image) - ->>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): -... box = [round(i, 2) for i in box.tolist()] -... x, y, x2, y2 = tuple(box) -... draw.rectangle((x, y, x2, y2), outline="red", width=1) -... draw.text((x, y), model.config.id2label[label.item()], fill="white") - ->>> image -``` - -
- [Image: Object detection result on a new image]
- diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md new file mode 100644 index 000000000000..0db26ab8cbb7 --- /dev/null +++ b/docs/source/en/tasks/question_answering.md @@ -0,0 +1,434 @@ + + +# Question answering + +[[open-in-colab]] + + + +Question answering tasks return an answer given a question. If you've ever asked a virtual assistant like Alexa, Siri or Google what the weather is, then you've used a question answering model before. There are two common types of question answering tasks: + +- Extractive: extract the answer from the given context. +- Abstractive: generate an answer from the context that correctly answers the question. + +This guide will show you how to: + +1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [SQuAD](https://huggingface.co/datasets/squad) dataset for extractive question answering. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load SQuAD dataset + +Start by loading a smaller subset of the SQuAD dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> squad = load_dataset("squad", split="train[:5000]") +``` + +Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> squad = squad.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> squad["train"][0] +{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, + 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', + 'id': '5733be284776f41900661182', + 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', + 'title': 'University_of_Notre_Dame' +} +``` + +There are several important fields here: + +- `answers`: the starting location of the answer token and the answer text. +- `context`: background information from which the model needs to extract the answer. +- `question`: the question a model should answer. + +## Preprocess + + + +The next step is to load a DistilBERT tokenizer to process the `question` and `context` fields: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +There are a few preprocessing steps particular to question answering tasks you should be aware of: + +1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`. +2. Next, map the start and end positions of the answer to the original `context` by setting + `return_offset_mapping=True`. +3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [`~tokenizers.Encoding.sequence_ids`] method to + find which part of the offset corresponds to the `question` and which corresponds to the `context`. + +Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`: + +```py +>>> def preprocess_function(examples): +... questions = [q.strip() for q in examples["question"]] +... inputs = tokenizer( +... questions, +... examples["context"], +... max_length=384, +... truncation="only_second", +... return_offsets_mapping=True, +... padding="max_length", +... ) + +... offset_mapping = inputs.pop("offset_mapping") +... answers = examples["answers"] +... start_positions = [] +... end_positions = [] + +... for i, offset in enumerate(offset_mapping): +... answer = answers[i] +... 
start_char = answer["answer_start"][0] +... end_char = answer["answer_start"][0] + len(answer["text"][0]) +... sequence_ids = inputs.sequence_ids(i) + +... # Find the start and end of the context +... idx = 0 +... while sequence_ids[idx] != 1: +... idx += 1 +... context_start = idx +... while sequence_ids[idx] == 1: +... idx += 1 +... context_end = idx - 1 + +... # If the answer is not fully inside the context, label it (0, 0) +... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: +... start_positions.append(0) +... end_positions.append(0) +... else: +... # Otherwise it's the start and end token positions +... idx = context_start +... while idx <= context_end and offset[idx][0] <= start_char: +... idx += 1 +... start_positions.append(idx - 1) + +... idx = context_end +... while idx >= context_start and offset[idx][1] >= end_char: +... idx -= 1 +... end_positions.append(idx + 1) + +... inputs["start_positions"] = start_positions +... inputs["end_positions"] = end_positions +... return inputs +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove any columns you don't need: + +```py +>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) +``` + +Now create a batch of examples using [`DefaultDataCollator`]. Unlike other data collators in 🤗 Transformers, the [`DefaultDataCollator`] does not apply any additional preprocessing such as padding. + + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilBERT with [`AutoModelForQuestionAnswering`]: + +```py +>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer + +>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, and data collator. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_qa_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_squad["train"], +... eval_dataset=tokenized_squad["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... 
) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 2 +>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs +>>> optimizer, schedule = create_optimizer( +... init_lr=2e-5, +... num_warmup_steps=0, +... num_train_steps=total_train_steps, +... ) +``` + +Then you can load DistilBERT with [`TFAutoModelForQuestionAnswering`]: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_squad["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_squad["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +The last thing to setup before you start training is to provide a way to push your model to the Hub. This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_qa_model", +... tokenizer=tokenizer, +... ) +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback]) +``` +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for question answering, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). + + + +## Evaluate + +Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. + +If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) chapter from the 🤗 Hugging Face Course! 
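As a taste of what that evaluation would look like, the metric itself is straightforward to load with 🤗 Evaluate once you have postprocessed the start/end logits into answer strings (the step the course chapter walks through). The snippet below is only a minimal sketch of the expected input format, using a single hand-written prediction and the reference from the example shown earlier rather than real model outputs:

```py
>>> import evaluate

>>> squad_metric = evaluate.load("squad")

>>> # in practice these lists come from your postprocessed model outputs and the dataset
>>> predictions = [{"id": "5733be284776f41900661182", "prediction_text": "Saint Bernadette Soubirous"}]
>>> references = [
...     {
...         "id": "5733be284776f41900661182",
...         "answers": {"text": ["Saint Bernadette Soubirous"], "answer_start": [515]},
...     }
... ]

>>> squad_metric.compute(predictions=predictions, references=references)
{'exact_match': 100.0, 'f1': 100.0}
```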
+ +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with a question and some context you'd like the model to predict: + +```py +>>> question = "How many programming languages does BLOOM support?" +>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for question answering with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model") +>>> question_answerer(question=question, context=context) +{'score': 0.2058267742395401, + 'start': 10, + 'end': 95, + 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'} +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Tokenize the text and return PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +>>> inputs = tokenizer(question, context, return_tensors="pt") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> import torch +>>> from transformers import AutoModelForQuestionAnswering + +>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +>>> with torch.no_grad(): +... outputs = model(**inputs) +``` + +Get the highest probability from the model output for the start and end positions: + +```py +>>> answer_start_index = outputs.start_logits.argmax() +>>> answer_end_index = outputs.end_logits.argmax() +``` + +Decode the predicted tokens to get the answer: + +```py +>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +>>> tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' +``` + + +Tokenize the text and return TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +>>> inputs = tokenizer(question, text, return_tensors="tf") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +>>> outputs = model(**inputs) +``` + +Get the highest probability from the model output for the start and end positions: + +```py +>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) +>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) +``` + +Decode the predicted tokens to get the answer: + +```py +>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +>>> tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' +``` + + diff --git a/docs/source/en/tasks/question_answering.mdx b/docs/source/en/tasks/question_answering.mdx deleted file mode 100644 index deabef4f04f4..000000000000 --- a/docs/source/en/tasks/question_answering.mdx +++ /dev/null @@ -1,421 +0,0 @@ - - -# Question answering - -[[open-in-colab]] - - - -Question answering tasks return an answer given a question. 
If you've ever asked a virtual assistant like Alexa, Siri or Google what the weather is, then you've used a question answering model before. There are two common types of question answering tasks: - -- Extractive: extract the answer from the given context. -- Abstractive: generate an answer from the context that correctly answers the question. - -This guide will show you how to: - -1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [SQuAD](https://huggingface.co/datasets/squad) dataset for extractive question answering. -2. Use your finetuned model for inference. - - - -See the question answering [task page](https://huggingface.co/tasks/question-answering) for more information about other forms of question answering and their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load SQuAD dataset - -Start by loading a smaller subset of the SQuAD dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> squad = load_dataset("squad", split="train[:5000]") -``` - -Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> squad = squad.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> squad["train"][0] -{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, - 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', - 'id': '5733be284776f41900661182', - 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', - 'title': 'University_of_Notre_Dame' -} -``` - -There are several important fields here: - -- `answers`: the starting location of the answer token and the answer text. -- `context`: background information from which the model needs to extract the answer. -- `question`: the question a model should answer. - -## Preprocess - - - -The next step is to load a DistilBERT tokenizer to process the `question` and `context` fields: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -There are a few preprocessing steps particular to question answering tasks you should be aware of: - -1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. 
To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`. -2. Next, map the start and end positions of the answer to the original `context` by setting - `return_offset_mapping=True`. -3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [`~tokenizers.Encoding.sequence_ids`] method to - find which part of the offset corresponds to the `question` and which corresponds to the `context`. - -Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`: - -```py ->>> def preprocess_function(examples): -... questions = [q.strip() for q in examples["question"]] -... inputs = tokenizer( -... questions, -... examples["context"], -... max_length=384, -... truncation="only_second", -... return_offsets_mapping=True, -... padding="max_length", -... ) - -... offset_mapping = inputs.pop("offset_mapping") -... answers = examples["answers"] -... start_positions = [] -... end_positions = [] - -... for i, offset in enumerate(offset_mapping): -... answer = answers[i] -... start_char = answer["answer_start"][0] -... end_char = answer["answer_start"][0] + len(answer["text"][0]) -... sequence_ids = inputs.sequence_ids(i) - -... # Find the start and end of the context -... idx = 0 -... while sequence_ids[idx] != 1: -... idx += 1 -... context_start = idx -... while sequence_ids[idx] == 1: -... idx += 1 -... context_end = idx - 1 - -... # If the answer is not fully inside the context, label it (0, 0) -... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: -... start_positions.append(0) -... end_positions.append(0) -... else: -... # Otherwise it's the start and end token positions -... idx = context_start -... while idx <= context_end and offset[idx][0] <= start_char: -... idx += 1 -... start_positions.append(idx - 1) - -... idx = context_end -... while idx >= context_start and offset[idx][1] >= end_char: -... idx -= 1 -... end_positions.append(idx + 1) - -... inputs["start_positions"] = start_positions -... inputs["end_positions"] = end_positions -... return inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove any columns you don't need: - -```py ->>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) -``` - -Now create a batch of examples using [`DefaultDataCollator`]. Unlike other data collators in 🤗 Transformers, the [`DefaultDataCollator`] does not apply any additional preprocessing such as padding. - - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilBERT with [`AutoModelForQuestionAnswering`]: - -```py ->>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer - ->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - -At this point, only three steps remain: - -1. 
Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, and data collator. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_qa_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_squad["train"], -... eval_dataset=tokenized_squad["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_epochs = 2 ->>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs ->>> optimizer, schedule = create_optimizer( -... init_lr=2e-5, -... num_warmup_steps=0, -... num_train_steps=total_train_steps, -... ) -``` - -Then you can load DistilBERT with [`TFAutoModelForQuestionAnswering`]: - -```py ->>> from transformers import TFAutoModelForQuestionAnswering - ->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_squad["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_squad["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last thing to setup before you start training is to provide a way to push your model to the Hub. This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> callback = PushToHubCallback( -... output_dir="my_awesome_qa_model", -... tokenizer=tokenizer, -... ) -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callback to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback]) -``` -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
- - - - - -For a more in-depth example of how to finetune a model for question answering, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). - - - -## Evaluate - -Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. - -If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) chapter from the 🤗 Hugging Face Course! - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with a question and some context you'd like the model to predict: - -```py ->>> question = "How many programming languages does BLOOM support?" ->>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for question answering with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model") ->>> question_answerer(question=question, context=context) -{'score': 0.2058267742395401, - 'start': 10, - 'end': 95, - 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'} -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, context, return_tensors="pt") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForQuestionAnswering - ->>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") ->>> with torch.no_grad(): -... 
outputs = model(**inputs) -``` - -Get the highest probability from the model output for the start and end positions: - -```py ->>> answer_start_index = outputs.start_logits.argmax() ->>> answer_end_index = outputs.end_logits.argmax() -``` - -Decode the predicted tokens to get the answer: - -```py ->>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] ->>> tokenizer.decode(predict_answer_tokens) -'176 billion parameters and can generate text in 46 languages natural languages and 13' -``` - - -Tokenize the text and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, text, return_tensors="tf") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForQuestionAnswering - ->>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") ->>> outputs = model(**inputs) -``` - -Get the highest probability from the model output for the start and end positions: - -```py ->>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) ->>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) -``` - -Decode the predicted tokens to get the answer: - -```py ->>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] ->>> tokenizer.decode(predict_answer_tokens) -'176 billion parameters and can generate text in 46 languages natural languages and 13' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md new file mode 100644 index 000000000000..c3ad3e00f61a --- /dev/null +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -0,0 +1,598 @@ + + +# Semantic segmentation + +[[open-in-colab]] + + + +Semantic segmentation assigns a label or class to each individual pixel of an image. There are several types of segmentation, and in the case of semantic segmentation, no distinction is made between unique instances of the same object. Both objects are given the same label (for example, "car" instead of "car-1" and "car-2"). Common real-world applications of semantic segmentation include training self-driving cars to identify pedestrians and important traffic information, identifying cells and abnormalities in medical imagery, and monitoring environmental changes from satellite imagery. + +This guide will show you how to: + +1. Finetune [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) on the [SceneParse150](https://huggingface.co/datasets/scene_parse_150) dataset. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q datasets transformers evaluate +``` + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load SceneParse150 dataset + +Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. + +```py +>>> from datasets import load_dataset + +>>> ds = load_dataset("scene_parse_150", split="train[:50]") +``` + +Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> ds = ds.train_test_split(test_size=0.2) +>>> train_ds = ds["train"] +>>> test_ds = ds["test"] +``` + +Then take a look at an example: + +```py +>>> train_ds[0] +{'image': , + 'annotation': , + 'scene_category': 368} +``` + +- `image`: a PIL image of the scene. +- `annotation`: a PIL image of the segmentation map, which is also the model's target. +- `scene_category`: a category id that describes the image scene like "kitchen" or "office". In this guide, you'll only need `image` and `annotation`, both of which are PIL images. + +You'll also want to create a dictionary that maps a label id to a label class which will be useful when you set up the model later. Download the mappings from the Hub and create the `id2label` and `label2id` dictionaries: + +```py +>>> import json +>>> from huggingface_hub import cached_download, hf_hub_url + +>>> repo_id = "huggingface/label-files" +>>> filename = "ade20k-id2label.json" +>>> id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) +>>> id2label = {int(k): v for k, v in id2label.items()} +>>> label2id = {v: k for k, v in id2label.items()} +>>> num_labels = len(id2label) +``` + +## Preprocess + +The next step is to load a SegFormer image processor to prepare the images and annotations for the model. Some datasets, like this one, use the zero-index as the background class. However, the background class isn't actually included in the 150 classes, so you'll need to set `reduce_labels=True` to subtract one from all the labels. The zero-index is replaced by `255` so it's ignored by SegFormer's loss function: + +```py +>>> from transformers import AutoImageProcessor + +>>> checkpoint = "nvidia/mit-b0" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True) +``` + + + + +It is common to apply some data augmentations to an image dataset to make a model more robust against overfitting. In this guide, you'll use the [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) function from [torchvision](https://pytorch.org/vision/stable/index.html) to randomly change the color properties of an image, but you can also use any image library you like. + +```py +>>> from torchvision.transforms import ColorJitter + +>>> jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) +``` + +Now create two preprocessing functions to prepare the images and annotations for the model. These functions convert the images into `pixel_values` and annotations to `labels`. For the training set, `jitter` is applied before providing the images to the image processor. For the test set, the image processor crops and normalizes the `images`, and only crops the `labels` because no data augmentation is applied during testing. + +```py +>>> def train_transforms(example_batch): +... 
images = [jitter(x) for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs + + +>>> def val_transforms(example_batch): +... images = [x for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs +``` + +To apply the `jitter` over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.set_transform`] function. The transform is applied on the fly which is faster and consumes less disk space: + +```py +>>> train_ds.set_transform(train_transforms) +>>> test_ds.set_transform(val_transforms) +``` + + + + + + +It is common to apply some data augmentations to an image dataset to make a model more robust against overfitting. +In this guide, you'll use [`tf.image`](https://www.tensorflow.org/api_docs/python/tf/image) to randomly change the color properties of an image, but you can also use any image +library you like. +Define two separate transformation functions: +- training data transformations that include image augmentation +- validation data transformations that only transpose the images, since computer vision models in 🤗 Transformers expect channels-first layout + +```py +>>> import tensorflow as tf + + +>>> def aug_transforms(image): +... image = tf.keras.utils.img_to_array(image) +... image = tf.image.random_brightness(image, 0.25) +... image = tf.image.random_contrast(image, 0.5, 2.0) +... image = tf.image.random_saturation(image, 0.75, 1.25) +... image = tf.image.random_hue(image, 0.1) +... image = tf.transpose(image, (2, 0, 1)) +... return image + + +>>> def transforms(image): +... image = tf.keras.utils.img_to_array(image) +... image = tf.transpose(image, (2, 0, 1)) +... return image +``` + +Next, create two preprocessing functions to prepare batches of images and annotations for the model. These functions apply +the image transformations and use the earlier loaded `image_processor` to convert the images into `pixel_values` and +annotations to `labels`. `ImageProcessor` also takes care of resizing and normalizing the images. + +```py +>>> def train_transforms(example_batch): +... images = [aug_transforms(x.convert("RGB")) for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs + + +>>> def val_transforms(example_batch): +... images = [transforms(x.convert("RGB")) for x in example_batch["image"]] +... labels = [x for x in example_batch["annotation"]] +... inputs = image_processor(images, labels) +... return inputs +``` + +To apply the preprocessing transformations over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.set_transform`] function. +The transform is applied on the fly which is faster and consumes less disk space: + +```py +>>> train_ds.set_transform(train_transforms) +>>> test_ds.set_transform(val_transforms) +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. 
For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> metric = evaluate.load("mean_iou") +``` + +Then create a function to [`~evaluate.EvaluationModule.compute`] the metrics. Your predictions need to be converted to +logits first, and then reshaped to match the size of the labels before you can call [`~evaluate.EvaluationModule.compute`]: + + + + +```py +>>> import numpy as np +>>> import torch +>>> from torch import nn + +>>> def compute_metrics(eval_pred): +... with torch.no_grad(): +... logits, labels = eval_pred +... logits_tensor = torch.from_numpy(logits) +... logits_tensor = nn.functional.interpolate( +... logits_tensor, +... size=labels.shape[-2:], +... mode="bilinear", +... align_corners=False, +... ).argmax(dim=1) + +... pred_labels = logits_tensor.detach().cpu().numpy() +... metrics = metric.compute( +... predictions=pred_labels, +... references=labels, +... num_labels=num_labels, +... ignore_index=255, +... reduce_labels=False, +... ) +... for key, value in metrics.items(): +... if type(value) is np.ndarray: +... metrics[key] = value.tolist() +... return metrics +``` + + + + + + + + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... logits = tf.transpose(logits, perm=[0, 2, 3, 1]) +... logits_resized = tf.image.resize( +... logits, +... size=tf.shape(labels)[1:], +... method="bilinear", +... ) + +... pred_labels = tf.argmax(logits_resized, axis=-1) +... metrics = metric.compute( +... predictions=pred_labels, +... references=labels, +... num_labels=num_labels, +... ignore_index=-1, +... reduce_labels=image_processor.do_reduce_labels, +... ) + +... per_category_accuracy = metrics.pop("per_category_accuracy").tolist() +... per_category_iou = metrics.pop("per_category_iou").tolist() + +... metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)}) +... metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)}) +... return {"val_" + k: v for k, v in metrics.items()} +``` + + + + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#finetune-with-trainer)! + + + +You're ready to start training your model now! Load SegFormer with [`AutoModelForSemanticSegmentation`], and pass the model the mapping between label ids and label classes: + +```py +>>> from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer + +>>> model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because this'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the IoU metric and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="segformer-b0-scene-parse-150", +... learning_rate=6e-5, +... num_train_epochs=50, +... per_device_train_batch_size=2, +... per_device_eval_batch_size=2, +... save_total_limit=3, +... evaluation_strategy="steps", +... save_strategy="steps", +... save_steps=20, +... eval_steps=20, +... logging_steps=1, +... eval_accumulation_steps=5, +... remove_unused_columns=False, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=train_ds, +... eval_dataset=test_ds, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + + + + +If you are unfamiliar with fine-tuning a model with Keras, check out the [basic tutorial](./training#train-a-tensorflow-model-with-keras) first! + + + +To fine-tune a model in TensorFlow, follow these steps: +1. Define the training hyperparameters, and set up an optimizer and a learning rate schedule. +2. Instantiate a pretrained model. +3. Convert a 🤗 Dataset to a `tf.data.Dataset`. +4. Compile your model. +5. Add callbacks to calculate metrics and upload your model to 🤗 Hub +6. Use the `fit()` method to run the training. + +Start by defining the hyperparameters, optimizer and learning rate schedule: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 2 +>>> num_epochs = 50 +>>> num_train_steps = len(train_ds) * num_epochs +>>> learning_rate = 6e-5 +>>> weight_decay_rate = 0.01 + +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=learning_rate, +... num_train_steps=num_train_steps, +... weight_decay_rate=weight_decay_rate, +... num_warmup_steps=0, +... ) +``` + +Then, load SegFormer with [`TFAutoModelForSemanticSegmentation`] along with the label mappings, and compile it with the +optimizer. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> from transformers import TFAutoModelForSemanticSegmentation + +>>> model = TFAutoModelForSemanticSegmentation.from_pretrained( +... checkpoint, +... id2label=id2label, +... label2id=label2id, +... ) +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Dataset.to_tf_dataset`] and the [`DefaultDataCollator`]: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") + +>>> tf_train_dataset = train_ds.to_tf_dataset( +... columns=["pixel_values", "label"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_eval_dataset = test_ds.to_tf_dataset( +... columns=["pixel_values", "label"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +To compute the accuracy from the predictions and push your model to the 🤗 Hub, use [Keras callbacks](../main_classes/keras_callbacks). 
+Pass your `compute_metrics` function to [`KerasMetricCallback`], +and use the [`PushToHubCallback`] to upload the model: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback + +>>> metric_callback = KerasMetricCallback( +... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] +... ) + +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) + +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you are ready to train your model! Call `fit()` with your training and validation datasets, the number of epochs, +and your callbacks to fine-tune the model: + +```py +>>> model.fit( +... tf_train_dataset, +... validation_data=tf_eval_dataset, +... callbacks=callbacks, +... epochs=num_epochs, +... ) +``` + +Congratulations! You have fine-tuned your model and shared it on the 🤗 Hub. You can now use it for inference! + + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Load an image for inference: + +```py +>>> image = ds[0]["image"] +>>> image +``` + +
+ Image of bedroom +
+ + + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image segmentation with your model, and pass your image to it: + +```py +>>> from transformers import pipeline + +>>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model") +>>> segmenter(image) +[{'score': None, + 'label': 'wall', + 'mask': }, + {'score': None, + 'label': 'sky', + 'mask': }, + {'score': None, + 'label': 'floor', + 'mask': }, + {'score': None, + 'label': 'ceiling', + 'mask': }, + {'score': None, + 'label': 'bed ', + 'mask': }, + {'score': None, + 'label': 'windowpane', + 'mask': }, + {'score': None, + 'label': 'cabinet', + 'mask': }, + {'score': None, + 'label': 'chair', + 'mask': }, + {'score': None, + 'label': 'armchair', + 'mask': }] +``` + +You can also manually replicate the results of the `pipeline` if you'd like. Process the image with an image processor and place the `pixel_values` on a GPU: + +```py +>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU +>>> encoding = image_processor(image, return_tensors="pt") +>>> pixel_values = encoding.pixel_values.to(device) +``` + +Pass your input to the model and return the `logits`: + +```py +>>> outputs = model(pixel_values=pixel_values) +>>> logits = outputs.logits.cpu() +``` + +Next, rescale the logits to the original image size: + +```py +>>> upsampled_logits = nn.functional.interpolate( +... logits, +... size=image.size[::-1], +... mode="bilinear", +... align_corners=False, +... ) + +>>> pred_seg = upsampled_logits.argmax(dim=1)[0] +``` + + + + + + +Load an image processor to preprocess the image and return the input as TensorFlow tensors: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/scene_segmentation") +>>> inputs = image_processor(image, return_tensors="tf") +``` + +Pass your input to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForSemanticSegmentation + +>>> model = TFAutoModelForSemanticSegmentation.from_pretrained("MariaK/scene_segmentation") +>>> logits = model(**inputs).logits +``` + +Next, rescale the logits to the original image size and apply argmax on the class dimension: +```py +>>> logits = tf.transpose(logits, [0, 2, 3, 1]) + +>>> upsampled_logits = tf.image.resize( +... logits, +... # We reverse the shape of `image` because `image.size` returns width and height. +... image.size[::-1], +... ) + +>>> pred_seg = tf.math.argmax(upsampled_logits, axis=-1)[0] +``` + + + + +To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) as `ade_palette()` that maps each class to their RGB values. Then you can combine and plot your image and the predicted segmentation map: + +```py +>>> import matplotlib.pyplot as plt +>>> import numpy as np + +>>> color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8) +>>> palette = np.array(ade_palette()) +>>> for label, color in enumerate(palette): +... color_seg[pred_seg == label, :] = color +>>> color_seg = color_seg[..., ::-1] # convert to BGR + +>>> img = np.array(image) * 0.5 + color_seg * 0.5 # plot the image with the segmentation map +>>> img = img.astype(np.uint8) + +>>> plt.figure(figsize=(15, 10)) +>>> plt.imshow(img) +>>> plt.show() +``` + +
+ Image of bedroom overlaid with segmentation map +
diff --git a/docs/source/en/tasks/semantic_segmentation.mdx b/docs/source/en/tasks/semantic_segmentation.mdx deleted file mode 100644 index f1ab7ee0ea68..000000000000 --- a/docs/source/en/tasks/semantic_segmentation.mdx +++ /dev/null @@ -1,333 +0,0 @@ - - -# Semantic segmentation - -[[open-in-colab]] - - - -Semantic segmentation assigns a label or class to each individual pixel of an image. There are several types of segmentation, and in the case of semantic segmentation, no distinction is made between unique instances of the same object. Both objects are given the same label (for example, "car" instead of "car-1" and "car-2"). Common real-world applications of semantic segmentation include training self-driving cars to identify pedestrians and important traffic information, identifying cells and abnormalities in medical imagery, and monitoring environmental changes from satellite imagery. - -This guide will show you how to: - -1. Finetune [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) on the [SceneParse150](https://huggingface.co/datasets/scene_parse_150) dataset. -2. Use your finetuned model for inference. - - - -See the image segmentation [task page](https://huggingface.co/tasks/image-segmentation) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install -q datasets transformers evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load SceneParse150 dataset - -Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everythings works before spending more time training on the full dataset. - -```py ->>> from datasets import load_dataset - ->>> ds = load_dataset("scene_parse_150", split="train[:50]") -``` - -Split the dataset's `train` split into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> ds = ds.train_test_split(test_size=0.2) ->>> train_ds = ds["train"] ->>> test_ds = ds["test"] -``` - -Then take a look at an example: - -```py ->>> train_ds[0] -{'image': , - 'annotation': , - 'scene_category': 368} -``` - -- `image`: a PIL image of the scene. -- `annotation`: a PIL image of the segmentation map, which is also the model's target. -- `scene_category`: a category id that describes the image scene like "kitchen" or "office". In this guide, you'll only need `image` and `annotation`, both of which are PIL images. - -You'll also want to create a dictionary that maps a label id to a label class which will be useful when you set up the model later. Download the mappings from the Hub and create the `id2label` and `label2id` dictionaries: - -```py ->>> import json ->>> from huggingface_hub import cached_download, hf_hub_url - ->>> repo_id = "huggingface/label-files" ->>> filename = "ade20k-id2label.json" ->>> id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r")) ->>> id2label = {int(k): v for k, v in id2label.items()} ->>> label2id = {v: k for k, v in id2label.items()} ->>> num_labels = len(id2label) -``` - -## Preprocess - -The next step is to load a SegFormer image processor to prepare the images and annotations for the model. 
Some datasets, like this one, use the zero-index as the background class. However, the background class isn't actually included in the 150 classes, so you'll need to set `reduce_labels=True` to subtract one from all the labels. The zero-index is replaced by `255` so it's ignored by SegFormer's loss function: - -```py ->>> from transformers import AutoImageProcessor - ->>> feature_extractor = AutoImageProcessor.from_pretrained("nvidia/mit-b0", reduce_labels=True) -``` - -It is common to apply some data augmentations to an image dataset to make a model more robust against overfitting. In this guide, you'll use the [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) function from [torchvision](https://pytorch.org/vision/stable/index.html) to randomly change the color properties of an image, but you can also use any image library you like. - -```py ->>> from torchvision.transforms import ColorJitter - ->>> jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) -``` - -Now create two preprocessing functions to prepare the images and annotations for the model. These functions convert the images into `pixel_values` and annotations to `labels`. For the training set, `jitter` is applied before providing the images to the image processor. For the test set, the image processor crops and normalizes the `images`, and only crops the `labels` because no data augmentation is applied during testing. - -```py ->>> def train_transforms(example_batch): -... images = [jitter(x) for x in example_batch["image"]] -... labels = [x for x in example_batch["annotation"]] -... inputs = feature_extractor(images, labels) -... return inputs - - ->>> def val_transforms(example_batch): -... images = [x for x in example_batch["image"]] -... labels = [x for x in example_batch["annotation"]] -... inputs = feature_extractor(images, labels) -... return inputs -``` - -To apply the `jitter` over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.set_transform`] function. The transform is applied on the fly which is faster and consumes less disk space: - -```py ->>> train_ds.set_transform(train_transforms) ->>> test_ds.set_transform(val_transforms) -``` - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> metric = evaluate.load("mean_iou") -``` - -Then create a function to [`~evaluate.EvaluationModule.compute`] the metrics. Your predictions need to be converted to logits first, and then reshaped to match the size of the labels before you can call [`~evaluate.EvaluationModule.compute`]: - -```py ->>> def compute_metrics(eval_pred): -... with torch.no_grad(): -... logits, labels = eval_pred -... logits_tensor = torch.from_numpy(logits) -... logits_tensor = nn.functional.interpolate( -... logits_tensor, -... size=labels.shape[-2:], -... mode="bilinear", -... align_corners=False, -... ).argmax(dim=1) - -... pred_labels = logits_tensor.detach().cpu().numpy() -... metrics = metric.compute( -... predictions=pred_labels, -... references=labels, -... num_labels=num_labels, -... 
ignore_index=255, -... reduce_labels=False, -... ) -... for key, value in metrics.items(): -... if type(value) is np.ndarray: -... metrics[key] = value.tolist() -... return metrics -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#finetune-with-trainer)! - - - -You're ready to start training your model now! Load SegFormer with [`AutoModelForSemanticSegmentation`], and pass the model the mapping between label ids and label classes: - -```py ->>> from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer - ->>> pretrained_model_name = "nvidia/mit-b0" ->>> model = AutoModelForSemanticSegmentation.from_pretrained( -... pretrained_model_name, id2label=id2label, label2id=label2id -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. It is important you don't remove unused columns because this'll drop the `image` column. Without the `image` column, you can't create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the IoU metric and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="segformer-b0-scene-parse-150", -... learning_rate=6e-5, -... num_train_epochs=50, -... per_device_train_batch_size=2, -... per_device_eval_batch_size=2, -... save_total_limit=3, -... evaluation_strategy="steps", -... save_strategy="steps", -... save_steps=20, -... eval_steps=20, -... logging_steps=1, -... eval_accumulation_steps=5, -... remove_unused_columns=False, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=train_ds, -... eval_dataset=test_ds, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Load an image for inference: - -```py ->>> image = ds[0]["image"] ->>> image -``` - -
- Image of bedroom -
- -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image segmentation with your model, and pass your image to it: - -```py ->>> from transformers import pipeline - ->>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model") ->>> segmenter(image) -[{'score': None, - 'label': 'wall', - 'mask': }, - {'score': None, - 'label': 'sky', - 'mask': }, - {'score': None, - 'label': 'floor', - 'mask': }, - {'score': None, - 'label': 'ceiling', - 'mask': }, - {'score': None, - 'label': 'bed ', - 'mask': }, - {'score': None, - 'label': 'windowpane', - 'mask': }, - {'score': None, - 'label': 'cabinet', - 'mask': }, - {'score': None, - 'label': 'chair', - 'mask': }, - {'score': None, - 'label': 'armchair', - 'mask': }] -``` - -You can also manually replicate the results of the `pipeline` if you'd like. Process the image with an image processor and place the `pixel_values` on a GPU: - -```py ->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU ->>> encoding = feature_extractor(image, return_tensors="pt") ->>> pixel_values = encoding.pixel_values.to(device) -``` - -Pass your input to the model and return the `logits`: - -```py ->>> outputs = model(pixel_values=pixel_values) ->>> logits = outputs.logits.cpu() -``` - -Next, rescale the logits to the original image size: - -```py ->>> upsampled_logits = nn.functional.interpolate( -... logits, -... size=image.size[::-1], -... mode="bilinear", -... align_corners=False, -... ) - ->>> pred_seg = upsampled_logits.argmax(dim=1)[0] -``` - -To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) that maps each class to their RGB values. Then you can combine and plot your image and the predicted segmentation map: - -```py ->>> import matplotlib.pyplot as plt - ->>> color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8) ->>> palette = np.array(ade_palette()) ->>> for label, color in enumerate(palette): -... color_seg[pred_seg == label, :] = color ->>> color_seg = color_seg[..., ::-1] # convert to BGR - ->>> img = np.array(image) * 0.5 + color_seg * 0.5 # plot the image with the segmentation map ->>> img = img.astype(np.uint8) - ->>> plt.figure(figsize=(15, 10)) ->>> plt.imshow(img) ->>> plt.show() -``` - -
- Image of bedroom overlaid with segmentation map -
\ No newline at end of file diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md new file mode 100644 index 000000000000..b67d43453d27 --- /dev/null +++ b/docs/source/en/tasks/sequence_classification.md @@ -0,0 +1,399 @@ + + +# Text classification + +[[open-in-colab]] + + + +Text classification is a common NLP task that assigns a label or class to text. Some of the largest companies run text classification in production for a wide range of practical applications. One of the most popular forms of text classification is sentiment analysis, which assigns a label like 🙂 positive, 🙁 negative, or 😐 neutral to a sequence of text. + +This guide will show you how to: + +1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [IMDb](https://huggingface.co/datasets/imdb) dataset to determine whether a movie review is positive or negative. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + + +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), 
[X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load IMDb dataset + +Start by loading the IMDb dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> imdb = load_dataset("imdb") +``` + +Then take a look at an example: + +```py +>>> imdb["test"][0] +{ + "label": 0, + "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", +} +``` + +There are two fields in this dataset: + +- `text`: the movie review text. +- `label`: a value that is either `0` for a negative review or `1` for a positive review. + +## Preprocess + +The next step is to load a DistilBERT tokenizer to preprocess the `text` field: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length: + +```py +>>> def preprocess_function(examples): +... return tokenizer(examples["text"], truncation=True) +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once: + +```py +tokenized_imdb = imdb.map(preprocess_function, batched=True) +``` + +Now create a batch of examples using [`DataCollatorWithPadding`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. 
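+To get a feel for what dynamic padding means, you can tokenize two reviews of different lengths and pad them only to the longer of the two. This is just an illustrative sketch (the two reviews below are made up, and the exact sequence lengths depend on the tokenizer), but it shows that padding is applied per batch rather than across the whole dataset:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+>>> reviews = [
+...     "A short review.",  # made-up example reviews of different lengths
+...     "A much longer review that produces more tokens and therefore sets the padded length for this small batch.",
+... ]
+>>> batch = tokenizer(reviews, padding="longest")  # pad only to the longest sequence in this batch
+>>> [len(ids) for ids in batch["input_ids"]]  # both entries now share the length of the longer review
+```
+
+[`DataCollatorWithPadding`] applies this same "pad to the longest sequence in the batch" behavior on the fly to every training batch: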
+ + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + +Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: + +```py +>>> id2label = {0: "NEGATIVE", 1: "POSITIVE"} +>>> label2id = {"NEGATIVE": 0, "POSITIVE": 1} +``` + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilBERT with [`AutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer + +>>> model = AutoModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_imdb["train"], +... eval_dataset=tokenized_imdb["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + + + +[`Trainer`] applies dynamic padding by default when you pass `tokenizer` to it. 
In this case, you don't need to specify a data collator explicitly. + + + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer +>>> import tensorflow as tf + +>>> batch_size = 16 +>>> num_epochs = 5 +>>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size +>>> total_train_steps = int(batches_per_epoch * num_epochs) +>>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +Then you can load DistilBERT with [`TFAutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_imdb["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_imdb["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). + +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
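+As a quick sanity check before relying on the uploaded checkpoint, you can also run the freshly trained model (still in memory as `model`, together with the `tokenizer` loaded earlier) on a single review. The example text below is only illustrative, and the predicted label depends on how your training run went:
+
+```py
+>>> import tensorflow as tf
+
+>>> sample = "An unexpectedly moving film with a terrific cast."  # made-up example review
+>>> inputs = tokenizer(sample, return_tensors="tf")
+>>> logits = model(**inputs).logits
+>>> model.config.id2label[int(tf.math.argmax(logits, axis=-1)[0])]
+```
+
+A more complete inference workflow, including the [`pipeline`], is shown below.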
+ + + + + +For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Grab some text you'd like to run inference on: + +```py +>>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") +>>> classifier(text) +[{'label': 'POSITIVE', 'score': 0.9994940757751465}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Tokenize the text and return PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predicted_class_id = logits.argmax().item() +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + +Tokenize the text and return TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + diff --git a/docs/source/en/tasks/sequence_classification.mdx b/docs/source/en/tasks/sequence_classification.mdx deleted file mode 100644 index bc9c5f20e722..000000000000 --- a/docs/source/en/tasks/sequence_classification.mdx +++ /dev/null @@ -1,386 +0,0 @@ - - -# Text classification - -[[open-in-colab]] - - - -Text classification is a common NLP task that assigns a label or class to text. Some of the largest companies run text classification in production for a wide range of practical applications. One of the most popular forms of text classification is sentiment analysis, which assigns a label like 🙂 positive, 🙁 negative, or 😐 neutral to a sequence of text. - -This guide will show you how to: - -1. 
Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [IMDb](https://huggingface.co/datasets/imdb) dataset to determine whether a movie review is positive or negative. -2. Use your finetuned model for inference. - - - -See the text classification [task page](https://huggingface.co/tasks/text-classification) for more information about other forms of text classification and their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load IMDb dataset - -Start by loading the IMDb dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> imdb = load_dataset("imdb") -``` - -Then take a look at an example: - -```py ->>> imdb["test"][0] -{ - "label": 0, - "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", -} -``` - -There are two fields in this dataset: - -- `text`: the movie review text. -- `label`: a value that is either `0` for a negative review or `1` for a positive review. - -## Preprocess - -The next step is to load a DistilBERT tokenizer to preprocess the `text` field: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length: - -```py ->>> def preprocess_function(examples): -... return tokenizer(examples["text"], truncation=True) -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once: - -```py -tokenized_imdb = imdb.map(preprocess_function, batched=True) -``` - -Now create a batch of examples using [`DataCollatorWithPadding`]. 
It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorWithPadding - ->>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) -``` - - -```py ->>> from transformers import DataCollatorWithPadding - ->>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> accuracy = evaluate.load("accuracy") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the accuracy: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions, labels = eval_pred -... predictions = np.argmax(predictions, axis=1) -... return accuracy.compute(predictions=predictions, references=labels) -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - -Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: - -```py ->>> id2label = {0: "NEGATIVE", 1: "POSITIVE"} ->>> label2id = {"NEGATIVE": 0, "POSITIVE": 1} -``` - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilBERT with [`AutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer - ->>> model = AutoModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_model", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=2, -... weight_decay=0.01, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... load_best_model_at_end=True, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_imdb["train"], -... eval_dataset=tokenized_imdb["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... 
compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - - - -[`Trainer`] applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly. - - - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer ->>> import tensorflow as tf - ->>> batch_size = 16 ->>> num_epochs = 5 ->>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size ->>> total_train_steps = int(batches_per_epoch * num_epochs) ->>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) -``` - -Then you can load DistilBERT with [`TFAutoModelForSequenceClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id -... ) -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_imdb["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_imdb["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
- - - - - -For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Grab some text you'd like to run inference on: - -```py ->>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") ->>> classifier(text) -[{'label': 'POSITIVE', 'score': 0.9994940757751465}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") ->>> inputs = tokenizer(text, return_tensors="pt") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predicted_class_id = logits.argmax().item() ->>> model.config.id2label[predicted_class_id] -'POSITIVE' -``` - - -Tokenize the text and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") ->>> inputs = tokenizer(text, return_tensors="tf") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") ->>> logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) ->>> model.config.id2label[predicted_class_id] -'POSITIVE' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md new file mode 100644 index 000000000000..ecdf37ce6efb --- /dev/null +++ b/docs/source/en/tasks/summarization.md @@ -0,0 +1,403 @@ + + +# Summarization + +[[open-in-colab]] + + + +Summarization creates a shorter version of a document or an article that captures all the important information. Along with translation, it is another example of a task that can be formulated as a sequence-to-sequence task. Summarization can be: + +- Extractive: extract the most relevant information from a document. +- Abstractive: generate new text that captures the most relevant information. + +This guide will show you how to: + +1. 
Finetune [T5](https://huggingface.co/t5-small) on the California state bill subset of the [BillSum](https://huggingface.co/datasets/billsum) dataset for abstractive summarization. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate rouge_score +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load BillSum dataset + +Start by loading the smaller California state bill subset of the BillSum dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> billsum = load_dataset("billsum", split="ca_test") +``` + +Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> billsum = billsum.train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> billsum["train"][0] +{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. 
Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', + 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the 
benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', + 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} +``` + +There are two fields that you'll want to use: + +- `text`: the text of the bill which'll be the input to the model. +- `summary`: a condensed version of `text` which'll be the model target. + +## Preprocess + +The next step is to load a T5 tokenizer to process `text` and `summary`: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +The preprocessing function you want to create needs to: + +1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. +2. Use the keyword `text_target` argument when tokenizing labels. +3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. 
+ +```py +>>> prefix = "summarize: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + doc for doc in examples["text"]] +... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) + +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) + +... model_inputs["labels"] = labels["input_ids"] +... return model_inputs +``` + +To apply the preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +>>> tokenized_billsum = billsum.map(preprocess_function, batched=True) +``` + +Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> rouge = evaluate.load("rouge") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the ROUGE metric: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] +... result["gen_len"] = np.mean(prediction_lens) + +... return {k: round(v, 4) for k, v in result.items()} +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load T5 with [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). 
At the end of each epoch, the [`Trainer`] will evaluate the ROUGE metric and save the training checkpoint. +2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_billsum_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=4, +... predict_with_generate=True, +... fp16=True, +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_billsum["train"], +... eval_dataset=tokenized_billsum["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_billsum["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_billsum["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to set up before you start training are to compute the ROUGE score from the predictions, and to provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). + +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_billsum_model", +... tokenizer=tokenizer, +... 
) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Come up with some text you'd like to summarize. For T5, you need to prefix your input depending on the task you're working on. For summarization you should prefix your input as shown below: + +```py +>>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for summarization with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model") +>>> summarizer(text) +[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + + +Tokenize the text and return the `input_ids` as PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> inputs = tokenizer(text, return_tensors="pt").input_ids +``` + +Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. + +```py +>>> from transformers import AutoModelForSeq2SeqLM + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. 
it will ask the ultra-wealthy and corporations to pay their fair share.' +``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' +``` + + diff --git a/docs/source/en/tasks/summarization.mdx b/docs/source/en/tasks/summarization.mdx deleted file mode 100644 index 1663c1f71348..000000000000 --- a/docs/source/en/tasks/summarization.mdx +++ /dev/null @@ -1,390 +0,0 @@ - - -# Summarization - - - -Summarization creates a shorter version of a document or an article that captures all the important information. Along with translation, it is another example of a task that can be formulated as a sequence-to-sequence task. Summarization can be: - -- Extractive: extract the most relevant information from a document. -- Abstractive: generate new text that captures the most relevant information. - -This guide will show you how to: - -1. Finetune [T5](https://huggingface.co/t5-small) on the California state bill subset of the [BillSum](https://huggingface.co/datasets/billsum) dataset for abstractive summarization. -2. Use your finetuned model for inference. - - - -See the summarization [task page](https://huggingface.co/tasks/summarization) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load BillSum dataset - -Start by loading the smaller California state bill subset of the BillSum dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> billsum = load_dataset("billsum", split="ca_test") -``` - -Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> billsum = billsum.train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> billsum["train"][0] -{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. 
Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', - 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is 
necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 
3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', - 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} -``` - -There are two fields that you'll want to use: - -- `text`: the text of the bill which'll be the input to the model. -- `summary`: a condensed version of `text` which'll be the model target. - -## Preprocess - -The next step is to load a T5 tokenizer to process `text` and `summary`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -The preprocessing function you want to create needs to: - -1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. -2. Use the keyword `text_target` argument when tokenizing labels. -3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. - -```py ->>> prefix = "summarize: " - - ->>> def preprocess_function(examples): -... inputs = [prefix + doc for doc in examples["text"]] -... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) - -... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) - -... model_inputs["labels"] = labels["input_ids"] -... return model_inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py ->>> tokenized_billsum = billsum.map(preprocess_function, batched=True) -``` - -Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) -``` - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> rouge = evaluate.load("rouge") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the ROUGE metric: - -```py ->>> import numpy as np - - ->>> def compute_metrics(eval_pred): -... predictions, labels = eval_pred -... 
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) -... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) -... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - -... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) - -... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] -... result["gen_len"] = np.mean(prediction_lens) - -... return {k: round(v, 4) for k, v in result.items()} -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load T5 with [`AutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the ROUGE metric and save the training checkpoint. -2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = Seq2SeqTrainingArguments( -... output_dir="my_awesome_billsum_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... weight_decay=0.01, -... save_total_limit=3, -... num_train_epochs=4, -... predict_with_generate=True, -... fp16=True, -... push_to_hub=True, -... ) - ->>> trainer = Seq2SeqTrainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_billsum["train"], -... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_billsum["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... 
) - ->>> tf_test_set = model.prepare_tf_dataset( -... tokenized_billsum["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the ROUGE score from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_billsum_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text you'd like to summarize. For T5, you need to prefix your input depending on the task you're working on. For summarization you should prefix your input as shown below: - -```py ->>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for summarization with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model") ->>> summarizer(text) -[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. 
It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - - -Tokenize the text and return the `input_ids` as PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> inputs = tokenizer(text, return_tensors="pt").input_ids -``` - -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import AutoModelForSeq2SeqLM - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> inputs = tokenizer(text, return_tensors="tf").input_ids -``` - -Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") ->>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md new file mode 100644 index 000000000000..86a0d49fd04d --- /dev/null +++ b/docs/source/en/tasks/text-to-speech.md @@ -0,0 +1,633 @@ + + +# Text to speech + +[[open-in-colab]] + +Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple +languages and for multiple speakers. Several text-to-speech models are currently available in 🤗 Transformers, such as +[Bark](../model_doc/bark), [MMS](../model_doc/mms), [VITS](../model_doc/vits) and [SpeechT5](../model_doc/speecht5). + +You can easily generate audio using the `"text-to-audio"` pipeline (or its alias - `"text-to-speech"`). Some models, like Bark, +can also be conditioned to generate non-verbal communications such as laughing, sighing and crying, or even add music. 
+Here's an example of how you would use the `"text-to-speech"` pipeline with Bark: + +```py +>>> from transformers import pipeline + +>>> pipe = pipeline("text-to-speech", model="suno/bark-small") +>>> text = "[clears throat] This is a test ... and I just took a long pause." +>>> output = pipe(text) +``` + +Here's a code snippet you can use to listen to the resulting audio in a notebook: + +```python +>>> from IPython.display import Audio +>>> Audio(output["audio"], rate=output["sampling_rate"]) +``` + +For more examples on what Bark and other pretrained TTS models can do, refer to our +[Audio course](https://huggingface.co/learn/audio-course/chapter6/pre-trained_models). + +If you are looking to fine-tune a TTS model, you can currently fine-tune SpeechT5 only. SpeechT5 is pre-trained on a combination of +speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text +and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 +supports multiple speakers through x-vector speaker embeddings. + +The remainder of this guide illustrates how to: + +1. Fine-tune [SpeechT5](../model_doc/speecht5) that was originally trained on English speech on the Dutch (`nl`) language subset of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset. +2. Use your refined model for inference in one of two ways: using a pipeline or directly. + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install datasets soundfile speechbrain accelerate +``` + +Install 🤗Transformers from source as not all the SpeechT5 features have been merged into an official release yet: + +```bash +pip install git+https://github.com/huggingface/transformers.git +``` + + + +To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: + +```bash +!nvidia-smi +``` + + + +We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load the dataset + +[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of +data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 +European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. + +Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable +option for training TTS models. The features that make it beneficial for ASR, such as excessive background noise, are +typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite +challenging. + +Let's load the data: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train") +>>> len(dataset) +20968 +``` + +20968 examples should be sufficient for fine-tuning. 
SpeechT5 expects audio data to have a sampling rate of 16 kHz, so +make sure the examples in the dataset meet this requirement: + +```py +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +``` + +## Preprocess the data + +Let's begin by defining the model checkpoint to use and loading the appropriate processor: + +```py +>>> from transformers import SpeechT5Processor + +>>> checkpoint = "microsoft/speecht5_tts" +>>> processor = SpeechT5Processor.from_pretrained(checkpoint) +``` + +### Text cleanup for SpeechT5 tokenization + +Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text: + +```py +>>> tokenizer = processor.tokenizer +``` + +The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, +consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written +out as text. Thus, it is a better fit, and we recommend using `normalized_text` as input text. + +Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If +left as is, these characters will be converted to `<unk>` tokens. However, in Dutch, certain characters like `à` are +used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`. + +To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which +works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates +the transcriptions from all examples into one string and converts it to a set of characters. +Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for +the mapping function. + +```py +>>> def extract_all_chars(batch): +... all_text = " ".join(batch["normalized_text"]) +... vocab = list(set(all_text)) +... return {"vocab": [vocab], "all_text": [all_text]} + + +>>> vocabs = dataset.map( +... extract_all_chars, +... batched=True, +... batch_size=-1, +... keep_in_memory=True, +... remove_columns=dataset.column_names, +... ) + +>>> dataset_vocab = set(vocabs["vocab"][0]) +>>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()} +``` + +Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. +To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting +set will contain the characters that are in the dataset but not in the tokenizer. + +```py +>>> dataset_vocab - tokenizer_vocab +{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'} +``` + +To handle the unsupported characters identified in the previous step, define a function that maps these characters to +valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately. + +```py +>>> replacements = [ +... ("à", "a"), +... ("ç", "c"), +... ("è", "e"), +... ("ë", "e"), +... ("í", "i"), +... ("ï", "i"), +... ("ö", "o"), +... ("ü", "u"), +... ] + + +>>> def cleanup_text(inputs): +... for src, dst in replacements: +... inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst) +... return inputs + + +>>> dataset = dataset.map(cleanup_text) +``` + +Now that you have dealt with special characters in the text, it's time to shift focus to the audio data. 
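+Before doing so, you can optionally double-check that the replacements worked. This is a minimal sketch (the `vocabs_clean` name is just for illustration) that re-runs the `extract_all_chars` mapping from above and assumes `tokenizer_vocab` is still in scope; after cleanup, only the space character should remain, and the tokenizer already represents spaces as `▁`:
+
+```py
+>>> # re-extract the unique characters from the cleaned-up transcriptions
+>>> vocabs_clean = dataset.map(
+... extract_all_chars,
+... batched=True,
+... batch_size=-1,
+... keep_in_memory=True,
+... remove_columns=dataset.column_names,
+... )
+
+>>> set(vocabs_clean["vocab"][0]) - tokenizer_vocab
+{' '}
+```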
+ +### Speakers + +The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To +determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. +With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of +speakers and examples in the data. + +```py +>>> from collections import defaultdict + +>>> speaker_counts = defaultdict(int) + +>>> for speaker_id in dataset["speaker_id"]: +... speaker_counts[speaker_id] += 1 +``` + +By plotting a histogram you can get a sense of how much data there is for each speaker. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.hist(speaker_counts.values(), bins=20) +>>> plt.ylabel("Speakers") +>>> plt.xlabel("Examples") +>>> plt.show() +``` + +
+ Speakers histogram +
+ +The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while +around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit +the data to speakers with between 100 and 400 examples. + +```py +>>> def select_speaker(speaker_id): +... return 100 <= speaker_counts[speaker_id] <= 400 + + +>>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"]) +``` + +Let's check how many speakers remain: + +```py +>>> len(set(dataset["speaker_id"])) +42 +``` + +Let's see how many examples are left: + +```py +>>> len(dataset) +9973 +``` + +You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient. + +Note that some speakers with few examples may actually have more audio available if the examples are long. However, +determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a +time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here. + +### Speaker embeddings + +To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. +The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics. +To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) +model from SpeechBrain. + +Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector +containing the corresponding speaker embedding. + +```py +>>> import os +>>> import torch +>>> from speechbrain.pretrained import EncoderClassifier + +>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb" + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> speaker_model = EncoderClassifier.from_hparams( +... source=spk_model_name, +... run_opts={"device": device}, +... savedir=os.path.join("/tmp", spk_model_name), +... ) + + +>>> def create_speaker_embedding(waveform): +... with torch.no_grad(): +... speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform)) +... speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) +... speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() +... return speaker_embeddings +``` + +It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb +dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate +reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases. + +For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model +is better able to capture the unique voice characteristics present in the Dutch language. + +### Processing the dataset + +Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a +single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. +It should also add the speaker embeddings as an additional input. + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example = processor( +... text=example["normalized_text"], +... 
audio_target=audio["array"], +... sampling_rate=audio["sampling_rate"], +... return_attention_mask=False, +... ) + +... # strip off the batch dimension +... example["labels"] = example["labels"][0] + +... # use SpeechBrain to obtain x-vector +... example["speaker_embeddings"] = create_speaker_embedding(audio["array"]) + +... return example +``` + +Verify the processing is correct by looking at a single example: + +```py +>>> processed_example = prepare_dataset(dataset[0]) +>>> list(processed_example.keys()) +['input_ids', 'labels', 'stop_labels', 'speaker_embeddings'] +``` + +Speaker embeddings should be a 512-element vector: + +```py +>>> processed_example["speaker_embeddings"].shape +(512,) +``` + +The labels should be a log-mel spectrogram with 80 mel bins. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.imshow(processed_example["labels"].T) +>>> plt.show() +``` + +
+ Log-mel spectrogram with 80 mel bins +
+ +Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies +at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library, +the y-axis is flipped and the spectrograms appear upside down. + +Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes. + +```py +>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) +``` + +You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). +Remove those examples from the dataset. Here we go even further: to allow for larger batch sizes, we remove anything over 200 tokens. + +```py +>>> def is_not_too_long(input_ids): +...     input_length = len(input_ids) +...     return input_length < 200 + + +>>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"]) +>>> len(dataset) +8259 +``` + +Next, create a basic train/test split: + +```py +>>> dataset = dataset.train_test_split(test_size=0.1) +``` + +### Data collator + +To combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding +tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value +instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss. + +```py +>>> from dataclasses import dataclass +>>> from typing import Any, Dict, List, Union + + +>>> @dataclass +... class TTSDataCollatorWithPadding: +...     processor: Any + +...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features] +...         label_features = [{"input_values": feature["labels"]} for feature in features] +...         speaker_features = [feature["speaker_embeddings"] for feature in features] + +...         # collate the inputs and targets into a batch +...         batch = self.processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt") + +...         # replace padding with -100 to ignore loss correctly +...         batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100) + +...         # not used during fine-tuning +...         del batch["decoder_attention_mask"] + +...         # round down target lengths to multiple of reduction factor +...         if model.config.reduction_factor > 1: +...             target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features]) +...             target_lengths = target_lengths.new( +...                 [length - length % model.config.reduction_factor for length in target_lengths] +...             ) +...             max_length = max(target_lengths) +...             batch["labels"] = batch["labels"][:, :max_length] + +...         # also add in the speaker embeddings +...         batch["speaker_embeddings"] = torch.tensor(speaker_features) + +...         return batch +``` + +In SpeechT5, the input to the decoder part of the model is reduced by a factor of 2. In other words, it throws away every +other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original +target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a +multiple of 2. 
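+ +To make this rounding step concrete, here is a small standalone sketch (not part of the original guide) that uses a few +made-up spectrogram lengths and an assumed reduction factor of 2. The collator above applies the same arithmetic with +`model.config.reduction_factor` before truncating `batch["labels"]` to the longest rounded length: + +```py +>>> import torch + +>>> reduction_factor = 2  # assumed value for illustration only +>>> target_lengths = torch.tensor([219, 184, 201])  # hypothetical spectrogram lengths in frames + +>>> # drop the remainder so every length becomes a multiple of the reduction factor +>>> target_lengths - target_lengths % reduction_factor +tensor([218, 184, 200]) +```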
+ +```py +>>> data_collator = TTSDataCollatorWithPadding(processor=processor) +``` + +## Train the model + +Load the pre-trained model from the same checkpoint as you used for loading the processor: + +```py +>>> from transformers import SpeechT5ForTextToSpeech + +>>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint) +``` + +The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training. + +```py +>>> model.config.use_cache = False +``` + +Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll +only look at the loss: + +```python +>>> from transformers import Seq2SeqTrainingArguments + +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="speecht5_finetuned_voxpopuli_nl", # change to a repo name of your choice +... per_device_train_batch_size=4, +... gradient_accumulation_steps=8, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=4000, +... gradient_checkpointing=True, +... fp16=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=2, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... report_to=["tensorboard"], +... load_best_model_at_end=True, +... greater_is_better=False, +... label_names=["labels"], +... push_to_hub=True, +... ) +``` + +Instantiate the `Trainer` object and pass the model, dataset, and data collator to it. + +```py +>>> from transformers import Seq2SeqTrainer + +>>> trainer = Seq2SeqTrainer( +... args=training_args, +... model=model, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... data_collator=data_collator, +... tokenizer=processor, +... ) +``` + +And with that, you're ready to start training! Training will take several hours. Depending on your GPU, +it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce +the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate. + +```py +>>> trainer.train() +``` + +To be able to use your checkpoint with a pipeline, make sure to save the processor with the checkpoint: + +```py +>>> processor.save_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") +``` + +Push the final model to the 🤗 Hub: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +### Inference with a pipeline + +Great, now that you've fine-tuned a model, you can use it for inference! +First, let's see how you can use it with a corresponding pipeline. Let's create a `"text-to-speech"` pipeline with your +checkpoint: + +```py +>>> from transformers import pipeline + +>>> pipe = pipeline("text-to-speech", model="YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") +``` + +Pick a piece of text in Dutch you'd like narrated, e.g.: + +```py +>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" +``` + +To use SpeechT5 with the pipeline, you'll need a speaker embedding. 
Let's get it from an example in the test dataset: + +```py +>>> example = dataset["test"][304] +>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) +``` + +Now you can pass the text and speaker embeddings to the pipeline, and it will take care of the rest: + +```py +>>> forward_params = {"speaker_embeddings": speaker_embeddings} +>>> output = pipe(text, forward_params=forward_params) +>>> output +{'audio': array([-6.82714235e-05, -4.26525949e-04, 1.06134125e-04, ..., +        -1.22392643e-03, -7.76011671e-04, 3.29112721e-04], dtype=float32), + 'sampling_rate': 16000} +``` + +You can then listen to the result: + +```py +>>> from IPython.display import Audio +>>> Audio(output['audio'], rate=output['sampling_rate']) +``` + +### Run inference manually + +You can achieve the same inference results without using the pipeline; however, more steps will be required. + +Load the model from the 🤗 Hub: + +```py +>>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") +``` + +Pick an example from the test dataset to obtain a speaker embedding. + +```py +>>> example = dataset["test"][304] +>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) +``` + +Define the input text and tokenize it. + +```py +>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" +>>> inputs = processor(text=text, return_tensors="pt") +``` + +Create a spectrogram with your model: + +```py +>>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) +``` + +Visualize the spectrogram, if you'd like to: + +```py +>>> plt.figure() +>>> plt.imshow(spectrogram.T) +>>> plt.show() +``` + +
+ Generated log-mel spectrogram +
+ +Finally, use the vocoder to turn the spectrogram into sound. + +```py +>>> with torch.no_grad(): +...     speech = vocoder(spectrogram) + +>>> from IPython.display import Audio + +>>> Audio(speech.numpy(), rate=16000) +``` + +In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker +embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best +when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding. + +Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech is clearly Dutch rather than English, and it does +capture the voice characteristics of the speaker (compare to the original audio in the example). +Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to +see if this improves the results. + +Finally, it is essential to consider the ethical implications. Although TTS technology has numerous useful applications, it +may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please +use TTS judiciously and responsibly. \ No newline at end of file diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md new file mode 100644 index 000000000000..289f2b05896a --- /dev/null +++ b/docs/source/en/tasks/token_classification.md @@ -0,0 +1,562 @@ + + +# Token classification + +[[open-in-colab]] + + + +Token classification assigns a label to individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization. + +This guide will show you how to: + +1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [WNUT 17](https://huggingface.co/datasets/wnut_17) dataset to detect new entities. +2. Use your finetuned model for inference. 
+ + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [BROS](../model_doc/bros), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate seqeval +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load WNUT 17 dataset + +Start by loading the WNUT 17 dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> wnut = load_dataset("wnut_17") +``` + +Then take a look at an example: + +```py +>>> wnut["train"][0] +{'id': '0', + 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], + 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] +} +``` + +Each number in `ner_tags` represents an entity. Convert the numbers to their label names to find out what the entities are: + +```py +>>> label_list = wnut["train"].features[f"ner_tags"].feature.names +>>> label_list +[ + "O", + "B-corporation", + "I-corporation", + "B-creative-work", + "I-creative-work", + "B-group", + "I-group", + "B-location", + "I-location", + "B-person", + "I-person", + "B-product", + "I-product", +] +``` + +The letter that prefixes each `ner_tag` indicates the token position of the entity: + +- `B-` indicates the beginning of an entity. 
+- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like + `Empire State Building`). +- `0` indicates the token doesn't correspond to any entity. + +## Preprocess + + + +The next step is to load a DistilBERT tokenizer to preprocess the `tokens` field: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +As you saw in the example `tokens` field above, it looks like the input has already been tokenized. But the input actually hasn't been tokenized yet and you'll need to set `is_split_into_words=True` to tokenize the words into subwords. For example: + +```py +>>> example = wnut["train"][0] +>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) +>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) +>>> tokens +['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] +``` + +However, this adds some special tokens `[CLS]` and `[SEP]` and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by: + +1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids) method. +2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function (see [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)). +3. Only labeling the first token of a given word. Assign `-100` to other subtokens from the same word. + +Here is how you can create a function to realign the tokens and labels, and truncate sequences to be no longer than DistilBERT's maximum input length: + +```py +>>> def tokenize_and_align_labels(examples): +... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) + +... labels = [] +... for i, label in enumerate(examples[f"ner_tags"]): +... word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word. +... previous_word_idx = None +... label_ids = [] +... for word_idx in word_ids: # Set the special tokens to -100. +... if word_idx is None: +... label_ids.append(-100) +... elif word_idx != previous_word_idx: # Only label the first token of a given word. +... label_ids.append(label[word_idx]) +... else: +... label_ids.append(-100) +... previous_word_idx = word_idx +... labels.append(label_ids) + +... tokenized_inputs["labels"] = labels +... return tokenized_inputs +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) +``` + +Now create a batch of examples using [`DataCollatorWithPadding`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. 
+ + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) framework (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric). Seqeval actually produces several scores: precision, recall, F1, and accuracy. + +```py +>>> import evaluate + +>>> seqeval = evaluate.load("seqeval") +``` + +Get the NER labels first, and then create a function that passes your true predictions and true labels to [`~evaluate.EvaluationModule.compute`] to calculate the scores: + +```py +>>> import numpy as np + +>>> labels = [label_list[i] for i in example[f"ner_tags"]] + + +>>> def compute_metrics(p): +... predictions, labels = p +... predictions = np.argmax(predictions, axis=2) + +... true_predictions = [ +... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] +... true_labels = [ +... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] + +... results = seqeval.compute(predictions=true_predictions, references=true_labels) +... return { +... "precision": results["overall_precision"], +... "recall": results["overall_recall"], +... "f1": results["overall_f1"], +... "accuracy": results["overall_accuracy"], +... } +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + +Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: + +```py +>>> id2label = { +... 0: "O", +... 1: "B-corporation", +... 2: "I-corporation", +... 3: "B-creative-work", +... 4: "I-creative-work", +... 5: "B-group", +... 6: "I-group", +... 7: "B-location", +... 8: "I-location", +... 9: "B-person", +... 10: "I-person", +... 11: "B-product", +... 12: "I-product", +... } +>>> label2id = { +... "O": 0, +... "B-corporation": 1, +... "I-corporation": 2, +... "B-creative-work": 3, +... "I-creative-work": 4, +... "B-group": 5, +... "I-group": 6, +... "B-location": 7, +... "I-location": 8, +... "B-person": 9, +... "I-person": 10, +... "B-product": 11, +... "I-product": 12, +... } +``` + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! Load DistilBERT with [`AutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer + +>>> model = AutoModelForTokenClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]. 
The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the seqeval scores and save the training checkpoint. +2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_wnut_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_wnut["train"], +... eval_dataset=tokenized_wnut["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 3 +>>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=2e-5, +... num_train_steps=num_train_steps, +... weight_decay_rate=0.01, +... num_warmup_steps=0, +... ) +``` + +Then you can load DistilBERT with [`TFAutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained( +... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_wnut["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_wnut["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the seqeval scores from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). 
+ +Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_wnut_model", +... tokenizer=tokenizer, +... ) +``` + +Then bundle your callbacks together: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! + + + + + +For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) +or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). + + + +## Inference + +Great, now that you've finetuned a model, you can use it for inference! + +Grab some text you'd like to run inference on: + +```py +>>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." +``` + +The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for NER with your model, and pass your text to it: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") +>>> classifier(text) +[{'entity': 'B-location', + 'score': 0.42658573, + 'index': 2, + 'word': 'golden', + 'start': 4, + 'end': 10}, + {'entity': 'I-location', + 'score': 0.35856336, + 'index': 3, + 'word': 'state', + 'start': 11, + 'end': 16}, + {'entity': 'B-group', + 'score': 0.3064001, + 'index': 4, + 'word': 'warriors', + 'start': 17, + 'end': 25}, + {'entity': 'B-location', + 'score': 0.65523505, + 'index': 13, + 'word': 'san', + 'start': 80, + 'end': 83}, + {'entity': 'B-location', + 'score': 0.4668663, + 'index': 14, + 'word': 'francisco', + 'start': 84, + 'end': 93}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like: + + + +Tokenize the text and return PyTorch tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predictions = torch.argmax(logits, dim=2) +>>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + +Tokenize the text and return TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +Pass your inputs to the model and return the `logits`: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> logits = model(**inputs).logits +``` + +Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: + +```py +>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) +>>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + diff --git a/docs/source/en/tasks/token_classification.mdx b/docs/source/en/tasks/token_classification.mdx deleted file mode 100644 index 8c7ceac48f46..000000000000 --- a/docs/source/en/tasks/token_classification.mdx +++ /dev/null @@ -1,552 +0,0 @@ - - -# Token classification - -[[open-in-colab]] - - - -Token classification assigns a label to individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization. - -This guide will show you how to: - -1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [WNUT 17](https://huggingface.co/datasets/wnut_17) dataset to detect new entities. -2. Use your finetuned model for inference. - - - -See the token classification [task page](https://huggingface.co/tasks/token-classification) for more information about other forms of token classification and their associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load WNUT 17 dataset - -Start by loading the WNUT 17 dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> wnut = load_dataset("wnut_17") -``` - -Then take a look at an example: - -```py ->>> wnut["train"][0] -{'id': '0', - 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], - 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] -} -``` - -Each number in `ner_tags` represents an entity. Convert the numbers to their label names to find out what the entities are: - -```py ->>> label_list = wnut["train"].features[f"ner_tags"].feature.names ->>> label_list -[ - "O", - "B-corporation", - "I-corporation", - "B-creative-work", - "I-creative-work", - "B-group", - "I-group", - "B-location", - "I-location", - "B-person", - "I-person", - "B-product", - "I-product", -] -``` - -The letter that prefixes each `ner_tag` indicates the token position of the entity: - -- `B-` indicates the beginning of an entity. -- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like - `Empire State Building`). -- `0` indicates the token doesn't correspond to any entity. - -## Preprocess - - - -The next step is to load a DistilBERT tokenizer to preprocess the `tokens` field: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -As you saw in the example `tokens` field above, it looks like the input has already been tokenized. But the input actually hasn't been tokenized yet and you'll need to set `is_split_into_words=True` to tokenize the words into subwords. For example: - -```py ->>> example = wnut["train"][0] ->>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) ->>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) ->>> tokens -['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] -``` - -However, this adds some special tokens `[CLS]` and `[SEP]` and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by: - -1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.word_ids) method. -2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function. -3. Only labeling the first token of a given word. Assign `-100` to other subtokens from the same word. - -Here is how you can create a function to realign the tokens and labels, and truncate sequences to be no longer than DistilBERT's maximum input length: - -```py ->>> def tokenize_and_align_labels(examples): -... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) - -... labels = [] -... for i, label in enumerate(examples[f"ner_tags"]): -... 
word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word. -... previous_word_idx = None -... label_ids = [] -... for word_idx in word_ids: # Set the special tokens to -100. -... if word_idx is None: -... label_ids.append(-100) -... elif word_idx != previous_word_idx: # Only label the first token of a given word. -... label_ids.append(label[word_idx]) -... else: -... label_ids.append(-100) -... previous_word_idx = word_idx -... labels.append(label_ids) - -... tokenized_inputs["labels"] = labels -... return tokenized_inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py ->>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) -``` - -Now create a batch of examples using [`DataCollatorWithPadding`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorForTokenClassification - ->>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) -``` - - -```py ->>> from transformers import DataCollatorForTokenClassification - ->>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) framework (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric). Seqeval actually produces several scores: precision, recall, F1, and accuracy. - -```py ->>> import evaluate - ->>> seqeval = evaluate.load("seqeval") -``` - -Get the NER labels first, and then create a function that passes your true predictions and true labels to [`~evaluate.EvaluationModule.compute`] to calculate the scores: - -```py ->>> import numpy as np - ->>> labels = [label_list[i] for i in example[f"ner_tags"]] - - ->>> def compute_metrics(p): -... predictions, labels = p -... predictions = np.argmax(predictions, axis=2) - -... true_predictions = [ -... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] -... for prediction, label in zip(predictions, labels) -... ] -... true_labels = [ -... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] -... for prediction, label in zip(predictions, labels) -... ] - -... results = seqeval.compute(predictions=true_predictions, references=true_labels) -... return { -... "precision": results["overall_precision"], -... "recall": results["overall_recall"], -... "f1": results["overall_f1"], -... "accuracy": results["overall_accuracy"], -... } -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - -Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`: - -```py ->>> id2label = { -... 0: "O", -... 1: "B-corporation", -... 2: "I-corporation", -... 3: "B-creative-work", -... 4: "I-creative-work", -... 5: "B-group", -... 6: "I-group", -... 7: "B-location", -... 8: "I-location", -... 
9: "B-person", -... 10: "I-person", -... 11: "B-product", -... 12: "I-product", -... } ->>> label2id = { -... "O": 0, -... "B-corporation": 1, -... "I-corporation": 2, -... "B-creative-work": 3, -... "I-creative-work": 4, -... "B-group": 5, -... "I-group": 6, -... "B-location": 7, -... "I-location": 8, -... "B-person": 9, -... "I-person": 10, -... "B-product": 11, -... "I-product": 12, -... } -``` - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load DistilBERT with [`AutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer - ->>> model = AutoModelForTokenClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id -... ) -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the seqeval scores and save the training checkpoint. -2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_wnut_model", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=2, -... weight_decay=0.01, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... load_best_model_at_end=True, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_wnut["train"], -... eval_dataset=tokenized_wnut["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_train_epochs = 3 ->>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs ->>> optimizer, lr_schedule = create_optimizer( -... init_lr=2e-5, -... num_train_steps=num_train_steps, -... weight_decay_rate=0.01, -... num_warmup_steps=0, -... ) -``` - -Then you can load DistilBERT with [`TFAutoModelForTokenClassification`] along with the number of expected labels, and the label mappings: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained( -... "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id -... 
) -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_wnut["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_wnut["validation"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the seqeval scores from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_wnut_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Grab some text you'd like to run inference on: - -```py ->>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for NER with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") ->>> classifier(text) -[{'entity': 'B-location', - 'score': 0.42658573, - 'index': 2, - 'word': 'golden', - 'start': 4, - 'end': 10}, - {'entity': 'I-location', - 'score': 0.35856336, - 'index': 3, - 'word': 'state', - 'start': 11, - 'end': 16}, - {'entity': 'B-group', - 'score': 0.3064001, - 'index': 4, - 'word': 'warriors', - 'start': 17, - 'end': 25}, - {'entity': 'B-location', - 'score': 0.65523505, - 'index': 13, - 'word': 'san', - 'start': 80, - 'end': 83}, - {'entity': 'B-location', - 'score': 0.4668663, - 'index': 14, - 'word': 'francisco', - 'start': 84, - 'end': 93}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> inputs = tokenizer(text, return_tensors="pt") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predictions = torch.argmax(logits, dim=2) ->>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] ->>> predicted_token_class -['O', - 'O', - 'B-location', - 'I-location', - 'B-group', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'B-location', - 'B-location', - 'O', - 'O'] -``` - - -Tokenize the text and return TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> inputs = tokenizer(text, return_tensors="tf") -``` - -Pass your inputs to the model and return the `logits`: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") ->>> logits = model(**inputs).logits -``` - -Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label: - -```py ->>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) ->>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] ->>> predicted_token_class -['O', - 'O', - 'B-location', - 'I-location', - 'B-group', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'B-location', - 'B-location', - 'O', - 'O'] -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md new file mode 100644 index 000000000000..d5394caef838 --- /dev/null +++ b/docs/source/en/tasks/translation.md @@ -0,0 +1,411 @@ + + +# Translation + +[[open-in-colab]] + + + +Translation converts a sequence of text from one language to another. It is one of several tasks you can formulate as a sequence-to-sequence problem, a powerful framework for returning some output from an input, like translation or summarization. 
Translation systems are commonly used for translation between different language texts, but it can also be used for speech or some combination in between like text-to-speech or speech-to-text. + +This guide will show you how to: + +1. Finetune [T5](https://huggingface.co/t5-small) on the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset to translate English text to French. +2. Use your finetuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install transformers datasets evaluate sacrebleu +``` + +We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load OPUS Books dataset + +Start by loading the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset from the 🤗 Datasets library: + +```py +>>> from datasets import load_dataset + +>>> books = load_dataset("opus_books", "en-fr") +``` + +Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: + +```py +>>> books = books["train"].train_test_split(test_size=0.2) +``` + +Then take a look at an example: + +```py +>>> books["train"][0] +{'id': '90560', + 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', + 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} +``` + +`translation`: an English and French translation of the text. + +## Preprocess + + + +The next step is to load a T5 tokenizer to process the English-French language pairs: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +The preprocessing function you want to create needs to: + +1. Prefix the input with a prompt so T5 knows this is a translation task. Some models capable of multiple NLP tasks require prompting for specific tasks. +2. Tokenize the input (English) and target (French) separately because you can't tokenize French text with a tokenizer pretrained on an English vocabulary. +3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. 
+ +```py +>>> source_lang = "en" +>>> target_lang = "fr" +>>> prefix = "translate English to French: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + example[source_lang] for example in examples["translation"]] +... targets = [example[target_lang] for example in examples["translation"]] +... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) +... return model_inputs +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: + +```py +>>> tokenized_books = books.map(preprocess_function, batched=True) +``` + +Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## Evaluate + +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): + +```py +>>> import evaluate + +>>> metric = evaluate.load("sacrebleu") +``` + +Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the SacreBLEU score: + +```py +>>> import numpy as np + + +>>> def postprocess_text(preds, labels): +... preds = [pred.strip() for pred in preds] +... labels = [[label.strip()] for label in labels] + +... return preds, labels + + +>>> def compute_metrics(eval_preds): +... preds, labels = eval_preds +... if isinstance(preds, tuple): +... preds = preds[0] +... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + +... result = metric.compute(predictions=decoded_preds, references=decoded_labels) +... result = {"bleu": result["score"]} + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] +... result["gen_len"] = np.mean(prediction_lens) +... result = {k: round(v, 4) for k, v in result.items()} +... return result +``` + +Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. + +## Train + + + + + +If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! + + + +You're ready to start training your model now! 
Load T5 with [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the SacreBLEU metric and save the training checkpoint. +2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_opus_books_model", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=2, +... predict_with_generate=True, +... fp16=True, +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_books["train"], +... eval_dataset=tokenized_books["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +```` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + + + + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! + + +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +```py +>>> from transformers import AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_books["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_books["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +The last two things to setup before you start training is to compute the SacreBLEU metric from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](../main_classes/keras_callbacks). 
+
+Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
+```
+
+Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="my_awesome_opus_books_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+Then bundle your callbacks together:
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model:
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
+```
+
+Once training is completed, your model is automatically uploaded to the Hub so everyone can use it!
+
+
+
+
+
+For a more in-depth example of how to finetune a model for translation, take a look at the corresponding
+[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)
+or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb).
+
+
+
+## Inference
+
+Great, now that you've finetuned a model, you can use it for inference!
+
+Come up with some text you'd like to translate to another language. For T5, you need to prefix your input depending on the task you're working on. For translation from English to French, you should prefix your input as shown below:
+
+```py
+>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
+```
+
+The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for translation with your model, and pass your text to it:
+
+```py
+>>> from transformers import pipeline
+
+>>> translator = pipeline("translation", model="my_awesome_opus_books_model")
+>>> translator(text)
+[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}]
+```
+
+You can also manually replicate the results of the `pipeline` if you'd like:
+
+
+
+Tokenize the text and return the `input_ids` as PyTorch tensors:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
+>>> inputs = tokenizer(text, return_tensors="pt").input_ids
+```
+
+Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API.
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
+>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
+```
+
+Decode the generated token ids back into text:
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+"Les lignées partagent des ressources avec des bactéries enfixant l'azote."
+``` + + +Tokenize the text and return the `input_ids` as TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +Decode the generated token ids back into text: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' +``` + + diff --git a/docs/source/en/tasks/translation.mdx b/docs/source/en/tasks/translation.mdx deleted file mode 100644 index 318cb2b1a9d2..000000000000 --- a/docs/source/en/tasks/translation.mdx +++ /dev/null @@ -1,398 +0,0 @@ - - -# Translation - - - -Translation converts a sequence of text from one language to another. It is one of several tasks you can formulate as a sequence-to-sequence problem, a powerful framework for returning some output from an input, like translation or summarization. Translation systems are commonly used for translation between different language texts, but it can also be used for speech or some combination in between like text-to-speech or speech-to-text. - -This guide will show you how to: - -1. Finetune [T5](https://huggingface.co/t5-small) on the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset to translate English text to French. -2. Use your finetuned model for inference. - - - -See the translation [task page](https://huggingface.co/tasks/translation) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install transformers datasets evaluate -``` - -We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load OPUS Books dataset - -Start by loading the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset from the 🤗 Datasets library: - -```py ->>> from datasets import load_dataset - ->>> books = load_dataset("opus_books", "en-fr") -``` - -Split the dataset into a train and test set with the [`~datasets.Dataset.train_test_split`] method: - -```py ->>> books = books["train"].train_test_split(test_size=0.2) -``` - -Then take a look at an example: - -```py ->>> books["train"][0] -{'id': '90560', - 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', - 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} -``` - -`translation`: an English and French translation of the text. 
- -## Preprocess - - - -The next step is to load a T5 tokenizer to process the English-French language pairs: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -The preprocessing function you want to create needs to: - -1. Prefix the input with a prompt so T5 knows this is a translation task. Some models capable of multiple NLP tasks require prompting for specific tasks. -2. Tokenize the input (English) and target (French) separately because you can't tokenize French text with a tokenizer pretrained on an English vocabulary. -3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. - -```py ->>> source_lang = "en" ->>> target_lang = "fr" ->>> prefix = "translate English to French: " - - ->>> def preprocess_function(examples): -... inputs = [prefix + example[source_lang] for example in examples["translation"]] -... targets = [example[target_lang] for example in examples["translation"]] -... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) -... return model_inputs -``` - -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once: - -```py ->>> tokenized_books = books.map(preprocess_function, batched=True) -``` - -Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximium length. - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) -``` - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") -``` - - - -## Evaluate - -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): - -```py ->>> import evaluate - ->>> sacrebleu = evaluate.load("sacrebleu") -``` - -Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the SacreBLEU score: - -```py ->>> import numpy as np - - ->>> def postprocess_text(preds, labels): -... preds = [pred.strip() for pred in preds] -... labels = [[label.strip()] for label in labels] - -... return preds, labels - - ->>> def compute_metrics(eval_preds): -... preds, labels = eval_preds -... if isinstance(preds, tuple): -... preds = preds[0] -... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - -... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) -... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - -... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - -... result = metric.compute(predictions=decoded_preds, references=decoded_labels) -... result = {"bleu": result["score"]} - -... 
prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] -... result["gen_len"] = np.mean(prediction_lens) -... result = {k: round(v, 4) for k, v in result.items()} -... return result -``` - -Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training. - -## Train - - - - - -If you aren't familiar with finetuning a model with the [`Trainer`], take a look at the basic tutorial [here](../training#train-with-pytorch-trainer)! - - -You're ready to start training your model now! Load T5 with [`AutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -At this point, only three steps remain: - -1. Define your training hyperparameters in [`Seq2SeqTrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the SacreBLEU metric and save the training checkpoint. -2. Pass the training arguments to [`Seq2SeqTrainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. - -```py ->>> training_args = Seq2SeqTrainingArguments( -... output_dir="my_awesome_opus_books_model", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... weight_decay=0.01, -... save_total_limit=3, -... num_train_epochs=2, -... predict_with_generate=True, -... fp16=True, -... push_to_hub=True, -... ) - ->>> trainer = Seq2SeqTrainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_books["train"], -... eval_dataset=tokenized_books["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... ) - ->>> trainer.train() -```` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - - - - -If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial [here](../training#train-a-tensorflow-model-with-keras)! - - -To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: - -```py ->>> from transformers import AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_books["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = model.prepare_tf_dataset( -... tokenized_books["test"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... 
) -``` - -Configure the model for training with [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -The last two things to setup before you start training is to compute the SacreBLEU metric from the predictions, and provide a way to push your model to the Hub. Both are done by using [Keras callbacks](./main_classes/keras_callbacks). - -Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: - -```py ->>> from transformers.keras_callbacks import KerasMetricCallback - ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) -``` - -Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: - -```py ->>> from transformers.keras_callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="my_awesome_opus_books_model", -... tokenizer=tokenizer, -... ) -``` - -Then bundle your callbacks together: - -```py ->>> callbacks = [metric_callback, push_to_hub_callback] -``` - -Finally, you're ready to start training your model! Call [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) -``` - -Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! - - - - - -For a more in-depth example of how to finetune a model for translation, take a look at the corresponding -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) -or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). - - - -## Inference - -Great, now that you've finetuned a model, you can use it for inference! - -Come up with some text you'd like to translate to another language. For T5, you need to prefix your input depending on the task you're working on. For translation from English to French, you should prefix your input as shown below: - -```py ->>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria." -``` - -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for translation with your model, and pass your text to it: - -```py ->>> from transformers import pipeline - ->>> translator = pipeline("translation", model="my_awesome_opus_books_model") ->>> translator(text) -[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like: - - - -Tokenize the text and return the `input_ids` as PyTorch tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") ->>> inputs = tokenizer(text, return_tensors="pt").input_ids -``` - -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. 
- -```py ->>> from transformers import AutoModelForSeq2SeqLM - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") ->>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'Les lignées partagent des ressources avec des bactéries enfixant l'azote.' -``` - - -Tokenize the text and return the `input_ids` as TensorFlow tensors: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") ->>> inputs = tokenizer(text, return_tensors="tf").input_ids -``` - -Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](./main_classes/text_generation) API. - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") ->>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) -``` - -Decode the generated token ids back into text: - -```py ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' -``` - - \ No newline at end of file diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md new file mode 100644 index 000000000000..a140ba373099 --- /dev/null +++ b/docs/source/en/tasks/video_classification.md @@ -0,0 +1,496 @@ + + +# Video classification + +[[open-in-colab]] + +Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to. These models can be used to categorize what a video is all about. A real-world application of video classification is action / activity recognition, which is useful for fitness applications. It is also helpful for vision-impaired individuals, especially when they are commuting. + +This guide will show you how to: + +1. Fine-tune [VideoMAE](https://huggingface.co/docs/transformers/main/en/model_doc/videomae) on a subset of the [UCF101](https://www.crcv.ucf.edu/data/UCF101.php) dataset. +2. Use your fine-tuned model for inference. + + +The task illustrated in this tutorial is supported by the following model architectures: + + + +[TimeSformer](../model_doc/timesformer), [VideoMAE](../model_doc/videomae), [ViViT](../model_doc/vivit) + + + + + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q pytorchvideo transformers evaluate +``` + +You will use [PyTorchVideo](https://pytorchvideo.org/) (dubbed `pytorchvideo`) to process and prepare the videos. + +We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load UCF101 dataset + +Start by loading a subset of the [UCF-101 dataset](https://www.crcv.ucf.edu/data/UCF101.php). This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset. 
+ +```py +>>> from huggingface_hub import hf_hub_download + +>>> hf_dataset_identifier = "sayakpaul/ucf101-subset" +>>> filename = "UCF101_subset.tar.gz" +>>> file_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset") +``` + +After the subset has been downloaded, you need to extract the compressed archive: + +```py +>>> import tarfile + +>>> with tarfile.open(file_path) as t: +... t.extractall(".") +``` + +At a high level, the dataset is organized like so: + +```bash +UCF101_subset/ + train/ + BandMarching/ + video_1.mp4 + video_2.mp4 + ... + Archery + video_1.mp4 + video_2.mp4 + ... + ... + val/ + BandMarching/ + video_1.mp4 + video_2.mp4 + ... + Archery + video_1.mp4 + video_2.mp4 + ... + ... + test/ + BandMarching/ + video_1.mp4 + video_2.mp4 + ... + Archery + video_1.mp4 + video_2.mp4 + ... + ... +``` + +The (`sorted`) video paths appear like so: + +```bash +... +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c04.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c06.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c02.avi', +'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c06.avi' +... +``` + +You will notice that there are video clips belonging to the same group / scene where group is denoted by `g` in the video file paths. `v_ApplyEyeMakeup_g07_c04.avi` and `v_ApplyEyeMakeup_g07_c06.avi`, for example. + +For the validation and evaluation splits, you wouldn't want to have video clips from the same group / scene to prevent [data leakage](https://www.kaggle.com/code/alexisbcook/data-leakage). The subset that you are using in this tutorial takes this information into account. + +Next up, you will derive the set of labels present in the dataset. Also, create two dictionaries that'll be helpful when initializing the model: + +* `label2id`: maps the class names to integers. +* `id2label`: maps the integers to class names. + +```py +>>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) +>>> label2id = {label: i for i, label in enumerate(class_labels)} +>>> id2label = {i: label for label, i in label2id.items()} + +>>> print(f"Unique classes: {list(label2id.keys())}.") + +# Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress']. +``` + +There are 10 unique classes. For each class, there are 30 videos in the training set. + +## Load a model to fine-tune + +Instantiate a video classification model from a pretrained checkpoint and its associated image processor. The model's encoder comes with pre-trained parameters, and the classification head is randomly initialized. The image processor will come in handy when writing the preprocessing pipeline for our dataset. + +```py +>>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification + +>>> model_ckpt = "MCG-NJU/videomae-base" +>>> image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt) +>>> model = VideoMAEForVideoClassification.from_pretrained( +... model_ckpt, +... label2id=label2id, +... id2label=id2label, +... ignore_mismatched_sizes=True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint +... 
) +``` + +While the model is loading, you might notice the following warning: + +```bash +Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: [..., 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.2.attention.attention.key.weight'] +- This IS expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). +- This IS NOT expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). +Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +``` + +The warning is telling us we are throwing away some weights (e.g. the weights and bias of the `classifier` layer) and randomly initializing some others (the weights and bias of a new `classifier` layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do. + +**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`. + +## Prepare the datasets for training + +For preprocessing the videos, you will leverage the [PyTorchVideo library](https://pytorchvideo.org/). Start by importing the dependencies we need. + +```py +>>> import pytorchvideo.data + +>>> from pytorchvideo.transforms import ( +... ApplyTransformToKey, +... Normalize, +... RandomShortSideScale, +... RemoveKey, +... ShortSideScale, +... UniformTemporalSubsample, +... ) + +>>> from torchvision.transforms import ( +... Compose, +... Lambda, +... RandomCrop, +... RandomHorizontalFlip, +... Resize, +... ) +``` + +For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. To learn more about the details of these transformations check out the [official documentation of PyTorchVideo](https://pytorchvideo.org). + +Use the `image_processor` associated with the pre-trained model to obtain the following information: + +* Image mean and standard deviation with which the video frame pixels will be normalized. +* Spatial resolution to which the video frames will be resized. + +Start by defining some constants. + +```py +>>> mean = image_processor.image_mean +>>> std = image_processor.image_std +>>> if "shortest_edge" in image_processor.size: +... height = width = image_processor.size["shortest_edge"] +>>> else: +... 
height = image_processor.size["height"] +... width = image_processor.size["width"] +>>> resize_to = (height, width) + +>>> num_frames_to_sample = model.config.num_frames +>>> sample_rate = 4 +>>> fps = 30 +>>> clip_duration = num_frames_to_sample * sample_rate / fps +``` + +Now, define the dataset-specific transformations and the datasets respectively. Starting with the training set: + +```py +>>> train_transform = Compose( +... [ +... ApplyTransformToKey( +... key="video", +... transform=Compose( +... [ +... UniformTemporalSubsample(num_frames_to_sample), +... Lambda(lambda x: x / 255.0), +... Normalize(mean, std), +... RandomShortSideScale(min_size=256, max_size=320), +... RandomCrop(resize_to), +... RandomHorizontalFlip(p=0.5), +... ] +... ), +... ), +... ] +... ) + +>>> train_dataset = pytorchvideo.data.Ucf101( +... data_path=os.path.join(dataset_root_path, "train"), +... clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration), +... decode_audio=False, +... transform=train_transform, +... ) +``` + +The same sequence of workflow can be applied to the validation and evaluation sets: + +```py +>>> val_transform = Compose( +... [ +... ApplyTransformToKey( +... key="video", +... transform=Compose( +... [ +... UniformTemporalSubsample(num_frames_to_sample), +... Lambda(lambda x: x / 255.0), +... Normalize(mean, std), +... Resize(resize_to), +... ] +... ), +... ), +... ] +... ) + +>>> val_dataset = pytorchvideo.data.Ucf101( +... data_path=os.path.join(dataset_root_path, "val"), +... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), +... decode_audio=False, +... transform=val_transform, +... ) + +>>> test_dataset = pytorchvideo.data.Ucf101( +... data_path=os.path.join(dataset_root_path, "test"), +... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), +... decode_audio=False, +... transform=val_transform, +... ) +``` + +**Note**: The above dataset pipelines are taken from the [official PyTorchVideo example](https://pytorchvideo.org/docs/tutorial_classification#dataset). We're using the [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) function because it's tailored for the UCF-101 dataset. Under the hood, it returns a [`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) object. `LabeledVideoDataset` class is the base class for all things video in the PyTorchVideo dataset. So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the `LabeledVideoDataset` class accordingly. Refer to the `data` API [documentation to](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html) learn more. Also, if your dataset follows a similar structure (as shown above), then using the `pytorchvideo.data.Ucf101()` should work just fine. + +You can access the `num_videos` argument to know the number of videos in the dataset. + +```py +>>> print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos) +# (300, 30, 75) +``` + +## Visualize the preprocessed video for better debugging + +```py +>>> import imageio +>>> import numpy as np +>>> from IPython.display import Image + +>>> def unnormalize_img(img): +... """Un-normalizes the image pixels.""" +... img = (img * std) + mean +... img = (img * 255).astype("uint8") +... 
return img.clip(0, 255) + +>>> def create_gif(video_tensor, filename="sample.gif"): +... """Prepares a GIF from a video tensor. +... +... The video tensor is expected to have the following shape: +... (num_frames, num_channels, height, width). +... """ +... frames = [] +... for video_frame in video_tensor: +... frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy()) +... frames.append(frame_unnormalized) +... kargs = {"duration": 0.25} +... imageio.mimsave(filename, frames, "GIF", **kargs) +... return filename + +>>> def display_gif(video_tensor, gif_name="sample.gif"): +... """Prepares and displays a GIF from a video tensor.""" +... video_tensor = video_tensor.permute(1, 0, 2, 3) +... gif_filename = create_gif(video_tensor, gif_name) +... return Image(filename=gif_filename) + +>>> sample_video = next(iter(train_dataset)) +>>> video_tensor = sample_video["video"] +>>> display_gif(video_tensor) +``` + +
+ Person playing basketball +
+ +## Train the model + +Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) from 🤗 Transformers for training the model. To instantiate a `Trainer`, you need to define the training configuration and an evaluation metric. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to configure the training. It requires an output folder name, which will be used to save the checkpoints of the model. It also helps sync all the information in the model repository on 🤗 Hub. + +Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs). + + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> model_name = model_ckpt.split("/")[-1] +>>> new_model_name = f"{model_name}-finetuned-ucf101-subset" +>>> num_epochs = 4 + +>>> args = TrainingArguments( +... new_model_name, +... remove_unused_columns=False, +... evaluation_strategy="epoch", +... save_strategy="epoch", +... learning_rate=5e-5, +... per_device_train_batch_size=batch_size, +... per_device_eval_batch_size=batch_size, +... warmup_ratio=0.1, +... logging_steps=10, +... load_best_model_at_end=True, +... metric_for_best_model="accuracy", +... push_to_hub=True, +... max_steps=(train_dataset.num_videos // batch_size) * num_epochs, +... ) +``` + +The dataset returned by `pytorchvideo.data.Ucf101()` doesn't implement the `__len__` method. As such, we must define `max_steps` when instantiating `TrainingArguments`. + +Next, you need to define a function to compute the metrics from the predictions, which will use the `metric` you'll load now. The only preprocessing you have to do is to take the argmax of our predicted logits: + +```py +import evaluate + +metric = evaluate.load("accuracy") + + +def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=1) + return metric.compute(predictions=predictions, references=eval_pred.label_ids) +``` + +**A note on evaluation**: + +In the [VideoMAE paper](https://arxiv.org/abs/2203.12602), the authors use the following evaluation strategy. They evaluate the model on several clips from test videos and apply different crops to those clips and report the aggregate score. However, in the interest of simplicity and brevity, we don't consider that in this tutorial. + +Also, define a `collate_fn`, which will be used to batch examples together. Each batch consists of 2 keys, namely `pixel_values` and `labels`. + +```py +>>> def collate_fn(examples): +... # permute to (num_frames, num_channels, height, width) +... pixel_values = torch.stack( +... [example["video"].permute(1, 0, 2, 3) for example in examples] +... ) +... labels = torch.tensor([example["label"] for example in examples]) +... return {"pixel_values": pixel_values, "labels": labels} +``` + +Then you just pass all of this along with the datasets to `Trainer`: + +```py +>>> trainer = Trainer( +... model, +... args, +... train_dataset=train_dataset, +... eval_dataset=val_dataset, +... tokenizer=image_processor, +... 
compute_metrics=compute_metrics, +... data_collator=collate_fn, +... ) +``` + +You might wonder why you passed along the `image_processor` as a tokenizer when you preprocessed the data already. This is only to make sure the image processor configuration file (stored as JSON) will also be uploaded to the repo on the Hub. + +Now fine-tune our model by calling the `train` method: + +```py +>>> train_results = trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +Great, now that you have fine-tuned a model, you can use it for inference! + +Load a video for inference: + +```py +>>> sample_test_video = next(iter(test_dataset)) +``` + +
+ Teams playing basketball +
+ +The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.VideoClassificationPipeline). Instantiate a `pipeline` for video classification with your model, and pass your video to it: + +```py +>>> from transformers import pipeline + +>>> video_cls = pipeline(model="my_awesome_video_cls_model") +>>> video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi") +[{'score': 0.9272987842559814, 'label': 'BasketballDunk'}, + {'score': 0.017777055501937866, 'label': 'BabyCrawling'}, + {'score': 0.01663011871278286, 'label': 'BalanceBeam'}, + {'score': 0.009560945443809032, 'label': 'BandMarching'}, + {'score': 0.0068979403004050255, 'label': 'BaseballPitch'}] +``` + +You can also manually replicate the results of the `pipeline` if you'd like. + + +```py +>>> def run_inference(model, video): +... # (num_frames, num_channels, height, width) +... perumuted_sample_test_video = video.permute(1, 0, 2, 3) +... inputs = { +... "pixel_values": perumuted_sample_test_video.unsqueeze(0), +... "labels": torch.tensor( +... [sample_test_video["label"]] +... ), # this can be skipped if you don't have labels available. +... } + +... device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +... inputs = {k: v.to(device) for k, v in inputs.items()} +... model = model.to(device) + +... # forward pass +... with torch.no_grad(): +... outputs = model(**inputs) +... logits = outputs.logits + +... return logits +``` + +Now, pass your input to the model and return the `logits`: + +``` +>>> logits = run_inference(trained_model, sample_test_video["video"]) +``` + +Decoding the `logits`, we get: + +```py +>>> predicted_class_idx = logits.argmax(-1).item() +>>> print("Predicted class:", model.config.id2label[predicted_class_idx]) +# Predicted class: BasketballDunk +``` \ No newline at end of file diff --git a/docs/source/en/tasks/video_classification.mdx b/docs/source/en/tasks/video_classification.mdx deleted file mode 100644 index 948d4c09a5dc..000000000000 --- a/docs/source/en/tasks/video_classification.mdx +++ /dev/null @@ -1,487 +0,0 @@ - - -# Video classification - -[[open-in-colab]] - -Video classification is the task of assigning a label or class to an entire video. Videos are expected to have only one class for each video. Video classification models take a video as input and return a prediction about which class the video belongs to. These models can be used to categorize what a video is all about. A real-world application of video classification is action / activity recognition, which is useful for fitness applications. It is also helpful for vision-impaired individuals, especially when they are commuting. - -This guide will show you how to: - -1. Fine-tune [VideoMAE](https://huggingface.co/docs/transformers/main/en/model_doc/videomae) on a subset of the [UCF101](https://www.crcv.ucf.edu/data/UCF101.php) dataset. -2. Use your fine-tuned model for inference. - - - -See the video classification [task page](https://huggingface.co/tasks/video-classification) for more information about its associated models, datasets, and metrics. - - - -Before you begin, make sure you have all the necessary libraries installed: - -```bash -pip install -q pytorchvideo transformers evaluate -``` - -You will use [PyTorchVideo](https://pytorchvideo.org/) (dubbed `pytorchvideo`) to process and prepare the videos. 
- -We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Load UCF101 dataset - -Start by loading a subset of the [UCF-101 dataset](https://www.crcv.ucf.edu/data/UCF101.php). This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset. - -```py ->>> from huggingface_hub import hf_hub_download - ->>> hf_dataset_identifier = "sayakpaul/ucf101-subset" ->>> filename = "UCF101_subset.tar.gz" ->>> file_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset") -``` - -After the subset has been downloaded, you need to extract the compressed archive: - -```py ->>> import tarfile - ->>> with tarfile.open(file_path) as t: -... t.extractall(".") -``` - -At a high level, the dataset is organized like so: - -```bash -UCF101_subset/ - train/ - BandMarching/ - video_1.mp4 - video_2.mp4 - ... - Archery - video_1.mp4 - video_2.mp4 - ... - ... - val/ - BandMarching/ - video_1.mp4 - video_2.mp4 - ... - Archery - video_1.mp4 - video_2.mp4 - ... - ... - test/ - BandMarching/ - video_1.mp4 - video_2.mp4 - ... - Archery - video_1.mp4 - video_2.mp4 - ... - ... -``` - -The (`sorted`) video paths appear like so: - -```bash -... -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c04.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c06.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c02.avi', -'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c06.avi' -... -``` - -You will notice that there are video clips belonging to the same group / scene where group is denoted by `g` in the video file paths. `v_ApplyEyeMakeup_g07_c04.avi` and `v_ApplyEyeMakeup_g07_c06.avi`, for example. - -For the validation and evaluation splits, you wouldn't want to have video clips from the same group / scene to prevent [data leakage](https://www.kaggle.com/code/alexisbcook/data-leakage). The subset that you are using in this tutorial takes this information into account. - -Next up, you will derive the set of labels present in the dataset. Also, create two dictionaries that'll be helpful when initializing the model: - -* `label2id`: maps the class names to integers. -* `id2label`: maps the integers to class names. - -```py ->>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths}) ->>> label2id = {label: i for i, label in enumerate(class_labels)} ->>> id2label = {i: label for label, i in label2id.items()} - ->>> print(f"Unique classes: {list(label2id.keys())}.") - -# Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress']. -``` - -There are 10 unique classes. For each class, there are 30 videos in the training set. - -## Load a model to fine-tune - -Instantiate a video classification model from a pretrained checkpoint and its associated image processor. The model's encoder comes with pre-trained parameters, and the classification head is randomly initialized. The image processor will come in handy when writing the preprocessing pipeline for our dataset. 
- -```py ->>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification - ->>> model_ckpt = "MCG-NJU/videomae-base" ->>> image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt) ->>> model = VideoMAEForVideoClassification.from_pretrained( -... model_ckpt, -... label2id=label2id, -... id2label=id2label, -... ignore_mismatched_sizes=True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint -... ) -``` - -While the model is loading, you might notice the following warning: - -```bash -Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: [..., 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.2.attention.attention.key.weight'] -- This IS expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight'] -You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. -``` - -The warning is telling us we are throwing away some weights (e.g. the weights and bias of the `classifier` layer) and randomly initializing some others (the weights and bias of a new `classifier` layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do. - -**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`. - -## Prepare the datasets for training - -For preprocessing the videos, you will leverage the [PyTorchVideo library](https://pytorchvideo.org/). Start by importing the dependencies we need. - -```py ->>> import pytorchvideo.data - ->>> from pytorchvideo.transforms import ( -... ApplyTransformToKey, -... Normalize, -... RandomShortSideScale, -... RemoveKey, -... ShortSideScale, -... UniformTemporalSubsample, -... ) - ->>> from torchvision.transforms import ( -... Compose, -... Lambda, -... RandomCrop, -... RandomHorizontalFlip, -... Resize, -... ) -``` - -For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. To learn more about the details of these transformations check out the [official documentation of PyTorchVideo](https://pytorchvideo.org). 
- -Use the `image_processor` associated with the pre-trained model to obtain the following information: - -* Image mean and standard deviation with which the video frame pixels will be normalized. -* Spatial resolution to which the video frames will be resized. - -Start by defining some constants. - -```py ->>> mean = image_processor.image_mean ->>> std = image_processor.image_std ->>> if "shortest_edge" in image_processor.size: -... height = width = image_processor.size["shortest_edge"] ->>> else: -... height = image_processor.size["height"] -... width = image_processor.size["width"] ->>> resize_to = (height, width) - ->>> num_frames_to_sample = model.config.num_frames ->>> sample_rate = 4 ->>> fps = 30 ->>> clip_duration = num_frames_to_sample * sample_rate / fps -``` - -Now, define the dataset-specific transformations and the datasets respectively. Starting with the training set: - -```py ->>> train_transform = Compose( -... [ -... ApplyTransformToKey( -... key="video", -... transform=Compose( -... [ -... UniformTemporalSubsample(num_frames_to_sample), -... Lambda(lambda x: x / 255.0), -... Normalize(mean, std), -... RandomShortSideScale(min_size=256, max_size=320), -... RandomCrop(resize_to), -... RandomHorizontalFlip(p=0.5), -... ] -... ), -... ), -... ] -... ) - ->>> train_dataset = pytorchvideo.data.Ucf101( -... data_path=os.path.join(dataset_root_path, "train"), -... clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration), -... decode_audio=False, -... transform=train_transform, -... ) -``` - -The same sequence of workflow can be applied to the validation and evaluation sets: - -```py ->>> val_transform = Compose( -... [ -... ApplyTransformToKey( -... key="video", -... transform=Compose( -... [ -... UniformTemporalSubsample(num_frames_to_sample), -... Lambda(lambda x: x / 255.0), -... Normalize(mean, std), -... Resize(resize_to), -... ] -... ), -... ), -... ] -... ) - ->>> val_dataset = pytorchvideo.data.Ucf101( -... data_path=os.path.join(dataset_root_path, "val"), -... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), -... decode_audio=False, -... transform=val_transform, -... ) - ->>> test_dataset = pytorchvideo.data.Ucf101( -... data_path=os.path.join(dataset_root_path, "test"), -... clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), -... decode_audio=False, -... transform=val_transform, -... ) -``` - -**Note**: The above dataset pipelines are taken from the [official PyTorchVideo example](https://pytorchvideo.org/docs/tutorial_classification#dataset). We're using the [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) function because it's tailored for the UCF-101 dataset. Under the hood, it returns a [`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) object. `LabeledVideoDataset` class is the base class for all things video in the PyTorchVideo dataset. So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the `LabeledVideoDataset` class accordingly. Refer to the `data` API [documentation to](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html) learn more. Also, if your dataset follows a similar structure (as shown above), then using the `pytorchvideo.data.Ucf101()` should work just fine. 
- -You can access the `num_videos` argument to know the number of videos in the dataset. - -```py ->>> print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos) -# (300, 30, 75) -``` - -## Visualize the preprocessed video for better debugging - -```py ->>> import imageio ->>> import numpy as np ->>> from IPython.display import Image - ->>> def unnormalize_img(img): -... """Un-normalizes the image pixels.""" -... img = (img * std) + mean -... img = (img * 255).astype("uint8") -... return img.clip(0, 255) - ->>> def create_gif(video_tensor, filename="sample.gif"): -... """Prepares a GIF from a video tensor. -... -... The video tensor is expected to have the following shape: -... (num_frames, num_channels, height, width). -... """ -... frames = [] -... for video_frame in video_tensor: -... frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy()) -... frames.append(frame_unnormalized) -... kargs = {"duration": 0.25} -... imageio.mimsave(filename, frames, "GIF", **kargs) -... return filename - ->>> def display_gif(video_tensor, gif_name="sample.gif"): -... """Prepares and displays a GIF from a video tensor.""" -... video_tensor = video_tensor.permute(1, 0, 2, 3) -... gif_filename = create_gif(video_tensor, gif_name) -... return Image(filename=gif_filename) - ->>> sample_video = next(iter(train_dataset)) ->>> video_tensor = sample_video["video"] ->>> display_gif(video_tensor) -``` - -
- Person playing basketball -
- -## Train the model - -Leverage [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) from 🤗 Transformers for training the model. To instantiate a `Trainer`, you need to define the training configuration and an evaluation metric. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to configure the training. It requires an output folder name, which will be used to save the checkpoints of the model. It also helps sync all the information in the model repository on 🤗 Hub. - -Most of the training arguments are self-explanatory, but one that is quite important here is `remove_unused_columns=False`. This one will drop any features not used by the model's call function. By default it's `True` because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. But, in this case, you need the unused features ('video' in particular) in order to create `pixel_values` (which is a mandatory key our model expects in its inputs). - - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> model_name = model_ckpt.split("/")[-1] ->>> new_model_name = f"{model_name}-finetuned-ucf101-subset" ->>> num_epochs = 4 - ->>> args = TrainingArguments( -... new_model_name, -... remove_unused_columns=False, -... evaluation_strategy="epoch", -... save_strategy="epoch", -... learning_rate=5e-5, -... per_device_train_batch_size=batch_size, -... per_device_eval_batch_size=batch_size, -... warmup_ratio=0.1, -... logging_steps=10, -... load_best_model_at_end=True, -... metric_for_best_model="accuracy", -... push_to_hub=True, -... max_steps=(train_dataset.num_videos // batch_size) * num_epochs, -... ) -``` - -The dataset returned by `pytorchvideo.data.Ucf101()` doesn't implement the `__len__` method. As such, we must define `max_steps` when instantiating `TrainingArguments`. - -Next, you need to define a function to compute the metrics from the predictions, which will use the `metric` you'll load now. The only preprocessing you have to do is to take the argmax of our predicted logits: - -```py -import evaluate - -metric = evaluate.load("accuracy") - - -def compute_metrics(eval_pred): - predictions = np.argmax(eval_pred.predictions, axis=1) - return metric.compute(predictions=predictions, references=eval_pred.label_ids) -``` - -**A note on evaluation**: - -In the [VideoMAE paper](https://arxiv.org/abs/2203.12602), the authors use the following evaluation strategy. They evaluate the model on several clips from test videos and apply different crops to those clips and report the aggregate score. However, in the interest of simplicity and brevity, we don't consider that in this tutorial. - -Also, define a `collate_fn`, which will be used to batch examples together. Each batch consists of 2 keys, namely `pixel_values` and `labels`. - -```py ->>> def collate_fn(examples): -... # permute to (num_frames, num_channels, height, width) -... pixel_values = torch.stack( -... [example["video"].permute(1, 0, 2, 3) for example in examples] -... ) -... labels = torch.tensor([example["label"] for example in examples]) -... return {"pixel_values": pixel_values, "labels": labels} -``` - -Then you just pass all of this along with the datasets to `Trainer`: - -```py ->>> trainer = Trainer( -... model, -... args, -... train_dataset=train_dataset, -... eval_dataset=val_dataset, -... tokenizer=image_processor, -... 
compute_metrics=compute_metrics, -... data_collator=collate_fn, -... ) -``` - -You might wonder why you passed along the `image_processor` as a tokenizer when you preprocessed the data already. This is only to make sure the image processor configuration file (stored as JSON) will also be uploaded to the repo on the Hub. - -Now fine-tune our model by calling the `train` method: - -```py ->>> train_results = trainer.train() -``` - -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: - -```py ->>> trainer.push_to_hub() -``` - -## Inference - -Great, now that you have fine-tuned a model, you can use it for inference! - -Load a video for inference: - -```py ->>> sample_test_video = next(iter(test_dataset)) -``` - -
- Teams playing basketball -
- -The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.VideoClassificationPipeline). Instantiate a `pipeline` for video classification with your model, and pass your video to it: - -```py ->>> from transformers import pipeline - ->>> video_cls = pipeline(model="my_awesome_video_cls_model") ->>> video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi") -[{'score': 0.9272987842559814, 'label': 'BasketballDunk'}, - {'score': 0.017777055501937866, 'label': 'BabyCrawling'}, - {'score': 0.01663011871278286, 'label': 'BalanceBeam'}, - {'score': 0.009560945443809032, 'label': 'BandMarching'}, - {'score': 0.0068979403004050255, 'label': 'BaseballPitch'}] -``` - -You can also manually replicate the results of the `pipeline` if you'd like. - - -```py ->>> def run_inference(model, video): -... # (num_frames, num_channels, height, width) -... perumuted_sample_test_video = video.permute(1, 0, 2, 3) -... inputs = { -... "pixel_values": perumuted_sample_test_video.unsqueeze(0), -... "labels": torch.tensor( -... [sample_test_video["label"]] -... ), # this can be skipped if you don't have labels available. -... } - -... device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -... inputs = {k: v.to(device) for k, v in inputs.items()} -... model = model.to(device) - -... # forward pass -... with torch.no_grad(): -... outputs = model(**inputs) -... logits = outputs.logits - -... return logits -``` - -Now, pass your input to the model and return the `logits`: - -``` ->>> logits = run_inference(trained_model, sample_test_video["video"]) -``` - -Decoding the `logits`, we get: - -```py ->>> predicted_class_idx = logits.argmax(-1).item() ->>> print("Predicted class:", model.config.id2label[predicted_class_idx]) -# Predicted class: BasketballDunk -``` \ No newline at end of file diff --git a/docs/source/en/tasks/visual_question_answering.md b/docs/source/en/tasks/visual_question_answering.md new file mode 100644 index 000000000000..c45f12dbc1e7 --- /dev/null +++ b/docs/source/en/tasks/visual_question_answering.md @@ -0,0 +1,401 @@ + + +# Visual Question Answering + +[[open-in-colab]] + +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. +The input to models supporting this task is typically a combination of an image and a question, and the output is an +answer expressed in natural language. + +Some noteworthy use case examples for VQA include: +* Accessibility applications for visually impaired individuals. +* Education: posing questions about visual materials presented in lectures or textbooks. VQA can also be utilized in interactive museum exhibits or historical sites. +* Customer service and e-commerce: VQA can enhance user experience by letting users ask questions about products. +* Image retrieval: VQA models can be used to retrieve images with specific characteristics. For example, the user can ask "Is there a dog?" to find all images with dogs from a set of images. + +In this guide you'll learn how to: + +- Fine-tune a classification VQA model, specifically [ViLT](../model_doc/vilt), on the [`Graphcore/vqa` dataset](https://huggingface.co/datasets/Graphcore/vqa). +- Use your fine-tuned ViLT for inference. +- Run zero-shot VQA inference with a generative model, like BLIP-2. 
+ +## Fine-tuning ViLT + +ViLT model incorporates text embeddings into a Vision Transformer (ViT), allowing it to have a minimal design for +Vision-and-Language Pre-training (VLP). This model can be used for several downstream tasks. For the VQA task, a classifier +head is placed on top (a linear layer on top of the final hidden state of the `[CLS]` token) and randomly initialized. +Visual Question Answering is thus treated as a **classification problem**. + +More recent models, such as BLIP, BLIP-2, and InstructBLIP, treat VQA as a generative task. Later in this guide we +illustrate how to use them for zero-shot VQA inference. + +Before you begin, make sure you have all the necessary libraries installed. + +```bash +pip install -q transformers datasets +``` + +We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the 🤗 Hub. +When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +Let's define the model checkpoint as a global variable. + +```py +>>> model_checkpoint = "dandelin/vilt-b32-mlm" +``` + +## Load the data + +For illustration purposes, in this guide we use a very small sample of the annotated visual question answering `Graphcore/vqa` dataset. +You can find the full dataset on [🤗 Hub](https://huggingface.co/datasets/Graphcore/vqa). + +As an alternative to the [`Graphcore/vqa` dataset](https://huggingface.co/datasets/Graphcore/vqa), you can download the +same data manually from the official [VQA dataset page](https://visualqa.org/download.html). If you prefer to follow the +tutorial with your custom data, check out how to [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset#loading-script) +guide in the 🤗 Datasets documentation. + +Let's load the first 200 examples from the validation split and explore the dataset's features: + +```python +>>> from datasets import load_dataset + +>>> dataset = load_dataset("Graphcore/vqa", split="validation[:200]") +>>> dataset +Dataset({ + features: ['question', 'question_type', 'question_id', 'image_id', 'answer_type', 'label'], + num_rows: 200 +}) +``` + +Let's take a look at an example to understand the dataset's features: + +```py +>>> dataset[0] +{'question': 'Where is he looking?', + 'question_type': 'none of the above', + 'question_id': 262148000, + 'image_id': '/root/.cache/huggingface/datasets/downloads/extracted/ca733e0e000fb2d7a09fbcc94dbfe7b5a30750681d0e965f8e0a23b1c2f98c75/val2014/COCO_val2014_000000262148.jpg', + 'answer_type': 'other', + 'label': {'ids': ['at table', 'down', 'skateboard', 'table'], + 'weights': [0.30000001192092896, + 1.0, + 0.30000001192092896, + 0.30000001192092896]}} +``` + +The features relevant to the task include: +* `question`: the question to be answered from the image +* `image_id`: the path to the image the question refers to +* `label`: the annotations + +We can remove the rest of the features as they won't be necessary: + +```py +>>> dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type']) +``` + +As you can see, the `label` feature contains several answers to the same question (called `ids` here) collected by different human annotators. +This is because the answer to a question can be subjective. In this case, the question is "where is he looking?". Some people +annotated this with "down", others with "at table", another one with "skateboard", etc. 
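+
+If you're curious which answers dominate this small sample, a couple of lines of standard Python give a quick, optional overview (nothing later in the guide depends on this step):
+
+```py
+>>> from collections import Counter
+
+>>> answer_counts = Counter(answer for example in dataset["label"] for answer in example["ids"])
+>>> answer_counts.most_common(5)  # the most frequent annotator answers in this 200-example sample
+```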
+
+Take a look at the image and consider which answer you would give:
+
+```python
+>>> from PIL import Image
+
+>>> image = Image.open(dataset[0]['image_id'])
+>>> image
+```
+
+<div class="flex justify-center">
+ VQA Image Example +
+ +Due to the questions' and answers' ambiguity, datasets like this are treated as a multi-label classification problem (as +multiple answers are possibly valid). Moreover, rather than just creating a one-hot encoded vector, one creates a +soft encoding, based on the number of times a certain answer appeared in the annotations. + +For instance, in the example above, because the answer "down" is selected way more often than other answers, it has a +score (called `weight` in the dataset) of 1.0, and the rest of the answers have scores < 1.0. + +To later instantiate the model with an appropriate classification head, let's create two dictionaries: one that maps +the label name to an integer and vice versa: + +```py +>>> import itertools + +>>> labels = [item['ids'] for item in dataset['label']] +>>> flattened_labels = list(itertools.chain(*labels)) +>>> unique_labels = list(set(flattened_labels)) + +>>> label2id = {label: idx for idx, label in enumerate(unique_labels)} +>>> id2label = {idx: label for label, idx in label2id.items()} +``` + +Now that we have the mappings, we can replace the string answers with their ids, and flatten the dataset for a more convenient further preprocessing. + +```python +>>> def replace_ids(inputs): +... inputs["label"]["ids"] = [label2id[x] for x in inputs["label"]["ids"]] +... return inputs + + +>>> dataset = dataset.map(replace_ids) +>>> flat_dataset = dataset.flatten() +>>> flat_dataset.features +{'question': Value(dtype='string', id=None), + 'image_id': Value(dtype='string', id=None), + 'label.ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), + 'label.weights': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)} +``` + +## Preprocessing data + +The next step is to load a ViLT processor to prepare the image and text data for the model. +[`ViltProcessor`] wraps a BERT tokenizer and ViLT image processor into a convenient single processor: + +```py +>>> from transformers import ViltProcessor + +>>> processor = ViltProcessor.from_pretrained(model_checkpoint) +``` + +To preprocess the data we need to encode the images and questions using the [`ViltProcessor`]. The processor will use +the [`BertTokenizerFast`] to tokenize the text and create `input_ids`, `attention_mask` and `token_type_ids` for the text data. +As for images, the processor will leverage [`ViltImageProcessor`] to resize and normalize the image, and create `pixel_values` and `pixel_mask`. + +All these preprocessing steps are done under the hood, we only need to call the `processor`. However, we still need to +prepare the target labels. In this representation, each element corresponds to a possible answer (label). For correct answers, the element holds +their respective score (weight), while the remaining elements are set to zero. + +The following function applies the `processor` to the images and questions and formats the labels as described above: + +```py +>>> import torch + +>>> def preprocess_data(examples): +... image_paths = examples['image_id'] +... images = [Image.open(image_path) for image_path in image_paths] +... texts = examples['question'] + +... encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt") + +... for k, v in encoding.items(): +... encoding[k] = v.squeeze() + +... targets = [] + +... for labels, scores in zip(examples['label.ids'], examples['label.weights']): +... target = torch.zeros(len(id2label)) + +... for label, score in zip(labels, scores): +... target[label] = score + +... 
targets.append(target) + +... encoding["labels"] = targets + +... return encoding +``` + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.map`] function. You can speed up `map` by +setting `batched=True` to process multiple elements of the dataset at once. At this point, feel free to remove the columns you don't need. + +```py +>>> processed_dataset = flat_dataset.map(preprocess_data, batched=True, remove_columns=['question','question_type', 'question_id', 'image_id', 'answer_type', 'label.ids', 'label.weights']) +>>> processed_dataset +Dataset({ + features: ['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'], + num_rows: 200 +}) +``` + +As a final step, create a batch of examples using [`DefaultDataCollator`]: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + +## Train the model + +You’re ready to start training your model now! Load ViLT with [`ViltForQuestionAnswering`]. Specify the number of labels +along with the label mappings: + +```py +>>> from transformers import ViltForQuestionAnswering + +>>> model = ViltForQuestionAnswering.from_pretrained(model_checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id) +``` + +At this point, only three steps remain: + +1. Define your training hyperparameters in [`TrainingArguments`]: + +```py +>>> from transformers import TrainingArguments + +>>> repo_id = "MariaK/vilt_finetuned_200" + +>>> training_args = TrainingArguments( +... output_dir=repo_id, +... per_device_train_batch_size=4, +... num_train_epochs=20, +... save_steps=200, +... logging_steps=50, +... learning_rate=5e-5, +... save_total_limit=2, +... remove_unused_columns=False, +... push_to_hub=True, +... ) +``` + +2. Pass the training arguments to [`Trainer`] along with the model, dataset, processor, and data collator. + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=processed_dataset, +... tokenizer=processor, +... ) +``` + +3. Call [`~Trainer.train`] to finetune your model. + +```py +>>> trainer.train() +``` + +Once training is completed, share your model to the Hub with the [`~Trainer.push_to_hub`] method to share your final model on the 🤗 Hub: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +Now that you have fine-tuned a ViLT model, and uploaded it to the 🤗 Hub, you can use it for inference. The simplest +way to try out your fine-tuned model for inference is to use it in a [`Pipeline`]. + +```py +>>> from transformers import pipeline + +>>> pipe = pipeline("visual-question-answering", model="MariaK/vilt_finetuned_200") +``` + +The model in this guide has only been trained on 200 examples, so don't expect a lot from it. Let's see if it at least +learned something from the data and take the first example from the dataset to illustrate inference: + +```py +>>> example = dataset[0] +>>> image = Image.open(example['image_id']) +>>> question = example['question'] +>>> print(question) +>>> pipe(image, question, top_k=1) +"Where is he looking?" +[{'score': 0.5498199462890625, 'answer': 'down'}] +``` + +Even though not very confident, the model indeed has learned something. With more examples and longer training, you'll get far better results! + +You can also manually replicate the results of the pipeline if you'd like: +1. 
Take an image and a question, prepare them for the model using the processor from your model. +2. Forward the result or preprocessing through the model. +3. From the logits, get the most likely answer's id, and find the actual answer in the `id2label`. + +```py +>>> processor = ViltProcessor.from_pretrained("MariaK/vilt_finetuned_200") + +>>> image = Image.open(example['image_id']) +>>> question = example['question'] + +>>> # prepare inputs +>>> inputs = processor(image, question, return_tensors="pt") + +>>> model = ViltForQuestionAnswering.from_pretrained("MariaK/vilt_finetuned_200") + +>>> # forward pass +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> logits = outputs.logits +>>> idx = logits.argmax(-1).item() +>>> print("Predicted answer:", model.config.id2label[idx]) +Predicted answer: down +``` + +## Zero-shot VQA + +The previous model treated VQA as a classification task. Some recent models, such as BLIP, BLIP-2, and InstructBLIP approach +VQA as a generative task. Let's take [BLIP-2](../model_doc/blip-2) as an example. It introduced a new visual-language pre-training +paradigm in which any combination of pre-trained vision encoder and LLM can be used (learn more in the [BLIP-2 blog post](https://huggingface.co/blog/blip-2)). +This enables achieving state-of-the-art results on multiple visual-language tasks including visual question answering. + +Let's illustrate how you can use this model for VQA. First, let's load the model. Here we'll explicitly send the model to a +GPU, if available, which we didn't need to do earlier when training, as [`Trainer`] handles this automatically: + +```py +>>> from transformers import AutoProcessor, Blip2ForConditionalGeneration +>>> import torch + +>>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") +>>> model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> model.to(device) +``` + +The model takes image and text as input, so let's use the exact same image/question pair from the first example in the VQA dataset: + +```py +>>> example = dataset[0] +>>> image = Image.open(example['image_id']) +>>> question = example['question'] +``` + +To use BLIP-2 for visual question answering task, the textual prompt has to follow a specific format: `Question: {} Answer:`. + +```py +>>> prompt = f"Question: {question} Answer:" +``` + +Now we need to preprocess the image/prompt with the model's processor, pass the processed input through the model, and decode the output: + +```py +>>> inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16) + +>>> generated_ids = model.generate(**inputs, max_new_tokens=10) +>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() +>>> print(generated_text) +"He is looking at the crowd" +``` + +As you can see, the model recognized the crowd, and the direction of the face (looking down), however, it seems to miss +the fact the crowd is behind the skater. Still, in cases where acquiring human-annotated datasets is not feasible, this +approach can quickly produce useful results. 
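+
+Since the prompt template is the only task-specific piece, you can reuse the same model and processor to ask further questions about the image. Here is a small sketch along those lines; the extra questions are only illustrative:
+
+```py
+>>> questions = ["What is he doing?", "How many people are in the picture?"]
+>>> for q in questions:
+...     prompt = f"Question: {q} Answer:"
+...     inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
+...     generated_ids = model.generate(**inputs, max_new_tokens=10)
+...     print(q, "->", processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())
+```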
+ diff --git a/docs/source/en/tasks/zero_shot_image_classification.md b/docs/source/en/tasks/zero_shot_image_classification.md new file mode 100644 index 000000000000..45775d40cad3 --- /dev/null +++ b/docs/source/en/tasks/zero_shot_image_classification.md @@ -0,0 +1,147 @@ + + +# Zero-shot image classification + +[[open-in-colab]] + +Zero-shot image classification is a task that involves classifying images into different categories using a model that was +not explicitly trained on data containing labeled examples from those specific categories. + +Traditionally, image classification requires training a model on a specific set of labeled images, and this model learns to +"map" certain image features to labels. When there's a need to use such model for a classification task that introduces a +new set of labels, fine-tuning is required to "recalibrate" the model. + +In contrast, zero-shot or open vocabulary image classification models are typically multi-modal models that have been trained on a large +dataset of images and associated descriptions. These models learn aligned vision-language representations that can be used for many downstream tasks including zero-shot image classification. + +This is a more flexible approach to image classification that allows models to generalize to new and unseen categories +without the need for additional training data and enables users to query images with free-form text descriptions of their target objects . + +In this guide you'll learn how to: + +* create a zero-shot image classification pipeline +* run zero-shot image classification inference by hand + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q transformers +``` + +## Zero-shot image classification pipeline + +The simplest way to try out inference with a model supporting zero-shot image classification is to use the corresponding [`pipeline`]. +Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads): + +```python +>>> from transformers import pipeline + +>>> checkpoint = "openai/clip-vit-large-patch14" +>>> detector = pipeline(model=checkpoint, task="zero-shot-image-classification") +``` + +Next, choose an image you'd like to classify. + +```py +>>> from PIL import Image +>>> import requests + +>>> url = "https://unsplash.com/photos/g8oS8-82DxI/download?ixid=MnwxMjA3fDB8MXx0b3BpY3x8SnBnNktpZGwtSGt8fHx8fDJ8fDE2NzgxMDYwODc&force=true&w=640" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image +``` + +
+ Photo of an owl +
+ +Pass the image and the candidate object labels to the pipeline. Here we pass the image directly; other suitable options +include a local path to an image or an image url. +The candidate labels can be simple words like in this example, or more descriptive. + +```py +>>> predictions = detector(image, candidate_labels=["fox", "bear", "seagull", "owl"]) +>>> predictions +[{'score': 0.9996670484542847, 'label': 'owl'}, + {'score': 0.000199399160919711, 'label': 'seagull'}, + {'score': 7.392891711788252e-05, 'label': 'fox'}, + {'score': 5.96074532950297e-05, 'label': 'bear'}] +``` + +## Zero-shot image classification by hand + +Now that you've seen how to use the zero-shot image classification pipeline, let's take a look how you can run zero-shot +image classification manually. + +Start by loading the model and associated processor from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads). +Here we'll use the same checkpoint as before: + +```py +>>> from transformers import AutoProcessor, AutoModelForZeroShotImageClassification + +>>> model = AutoModelForZeroShotImageClassification.from_pretrained(checkpoint) +>>> processor = AutoProcessor.from_pretrained(checkpoint) +``` + +Let's take a different image to switch things up. + +```py +>>> from PIL import Image +>>> import requests + +>>> url = "https://unsplash.com/photos/xBRQfR2bqNI/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjc4Mzg4ODEx&force=true&w=640" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image +``` + +
+ Photo of a car +
+ +Use the processor to prepare the inputs for the model. The processor combines an image processor that prepares the +image for the model by resizing and normalizing it, and a tokenizer that takes care of the text inputs. + +```py +>>> candidate_labels = ["tree", "car", "bike", "cat"] +>>> inputs = processor(images=image, text=candidate_labels, return_tensors="pt", padding=True) +``` + +Pass the inputs through the model, and post-process the results: + +```py +>>> import torch + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> logits = outputs.logits_per_image[0] +>>> probs = logits.softmax(dim=-1).numpy() +>>> scores = probs.tolist() + +>>> result = [ +... {"score": score, "label": candidate_label} +... for score, candidate_label in sorted(zip(probs, candidate_labels), key=lambda x: -x[0]) +... ] + +>>> result +[{'score': 0.998572, 'label': 'car'}, + {'score': 0.0010570387, 'label': 'bike'}, + {'score': 0.0003393686, 'label': 'tree'}, + {'score': 3.1572064e-05, 'label': 'cat'}] +``` \ No newline at end of file diff --git a/docs/source/en/tasks/zero_shot_object_detection.md b/docs/source/en/tasks/zero_shot_object_detection.md new file mode 100644 index 000000000000..3dfefb3c8b5e --- /dev/null +++ b/docs/source/en/tasks/zero_shot_object_detection.md @@ -0,0 +1,309 @@ + + +# Zero-shot object detection + +[[open-in-colab]] + +Traditionally, models used for [object detection](object_detection) require labeled image datasets for training, +and are limited to detecting the set of classes from the training data. + +Zero-shot object detection is supported by the [OWL-ViT](../model_doc/owlvit) model which uses a different approach. OWL-ViT +is an open-vocabulary object detector. It means that it can detect objects in images based on free-text queries without +the need to fine-tune the model on labeled datasets. + +OWL-ViT leverages multi-modal representations to perform open-vocabulary detection. It combines [CLIP](../model_doc/clip) with +lightweight object classification and localization heads. Open-vocabulary detection is achieved by embedding free-text queries with the text encoder of CLIP and using them as input to the object classification and localization heads. +associate images and their corresponding textual descriptions, and ViT processes image patches as inputs. The authors +of OWL-ViT first trained CLIP from scratch and then fine-tuned OWL-ViT end to end on standard object detection datasets using +a bipartite matching loss. + +With this approach, the model can detect objects based on textual descriptions without prior training on labeled datasets. + +In this guide, you will learn how to use OWL-ViT: +- to detect objects based on text prompts +- for batch object detection +- for image-guided object detection + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install -q transformers +``` + +## Zero-shot object detection pipeline + +The simplest way to try out inference with OWL-ViT is to use it in a [`pipeline`]. Instantiate a pipeline +for zero-shot object detection from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?other=owlvit): + +```python +>>> from transformers import pipeline + +>>> checkpoint = "google/owlvit-base-patch32" +>>> detector = pipeline(model=checkpoint, task="zero-shot-object-detection") +``` + +Next, choose an image you'd like to detect objects in. 
Here we'll use the image of astronaut Eileen Collins that is +a part of the [NASA](https://www.nasa.gov/multimedia/imagegallery/index.html) Great Images dataset. + +```py +>>> import skimage +>>> import numpy as np +>>> from PIL import Image + +>>> image = skimage.data.astronaut() +>>> image = Image.fromarray(np.uint8(image)).convert("RGB") + +>>> image +``` + +
+ Astronaut Eileen Collins +
+ +Pass the image and the candidate object labels to look for to the pipeline. +Here we pass the image directly; other suitable options include a local path to an image or an image url. We also pass text descriptions for all items we want to query the image for. + +```py +>>> predictions = detector( +... image, +... candidate_labels=["human face", "rocket", "nasa badge", "star-spangled banner"], +... ) +>>> predictions +[{'score': 0.3571370542049408, + 'label': 'human face', + 'box': {'xmin': 180, 'ymin': 71, 'xmax': 271, 'ymax': 178}}, + {'score': 0.28099656105041504, + 'label': 'nasa badge', + 'box': {'xmin': 129, 'ymin': 348, 'xmax': 206, 'ymax': 427}}, + {'score': 0.2110239565372467, + 'label': 'rocket', + 'box': {'xmin': 350, 'ymin': -1, 'xmax': 468, 'ymax': 288}}, + {'score': 0.13790413737297058, + 'label': 'star-spangled banner', + 'box': {'xmin': 1, 'ymin': 1, 'xmax': 105, 'ymax': 509}}, + {'score': 0.11950037628412247, + 'label': 'nasa badge', + 'box': {'xmin': 277, 'ymin': 338, 'xmax': 327, 'ymax': 380}}, + {'score': 0.10649408400058746, + 'label': 'rocket', + 'box': {'xmin': 358, 'ymin': 64, 'xmax': 424, 'ymax': 280}}] +``` + +Let's visualize the predictions: + +```py +>>> from PIL import ImageDraw + +>>> draw = ImageDraw.Draw(image) + +>>> for prediction in predictions: +... box = prediction["box"] +... label = prediction["label"] +... score = prediction["score"] + +... xmin, ymin, xmax, ymax = box.values() +... draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) +... draw.text((xmin, ymin), f"{label}: {round(score,2)}", fill="white") + +>>> image +``` + +
+ Visualized predictions on NASA image +
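+
+If you only want to keep the strongest detections before drawing them, you can filter the pipeline output on the score. The 0.2 cutoff below is an arbitrary value chosen for illustration:
+
+```py
+>>> confident_predictions = [p for p in predictions if p["score"] > 0.2]
+>>> [(p["label"], round(p["score"], 2)) for p in confident_predictions]
+[('human face', 0.36), ('nasa badge', 0.28), ('rocket', 0.21)]
+```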
+ +## Text-prompted zero-shot object detection by hand + +Now that you've seen how to use the zero-shot object detection pipeline, let's replicate the same +result manually. + +Start by loading the model and associated processor from a [checkpoint on the Hugging Face Hub](https://huggingface.co/models?other=owlvit). +Here we'll use the same checkpoint as before: + +```py +>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + +>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint) +>>> processor = AutoProcessor.from_pretrained(checkpoint) +``` + +Let's take a different image to switch things up. + +```py +>>> import requests + +>>> url = "https://unsplash.com/photos/oj0zeY2Ltk4/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MTR8fHBpY25pY3xlbnwwfHx8fDE2Nzc0OTE1NDk&force=true&w=640" +>>> im = Image.open(requests.get(url, stream=True).raw) +>>> im +``` + +
+ Beach photo +
+ +Use the processor to prepare the inputs for the model. The processor combines an image processor that prepares the +image for the model by resizing and normalizing it, and a [`CLIPTokenizer`] that takes care of the text inputs. + +```py +>>> text_queries = ["hat", "book", "sunglasses", "camera"] +>>> inputs = processor(text=text_queries, images=im, return_tensors="pt") +``` + +Pass the inputs through the model, post-process, and visualize the results. Since the image processor resized images before +feeding them to the model, you need to use the [`~OwlViTImageProcessor.post_process_object_detection`] method to make sure the predicted bounding +boxes have the correct coordinates relative to the original image: + +```py +>>> import torch + +>>> with torch.no_grad(): +... outputs = model(**inputs) +... target_sizes = torch.tensor([im.size[::-1]]) +... results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)[0] + +>>> draw = ImageDraw.Draw(im) + +>>> scores = results["scores"].tolist() +>>> labels = results["labels"].tolist() +>>> boxes = results["boxes"].tolist() + +>>> for box, score, label in zip(boxes, scores, labels): +... xmin, ymin, xmax, ymax = box +... draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) +... draw.text((xmin, ymin), f"{text_queries[label]}: {round(score,2)}", fill="white") + +>>> im +``` + +
+ Beach photo with detected objects +
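+
+If you'd rather work with the detections programmatically than draw them, you can reshape the post-processed results into the same kind of list the pipeline returns. This is just a convenience sketch reusing the variables from the code above:
+
+```py
+>>> detections = [
+...     {"score": round(score, 3), "label": text_queries[label], "box": [round(coord, 1) for coord in box]}
+...     for box, score, label in zip(boxes, scores, labels)
+... ]
+>>> detections[:2]
+```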
+ +## Batch processing + +You can pass multiple sets of images and text queries to search for different (or same) objects in several images. +Let's use both an astronaut image and the beach image together. +For batch processing, you should pass text queries as a nested list to the processor and images as lists of PIL images, +PyTorch tensors, or NumPy arrays. + +```py +>>> images = [image, im] +>>> text_queries = [ +... ["human face", "rocket", "nasa badge", "star-spangled banner"], +... ["hat", "book", "sunglasses", "camera"], +... ] +>>> inputs = processor(text=text_queries, images=images, return_tensors="pt") +``` + +Previously for post-processing you passed the single image's size as a tensor, but you can also pass a tuple, or, in case +of several images, a list of tuples. Let's create predictions for the two examples, and visualize the second one (`image_idx = 1`). + +```py +>>> with torch.no_grad(): +... outputs = model(**inputs) +... target_sizes = [x.size[::-1] for x in images] +... results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes) + +>>> image_idx = 1 +>>> draw = ImageDraw.Draw(images[image_idx]) + +>>> scores = results[image_idx]["scores"].tolist() +>>> labels = results[image_idx]["labels"].tolist() +>>> boxes = results[image_idx]["boxes"].tolist() + +>>> for box, score, label in zip(boxes, scores, labels): +... xmin, ymin, xmax, ymax = box +... draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) +... draw.text((xmin, ymin), f"{text_queries[image_idx][label]}: {round(score,2)}", fill="white") + +>>> images[image_idx] +``` + +
+ Beach photo with detected objects +
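+
+Since `results` now holds one entry per image, you can loop over it to see how many detections each image produced at this threshold, for example:
+
+```py
+>>> for image_index, image_results in enumerate(results):
+...     print(f"Image {image_index}: {len(image_results['scores'])} detections")
+```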
+ +## Image-guided object detection + +In addition to zero-shot object detection with text queries, OWL-ViT offers image-guided object detection. This means +you can use an image query to find similar objects in the target image. +Unlike text queries, only a single example image is allowed. + +Let's take an image with two cats on a couch as a target image, and an image of a single cat +as a query: + +```py +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image_target = Image.open(requests.get(url, stream=True).raw) + +>>> query_url = "http://images.cocodataset.org/val2017/000000524280.jpg" +>>> query_image = Image.open(requests.get(query_url, stream=True).raw) +``` + +Let's take a quick look at the images: + +```py +>>> import matplotlib.pyplot as plt + +>>> fig, ax = plt.subplots(1, 2) +>>> ax[0].imshow(image_target) +>>> ax[1].imshow(query_image) +``` + +
+ Cats +
+
+In the preprocessing step, instead of text queries, you now need to use `query_images`:
+
+```py
+>>> inputs = processor(images=image_target, query_images=query_image, return_tensors="pt")
+```
+
+For predictions, instead of passing the inputs to the model, pass them to [`~OwlViTForObjectDetection.image_guided_detection`]. Draw the predictions
+as before, except now there are no labels.
+
+```py
+>>> with torch.no_grad():
+...     outputs = model.image_guided_detection(**inputs)
+...     target_sizes = torch.tensor([image_target.size[::-1]])
+...     results = processor.post_process_image_guided_detection(outputs=outputs, target_sizes=target_sizes)[0]
+
+>>> draw = ImageDraw.Draw(image_target)
+
+>>> scores = results["scores"].tolist()
+>>> boxes = results["boxes"].tolist()
+
+>>> for box, score in zip(boxes, scores):
+...     xmin, ymin, xmax, ymax = box
+...     draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=4)
+
+>>> image_target
+```
+
+<div class="flex justify-center">
+ Cats with bounding boxes +
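+
+Image-guided detection returns scores and boxes but no labels, so printing the scores is a quick way to judge how strong each match is before settling on a threshold:
+
+```py
+>>> for score, box in zip(results["scores"].tolist(), results["boxes"].tolist()):
+...     print(f"score: {score:.3f}, box: {[round(coord, 1) for coord in box]}")
+```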
+ +If you'd like to interactively try out inference with OWL-ViT, check out this demo: + + diff --git a/docs/source/en/tasks_explained.md b/docs/source/en/tasks_explained.md new file mode 100644 index 000000000000..d453e38e86b9 --- /dev/null +++ b/docs/source/en/tasks_explained.md @@ -0,0 +1,295 @@ + + +# How 🤗 Transformers solve tasks + +In [What 🤗 Transformers can do](task_summary), you learned about natural language processing (NLP), speech and audio, computer vision tasks, and some important applications of them. This page will look closely at how models solve these tasks and explain what's happening under the hood. There are many ways to solve a given task, some models may implement certain techniques or even approach the task from a new angle, but for Transformer models, the general idea is the same. Owing to its flexible architecture, most models are a variant of an encoder, decoder, or encoder-decoder structure. In addition to Transformer models, our library also has several convolutional neural networks (CNNs), which are still used today for computer vision tasks. We'll also explain how a modern CNN works. + +To explain how tasks are solved, we'll walk through what goes on inside the model to output useful predictions. + +- [Wav2Vec2](model_doc/wav2vec2) for audio classification and automatic speech recognition (ASR) +- [Vision Transformer (ViT)](model_doc/vit) and [ConvNeXT](model_doc/convnext) for image classification +- [DETR](model_doc/detr) for object detection +- [Mask2Former](model_doc/mask2former) for image segmentation +- [GLPN](model_doc/glpn) for depth estimation +- [BERT](model_doc/bert) for NLP tasks like text classification, token classification and question answering that use an encoder +- [GPT2](model_doc/gpt2) for NLP tasks like text generation that use a decoder +- [BART](model_doc/bart) for NLP tasks like summarization and translation that use an encoder-decoder + + + +Before you go further, it is good to have some basic knowledge of the original Transformer architecture. Knowing how encoders, decoders, and attention work will aid you in understanding how different Transformer models work. If you're just getting started or need a refresher, check out our [course](https://huggingface.co/course/chapter1/4?fw=pt) for more information! + + + +## Speech and audio + +[Wav2Vec2](model_doc/wav2vec2) is a self-supervised model pretrained on unlabeled speech data and finetuned on labeled data for audio classification and automatic speech recognition. + +
+ +
+ +This model has four main components: + +1. A *feature encoder* takes the raw audio waveform, normalizes it to zero mean and unit variance, and converts it into a sequence of feature vectors that are each 20ms long. + +2. Waveforms are continuous by nature, so they can't be divided into separate units like a sequence of text can be split into words. That's why the feature vectors are passed to a *quantization module*, which aims to learn discrete speech units. The speech unit is chosen from a collection of codewords, known as a *codebook* (you can think of this as the vocabulary). From the codebook, the vector or speech unit, that best represents the continuous audio input is chosen and forwarded through the model. + +3. About half of the feature vectors are randomly masked, and the masked feature vector is fed to a *context network*, which is a Transformer encoder that also adds relative positional embeddings. + +4. The pretraining objective of the context network is a *contrastive task*. The model has to predict the true quantized speech representation of the masked prediction from a set of false ones, encouraging the model to find the most similar context vector and quantized speech unit (the target label). + +Now that wav2vec2 is pretrained, you can finetune it on your data for audio classification or automatic speech recognition! + +### Audio classification + +To use the pretrained model for audio classification, add a sequence classification head on top of the base Wav2Vec2 model. The classification head is a linear layer that accepts the encoder's hidden states. The hidden states represent the learned features from each audio frame which can have varying lengths. To create one vector of fixed-length, the hidden states are pooled first and then transformed into logits over the class labels. The cross-entropy loss is calculated between the logits and target to find the most likely class. + +Ready to try your hand at audio classification? Check out our complete [audio classification guide](tasks/audio_classification) to learn how to finetune Wav2Vec2 and use it for inference! + +### Automatic speech recognition + +To use the pretrained model for automatic speech recognition, add a language modeling head on top of the base Wav2Vec2 model for [connectionist temporal classification (CTC)](glossary#connectionist-temporal-classification-ctc). The language modeling head is a linear layer that accepts the encoder's hidden states and transforms them into logits. Each logit represents a token class (the number of tokens comes from the task vocabulary). The CTC loss is calculated between the logits and targets to find the most likely sequence of tokens, which are then decoded into a transcription. + +Ready to try your hand at automatic speech recognition? Check out our complete [automatic speech recognition guide](tasks/asr) to learn how to finetune Wav2Vec2 and use it for inference! + +## Computer vision + +There are two ways to approach computer vision tasks: + +1. Split an image into a sequence of patches and process them in parallel with a Transformer. +2. Use a modern CNN, like [ConvNeXT](model_doc/convnext), which relies on convolutional layers but adopts modern network designs. + + + +A third approach mixes Transformers with convolutions (for example, [Convolutional Vision Transformer](model_doc/cvt) or [LeViT](model_doc/levit)). We won't discuss those because they just combine the two approaches we examine here. 
+ + + +ViT and ConvNeXT are commonly used for image classification, but for other vision tasks like object detection, segmentation, and depth estimation, we'll look at DETR, Mask2Former and GLPN, respectively; these models are better suited for those tasks. + +### Image classification + +ViT and ConvNeXT can both be used for image classification; the main difference is that ViT uses an attention mechanism while ConvNeXT uses convolutions. + +#### Transformer + +[ViT](model_doc/vit) replaces convolutions entirely with a pure Transformer architecture. If you're familiar with the original Transformer, then you're already most of the way toward understanding ViT. + +
+ +
+ +The main change ViT introduced was in how images are fed to a Transformer: + +1. An image is split into square non-overlapping patches, each of which gets turned into a vector or *patch embedding*. The patch embeddings are generated from a convolutional 2D layer which creates the proper input dimensions (which for a base Transformer is 768 values for each patch embedding). If you had a 224x224 pixel image, you could split it into 196 16x16 image patches. Just like how text is tokenized into words, an image is "tokenized" into a sequence of patches. + +2. A *learnable embedding* - a special `[CLS]` token - is added to the beginning of the patch embeddings just like BERT. The final hidden state of the `[CLS]` token is used as the input to the attached classification head; other outputs are ignored. This token helps the model learn how to encode a representation of the image. + +3. The last thing to add to the patch and learnable embeddings are the *position embeddings* because the model doesn't know how the image patches are ordered. The position embeddings are also learnable and have the same size as the patch embeddings. Finally, all of the embeddings are passed to the Transformer encoder. + +4. The output, specifically only the output with the `[CLS]` token, is passed to a multilayer perceptron head (MLP). ViT's pretraining objective is simply classification. Like other classification heads, the MLP head converts the output into logits over the class labels and calculates the cross-entropy loss to find the most likely class. + +Ready to try your hand at image classification? Check out our complete [image classification guide](tasks/image_classification) to learn how to finetune ViT and use it for inference! + +#### CNN + + + +This section briefly explains convolutions, but it'd be helpful to have a prior understanding of how they change an image's shape and size. If you're unfamiliar with convolutions, check out the [Convolution Neural Networks chapter](https://github.com/fastai/fastbook/blob/master/13_convolutions.ipynb) from the fastai book! + + + +[ConvNeXT](model_doc/convnext) is a CNN architecture that adopts new and modern network designs to improve performance. However, convolutions are still at the core of the model. From a high-level perspective, a [convolution](glossary#convolution) is an operation where a smaller matrix (*kernel*) is multiplied by a small window of the image pixels. It computes some features from it, such as a particular texture or curvature of a line. Then it slides over to the next window of pixels; the distance the convolution travels is known as the *stride*. + +
+ +
+ +A basic convolution without padding or stride, taken from A guide to convolution arithmetic for deep learning. + +You can feed this output to another convolutional layer, and with each successive layer, the network learns more complex and abstract things like hotdogs or rockets. Between convolutional layers, it is common to add a pooling layer to reduce dimensionality and make the model more robust to variations of a feature's position. + +
+ +
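+
+If it helps to see the shapes involved, here is a minimal PyTorch sketch of a convolution followed by pooling; the layer sizes are arbitrary and not tied to any particular model:
+
+```py
+>>> import torch
+>>> from torch import nn
+
+>>> pixel_values = torch.randn(1, 3, 224, 224)  # a dummy batch with one 224x224 RGB image
+
+>>> conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)  # a 3x3 kernel slides over the image
+>>> pool = nn.MaxPool2d(kernel_size=2)  # pooling halves the spatial resolution
+
+>>> features = conv(pixel_values)
+>>> features.shape
+torch.Size([1, 16, 224, 224])
+
+>>> pool(features).shape
+torch.Size([1, 16, 112, 112])
+```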
+ +ConvNeXT modernizes a CNN in five ways: + +1. Change the number of blocks in each stage and "patchify" an image with a larger stride and corresponding kernel size. The non-overlapping sliding window makes this patchifying strategy similar to how ViT splits an image into patches. + +2. A *bottleneck* layer shrinks the number of channels and then restores it because it is faster to do a 1x1 convolution, and you can increase the depth. An inverted bottleneck does the opposite by expanding the number of channels and shrinking them, which is more memory efficient. + +3. Replace the typical 3x3 convolutional layer in the bottleneck layer with *depthwise convolution*, which applies a convolution to each input channel separately and then stacks them back together at the end. This widens the network width for improved performance. + +4. ViT has a global receptive field which means it can see more of an image at once thanks to its attention mechanism. ConvNeXT attempts to replicate this effect by increasing the kernel size to 7x7. + +5. ConvNeXT also makes several layer design changes that imitate Transformer models. There are fewer activation and normalization layers, the activation function is switched to GELU instead of ReLU, and it uses LayerNorm instead of BatchNorm. + +The output from the convolution blocks is passed to a classification head which converts the outputs into logits and calculates the cross-entropy loss to find the most likely label. + +### Object detection + +[DETR](model_doc/detr), *DEtection TRansformer*, is an end-to-end object detection model that combines a CNN with a Transformer encoder-decoder. + +
+ +
+ +1. A pretrained CNN *backbone* takes an image, represented by its pixel values, and creates a low-resolution feature map of it. A 1x1 convolution is applied to the feature map to reduce dimensionality and it creates a new feature map with a high-level image representation. Since the Transformer is a sequential model, the feature map is flattened into a sequence of feature vectors that are combined with positional embeddings. + +2. The feature vectors are passed to the encoder, which learns the image representations using its attention layers. Next, the encoder hidden states are combined with *object queries* in the decoder. Object queries are learned embeddings that focus on the different regions of an image, and they're updated as they progress through each attention layer. The decoder hidden states are passed to a feedforward network that predicts the bounding box coordinates and class label for each object query, or `no object` if there isn't one. + + DETR decodes each object query in parallel to output *N* final predictions, where *N* is the number of queries. Unlike a typical autoregressive model that predicts one element at a time, object detection is a set prediction task (`bounding box`, `class label`) that makes *N* predictions in a single pass. + +3. DETR uses a *bipartite matching loss* during training to compare a fixed number of predictions with a fixed set of ground truth labels. If there are fewer ground truth labels in the set of *N* labels, then they're padded with a `no object` class. This loss function encourages DETR to find a one-to-one assignment between the predictions and ground truth labels. If either the bounding boxes or class labels aren't correct, a loss is incurred. Likewise, if DETR predicts an object that doesn't exist, it is penalized. This encourages DETR to find other objects in an image instead of focusing on one really prominent object. + +An object detection head is added on top of DETR to find the class label and the coordinates of the bounding box. There are two components to the object detection head: a linear layer to transform the decoder hidden states into logits over the class labels, and a MLP to predict the bounding box. + +Ready to try your hand at object detection? Check out our complete [object detection guide](tasks/object_detection) to learn how to finetune DETR and use it for inference! + +### Image segmentation + +[Mask2Former](model_doc/mask2former) is a universal architecture for solving all types of image segmentation tasks. Traditional segmentation models are typically tailored towards a particular subtask of image segmentation, like instance, semantic or panoptic segmentation. Mask2Former frames each of those tasks as a *mask classification* problem. Mask classification groups pixels into *N* segments, and predicts *N* masks and their corresponding class label for a given image. We'll explain how Mask2Former works in this section, and then you can try finetuning SegFormer at the end. + +
+ +
+
+There are three main components to Mask2Former:
+
+1. A [Swin](model_doc/swin) backbone accepts an image and creates a low-resolution image feature map from 3 consecutive 3x3 convolutions.
+
+2. The feature map is passed to a *pixel decoder* which gradually upsamples the low-resolution features into high-resolution per-pixel embeddings. The pixel decoder actually generates multi-scale features (contains both low- and high-resolution features) with resolutions 1/32, 1/16, and 1/8th of the original image.
+
+3. Each of these feature maps of differing scales is fed successively to one Transformer decoder layer at a time in order to capture small objects from the high-resolution features. The key to Mask2Former is the *masked attention* mechanism in the decoder. Unlike cross-attention which can attend to the entire image, masked attention only focuses on a certain area of the image. This is faster and leads to better performance because the local features of an image are enough for the model to learn from.
+
+4. Like [DETR](tasks_explained#object-detection), Mask2Former also uses learned object queries and combines them with the image features from the pixel decoder to make a set prediction (`class label`, `mask prediction`). The decoder hidden states are passed into a linear layer and transformed into logits over the class labels. The cross-entropy loss is calculated between the logits and class label to find the most likely one.
+
+   The mask predictions are generated by combining the pixel-embeddings with the final decoder hidden states. The sigmoid cross-entropy and dice losses are calculated between the logits and the ground truth mask to find the most likely mask.
+
+Ready to try your hand at image segmentation? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
+
+### Depth estimation
+
+[GLPN](model_doc/glpn), *Global-Local Path Network*, is a Transformer for depth estimation that combines a [SegFormer](model_doc/segformer) encoder with a lightweight decoder.
+
+<div class="flex justify-center">
+ +
+ +1. Like ViT, an image is split into a sequence of patches, except these image patches are smaller. This is better for dense prediction tasks like segmentation or depth estimation. The image patches are transformed into patch embeddings (see the [image classification](#image-classification) section for more details about how patch embeddings are created), which are fed to the encoder. + +2. The encoder accepts the patch embeddings, and passes them through several encoder blocks. Each block consists of attention and Mix-FFN layers. The purpose of the latter is to provide positional information. At the end of each encoder block is a *patch merging* layer for creating hierarchical representations. The features of each group of neighboring patches are concatenated, and a linear layer is applied to the concatenated features to reduce the number of patches to a resolution of 1/4. This becomes the input to the next encoder block, where this whole process is repeated until you have image features with resolutions of 1/8, 1/16, and 1/32. + +3. A lightweight decoder takes the last feature map (1/32 scale) from the encoder and upsamples it to 1/16 scale. From here, the feature is passed into a *Selective Feature Fusion (SFF)* module, which selects and combines local and global features from an attention map for each feature and then upsamples it to 1/8th. This process is repeated until the decoded features are the same size as the original image. The output is passed through two convolution layers and then a sigmoid activation is applied to predict the depth of each pixel. + +## Natural language processing + +The Transformer was initially designed for machine translation, and since then, it has practically become the default architecture for solving all NLP tasks. Some tasks lend themselves to the Transformer's encoder structure, while others are better suited for the decoder. Still, other tasks make use of both the Transformer's encoder-decoder structure. + +### Text classification + +[BERT](model_doc/bert) is an encoder-only model and is the first model to effectively implement deep bidirectionality to learn richer representations of the text by attending to words on both sides. + +1. BERT uses [WordPiece](tokenizer_summary#wordpiece) tokenization to generate a token embedding of the text. To tell the difference between a single sentence and a pair of sentences, a special `[SEP]` token is added to differentiate them. A special `[CLS]` token is added to the beginning of every sequence of text. The final output with the `[CLS]` token is used as the input to the classification head for classification tasks. BERT also adds a segment embedding to denote whether a token belongs to the first or second sentence in a pair of sentences. + +2. BERT is pretrained with two objectives: masked language modeling and next-sentence prediction. In masked language modeling, some percentage of the input tokens are randomly masked, and the model needs to predict these. This solves the issue of bidirectionality, where the model could cheat and see all the words and "predict" the next word. The final hidden states of the predicted mask tokens are passed to a feedforward network with a softmax over the vocabulary to predict the masked word. + + The second pretraining object is next-sentence prediction. The model must predict whether sentence B follows sentence A. Half of the time sentence B is the next sentence, and the other half of the time, sentence B is a random sentence. 
The prediction, whether it is the next sentence or not, is passed to a feedforward network with a softmax over the two classes (`IsNext` and `NotNext`). + +3. The input embeddings are passed through multiple encoder layers to output some final hidden states. + +To use the pretrained model for text classification, add a sequence classification head on top of the base BERT model. The sequence classification head is a linear layer that accepts the final hidden states and performs a linear transformation to convert them into logits. The cross-entropy loss is calculated between the logits and target to find the most likely label. + +Ready to try your hand at text classification? Check out our complete [text classification guide](tasks/sequence_classification) to learn how to finetune DistilBERT and use it for inference! + +### Token classification + +To use BERT for token classification tasks like named entity recognition (NER), add a token classification head on top of the base BERT model. The token classification head is a linear layer that accepts the final hidden states and performs a linear transformation to convert them into logits. The cross-entropy loss is calculated between the logits and each token to find the most likely label. + +Ready to try your hand at token classification? Check out our complete [token classification guide](tasks/token_classification) to learn how to finetune DistilBERT and use it for inference! + +### Question answering + +To use BERT for question answering, add a span classification head on top of the base BERT model. This linear layer accepts the final hidden states and performs a linear transformation to compute the `span` start and end logits corresponding to the answer. The cross-entropy loss is calculated between the logits and the label position to find the most likely span of text corresponding to the answer. + +Ready to try your hand at question answering? Check out our complete [question answering guide](tasks/question_answering) to learn how to finetune DistilBERT and use it for inference! + + + +💡 Notice how easy it is to use BERT for different tasks once it's been pretrained. You only need to add a specific head to the pretrained model to manipulate the hidden states into your desired output! + + + +### Text generation + +[GPT-2](model_doc/gpt2) is a decoder-only model pretrained on a large amount of text. It can generate convincing (though not always true!) text given a prompt and complete other NLP tasks like question answering despite not being explicitly trained to. + +
+ +
+ +1. GPT-2 uses [byte pair encoding (BPE)](tokenizer_summary#bytepair-encoding-bpe) to tokenize words and generate a token embedding. Positional encodings are added to the token embeddings to indicate the position of each token in the sequence. The input embeddings are passed through multiple decoder blocks to output some final hidden state. Within each decoder block, GPT-2 uses a *masked self-attention* layer which means GPT-2 can't attend to future tokens. It is only allowed to attend to tokens on the left. This is different from BERT's [`mask`] token because, in masked self-attention, an attention mask is used to set the score to `0` for future tokens. + +2. The output from the decoder is passed to a language modeling head, which performs a linear transformation to convert the hidden states into logits. The label is the next token in the sequence, which are created by shifting the logits to the right by one. The cross-entropy loss is calculated between the shifted logits and the labels to output the next most likely token. + +GPT-2's pretraining objective is based entirely on [causal language modeling](glossary#causal-language-modeling), predicting the next word in a sequence. This makes GPT-2 especially good at tasks that involve generating text. + +Ready to try your hand at text generation? Check out our complete [causal language modeling guide](tasks/language_modeling#causal-language-modeling) to learn how to finetune DistilGPT-2 and use it for inference! + + + +For more information about text generation, check out the [text generation strategies](generation_strategies) guide! + + + +### Summarization + +Encoder-decoder models like [BART](model_doc/bart) and [T5](model_doc/t5) are designed for the sequence-to-sequence pattern of a summarization task. We'll explain how BART works in this section, and then you can try finetuning T5 at the end. + +
+ +
+ +1. BART's encoder architecture is very similar to BERT and accepts a token and positional embedding of the text. BART is pretrained by corrupting the input and then reconstructing it with the decoder. Unlike other encoders with specific corruption strategies, BART can apply any type of corruption. The *text infilling* corruption strategy works the best though. In text infilling, a number of text spans are replaced with a **single** [`mask`] token. This is important because the model has to predict the masked tokens, and it teaches the model to predict the number of missing tokens. The input embeddings and masked spans are passed through the encoder to output some final hidden states, but unlike BERT, BART doesn't add a final feedforward network at the end to predict a word. + +2. The encoder's output is passed to the decoder, which must predict the masked tokens and any uncorrupted tokens from the encoder's output. This gives additional context to help the decoder restore the original text. The output from the decoder is passed to a language modeling head, which performs a linear transformation to convert the hidden states into logits. The cross-entropy loss is calculated between the logits and the label, which is just the token shifted to the right. + +Ready to try your hand at summarization? Check out our complete [summarization guide](tasks/summarization) to learn how to finetune T5 and use it for inference! + + + +For more information about text generation, check out the [text generation strategies](generation_strategies) guide! + + + +### Translation + +Translation is another example of a sequence-to-sequence task, which means you can use an encoder-decoder model like [BART](model_doc/bart) or [T5](model_doc/t5) to do it. We'll explain how BART works in this section, and then you can try finetuning T5 at the end. + +BART adapts to translation by adding a separate randomly initialized encoder to map a source language to an input that can be decoded into the target language. This new encoder's embeddings are passed to the pretrained encoder instead of the original word embeddings. The source encoder is trained by updating the source encoder, positional embeddings, and input embeddings with the cross-entropy loss from the model output. The model parameters are frozen in this first step, and all the model parameters are trained together in the second step. + +BART has since been followed up by a multilingual version, mBART, intended for translation and pretrained on many different languages. + +Ready to try your hand at translation? Check out our complete [translation guide](tasks/summarization) to learn how to finetune T5 and use it for inference! + + + +For more information about text generation, check out the [text generation strategies](generation_strategies) guide! + + \ No newline at end of file diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md new file mode 100644 index 000000000000..dcd0893c5c33 --- /dev/null +++ b/docs/source/en/testing.md @@ -0,0 +1,1293 @@ + + +# Testing + + +Let's take a look at how 🤗 Transformers models are tested and how you can write new tests and improve the existing ones. + +There are 2 test suites in the repository: + +1. `tests` -- tests for the general API +2. `examples` -- tests primarily for various applications that aren't part of the API + +## How transformers are tested + +1. Once a PR is submitted it gets tested with 9 CircleCi jobs. Every new commit to that PR gets retested. 
These jobs + are defined in this [config file](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml), so that if needed you can reproduce the same + environment on your machine. + + These CI jobs don't run `@slow` tests. + +2. There are 3 jobs run by [github actions](https://github.com/huggingface/transformers/actions): + + - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): checks whether torch hub + integration works. + + - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): runs fast tests on GPU only on commits on + `main`. It only runs if a commit on `main` has updated the code in one of the following folders: `src`, + `tests`, `.github` (to prevent running on added model cards, notebooks, etc.) + + - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): runs normal and slow tests on GPU in + `tests` and `examples`: + +```bash +RUN_SLOW=1 pytest tests/ +RUN_SLOW=1 pytest examples/ +``` + + The results can be observed [here](https://github.com/huggingface/transformers/actions). + + + +## Running tests + + + + + +### Choosing which tests to run + +This document goes into many details of how tests can be run. If after reading everything, you need even more details +you will find them [here](https://docs.pytest.org/en/latest/usage.html). + +Here are some most useful ways of running tests. + +Run all: + +```console +pytest +``` + +or: + +```bash +make test +``` + +Note that the latter is defined as: + +```bash +python -m pytest -n auto --dist=loadfile -s -v ./tests/ +``` + +which tells pytest to: + +- run as many test processes as they are CPU cores (which could be too many if you don't have a ton of RAM!) +- ensure that all tests from the same file will be run by the same test process +- do not capture output +- run in verbose mode + + + +### Getting the list of all tests + +All tests of the test suite: + +```bash +pytest --collect-only -q +``` + +All tests of a given test file: + +```bash +pytest tests/test_optimization.py --collect-only -q +``` + +### Run a specific test module + +To run an individual test module: + +```bash +pytest tests/utils/test_logging.py +``` + +### Run specific tests + +Since unittest is used inside most of the tests, to run specific subtests you need to know the name of the unittest +class containing those tests. For example, it could be: + +```bash +pytest tests/test_optimization.py::OptimizationTest::test_adam_w +``` + +Here: + +- `tests/test_optimization.py` - the file with tests +- `OptimizationTest` - the name of the class +- `test_adam_w` - the name of the specific test function + +If the file contains multiple classes, you can choose to run only tests of a given class. For example: + +```bash +pytest tests/test_optimization.py::OptimizationTest +``` + +will run all the tests inside that class. + +As mentioned earlier you can see what tests are contained inside the `OptimizationTest` class by running: + +```bash +pytest tests/test_optimization.py::OptimizationTest --collect-only -q +``` + +You can run tests by keyword expressions. + +To run only tests whose name contains `adam`: + +```bash +pytest -k adam tests/test_optimization.py +``` + +Logical `and` and `or` can be used to indicate whether all keywords should match or either. `not` can be used to +negate. 
+
+To run all tests except those whose name contains `adam`:
+
+```bash
+pytest -k "not adam" tests/test_optimization.py
+```
+
+And you can combine the two patterns in one:
+
+```bash
+pytest -k "ada and not adam" tests/test_optimization.py
+```
+
+For example, to run both `test_adafactor` and `test_adam_w` you can use:
+
+```bash
+pytest -k "test_adafactor or test_adam_w" tests/test_optimization.py
+```
+
+Note that we use `or` here, since we want either of the keywords to match to include both.
+
+If you want to include only tests that include both patterns, `and` is to be used:
+
+```bash
+pytest -k "test and ada" tests/test_optimization.py
+```
+
+### Run `accelerate` tests
+
+Sometimes you need to run `accelerate` tests on your models. To do so, just add `-m accelerate_tests` to your command. For example, to run these tests on `OPT`:
+```bash
+RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py
+```
+
+
+### Run documentation tests
+
+In order to test whether the documentation examples are correct, you should check that the `doctests` are passing.
+As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035):
+
+```python
+r"""
+Returns:
+
+Example:
+    ```python
+    >>> import torch
+    >>> from transformers import WhisperModel, WhisperFeatureExtractor
+    >>> from datasets import load_dataset
+
+    >>> model = WhisperModel.from_pretrained("openai/whisper-base")
+    >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
+    >>> input_features = inputs.input_features
+    >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
+    >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
+    >>> list(last_hidden_state.shape)
+    [1, 2, 512]
+    ```"""
+
+```
+
+Just run the following line to automatically test every docstring example in the desired file:
+```bash
+pytest --doctest-modules <path_to_file_or_dir>
+```
+If the file has a markdown extension, you should add the `--doctest-glob="*.md"` argument.
+
+### Run only modified tests
+
+You can run the tests related to the unstaged files or the current branch (according to Git) by using [pytest-picked](https://github.com/anapaulagomes/pytest-picked). This is a great way of quickly testing that your changes didn't break
+anything, since it won't run the tests related to files you didn't touch.
+
+```bash
+pip install pytest-picked
+```
+
+```bash
+pytest --picked
+```
+
+All tests will be run from files and folders which are modified, but not yet committed.
+
+### Automatically rerun failed tests on source modification
+
+[pytest-xdist](https://github.com/pytest-dev/pytest-xdist) provides a very useful feature of detecting all failed
+tests, then waiting for you to modify files and continuously re-running those failing tests until they pass while you
+fix them, so you don't need to restart pytest after you make a fix. This is repeated until all tests pass, after
+which a full run is performed again.
+
+```bash
+pip install pytest-xdist
+```
+
+To enter the mode: `pytest -f` or `pytest --looponfail`
+
+File changes are detected by looking at `looponfailroots` root directories and all of their contents (recursively).
+If the default for this value does not work for you, you can change it in your project by setting a configuration +option in `setup.cfg`: + +```ini +[tool:pytest] +looponfailroots = transformers tests +``` + +or `pytest.ini`/``tox.ini`` files: + +```ini +[pytest] +looponfailroots = transformers tests +``` + +This would lead to only looking for file changes in the respective directories, specified relatively to the ini-file’s +directory. + +[pytest-watch](https://github.com/joeyespo/pytest-watch) is an alternative implementation of this functionality. + + +### Skip a test module + +If you want to run all test modules, except a few you can exclude them by giving an explicit list of tests to run. For +example, to run all except `test_modeling_*.py` tests: + +```bash +pytest *ls -1 tests/*py | grep -v test_modeling* +``` + +### Clearing state + +CI builds and when isolation is important (against speed), cache should be cleared: + +```bash +pytest --cache-clear tests +``` + +### Running tests in parallel + +As mentioned earlier `make test` runs tests in parallel via `pytest-xdist` plugin (`-n X` argument, e.g. `-n 2` +to run 2 parallel jobs). + +`pytest-xdist`'s `--dist=` option allows one to control how the tests are grouped. `--dist=loadfile` puts the +tests located in one file onto the same process. + +Since the order of executed tests is different and unpredictable, if running the test suite with `pytest-xdist` +produces failures (meaning we have some undetected coupled tests), use [pytest-replay](https://github.com/ESSS/pytest-replay) to replay the tests in the same order, which should help with then somehow +reducing that failing sequence to a minimum. + +### Test order and repetition + +It's good to repeat the tests several times, in sequence, randomly, or in sets, to detect any potential +inter-dependency and state-related bugs (tear down). And the straightforward multiple repetition is just good to detect +some problems that get uncovered by randomness of DL. + + +#### Repeat tests + +- [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): + +```bash +pip install pytest-flakefinder +``` + +And then run every test multiple times (50 by default): + +```bash +pytest --flake-finder --flake-runs=5 tests/test_failing_test.py +``` + + + +This plugin doesn't work with `-n` flag from `pytest-xdist`. + + + + + +There is another plugin `pytest-repeat`, but it doesn't work with `unittest`. + + + +#### Run tests in a random order + +```bash +pip install pytest-random-order +``` + +Important: the presence of `pytest-random-order` will automatically randomize tests, no configuration change or +command line options is required. + +As explained earlier this allows detection of coupled tests - where one test's state affects the state of another. When +`pytest-random-order` is installed it will print the random seed it used for that session, e.g: + +```bash +pytest tests +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.: + +```bash +pytest --random-order-seed=573663 +[...] +Using --random-order-bucket=module +Using --random-order-seed=573663 +``` + +It will only reproduce the exact order if you use the exact same list of tests (or no list at all). 
Once you start to +manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order +they failed and tell pytest to not randomize them instead using `--random-order-bucket=none`, e.g.: + +```bash +pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py +``` + +To disable the shuffling for all tests: + +```bash +pytest --random-order-bucket=none +``` + +By default `--random-order-bucket=module` is implied, which will shuffle the files on the module levels. It can also +shuffle on `class`, `package`, `global` and `none` levels. For the complete details please see its +[documentation](https://github.com/jbasko/pytest-random-order). + +Another randomization alternative is: [`pytest-randomly`](https://github.com/pytest-dev/pytest-randomly). This +module has a very similar functionality/interface, but it doesn't have the bucket modes available in +`pytest-random-order`. It has the same problem of imposing itself once installed. + +### Look and feel variations + +#### pytest-sugar + +[pytest-sugar](https://github.com/Frozenball/pytest-sugar) is a plugin that improves the look-n-feel, adds a +progressbar, and show tests that fail and the assert instantly. It gets activated automatically upon installation. + +```bash +pip install pytest-sugar +``` + +To run tests without it, run: + +```bash +pytest -p no:sugar +``` + +or uninstall it. + + + +#### Report each sub-test name and its progress + +For a single or a group of tests via `pytest` (after `pip install pytest-pspec`): + +```bash +pytest --pspec tests/test_optimization.py +``` + +#### Instantly shows failed tests + +[pytest-instafail](https://github.com/pytest-dev/pytest-instafail) shows failures and errors instantly instead of +waiting until the end of test session. + +```bash +pip install pytest-instafail +``` + +```bash +pytest --instafail +``` + +### To GPU or not to GPU + +On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`: + +```bash +CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py +``` + +or if you have multiple gpus, you can specify which one is to be used by `pytest`. For example, to use only the +second gpu if you have gpus `0` and `1`, you can run: + +```bash +CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py +``` + +This is handy when you want to run different tasks on different GPUs. + +Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. 
The following skip +decorators are used to set the requirements of tests CPU/GPU/TPU-wise: + +- `require_torch` - this test will run only under torch +- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU +- `require_torch_multi_gpu` - as `require_torch` plus requires at least 2 GPUs +- `require_torch_non_multi_gpu` - as `require_torch` plus requires 0 or 1 GPUs +- `require_torch_up_to_2_gpus` - as `require_torch` plus requires 0 or 1 or 2 GPUs +- `require_torch_tpu` - as `require_torch` plus requires at least 1 TPU + +Let's depict the GPU requirements in the following table: + + +| n gpus | decorator | +|--------+--------------------------------| +| `>= 0` | `@require_torch` | +| `>= 1` | `@require_torch_gpu` | +| `>= 2` | `@require_torch_multi_gpu` | +| `< 2` | `@require_torch_non_multi_gpu` | +| `< 3` | `@require_torch_up_to_2_gpus` | + + +For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: + +```python no-style +@require_torch_multi_gpu +def test_example_with_multi_gpu(): +``` + +If a test requires `tensorflow` use the `require_tf` decorator. For example: + +```python no-style +@require_tf +def test_tf_thing_with_tensorflow(): +``` + +These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is +how to set it up: + +```python no-style +@require_torch_gpu +@slow +def test_example_slow_on_gpu(): +``` + +Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed +last for them to work correctly. Here is an example of the correct usage: + +```python no-style +@parameterized.expand(...) +@require_torch_multi_gpu +def test_integration_foo(): +``` + +This order problem doesn't exist with `@pytest.mark.parametrize`, you can put it first or last and it will still +work. But it only works with non-unittests. + +Inside tests: + +- How many GPUs are available: + +```python +from transformers.testing_utils import get_gpu_count + +n_gpu = get_gpu_count() # works with torch and tf +``` + +### Testing with a specific PyTorch backend or device + +To run the test suite on a specific torch device add `TRANSFORMERS_TEST_DEVICE="$device"` where `$device` is the target backend. For example, to test on CPU only: +```bash +TRANSFORMERS_TEST_DEVICE="cpu" pytest tests/utils/test_logging.py +``` + +This variable is useful for testing custom or less common PyTorch backends such as `mps`. It can also be used to achieve the same effect as `CUDA_VISIBLE_DEVICES` by targeting specific GPUs or testing in CPU-only mode. + +Certain devices will require an additional import after importing `torch` for the first time. This can be specified using the environment variable `TRANSFORMERS_TEST_BACKEND`: +```bash +TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py +``` + + +### Distributed training + +`pytest` can't deal with distributed training directly. If this is attempted - the sub-processes don't do the right +thing and end up thinking they are `pytest` and start running the test suite in loops. It works, however, if one +spawns a normal process that then spawns off multiple workers and manages the IO pipes. 
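+Roughly, the pattern looks like the sketch below. Treat it only as an illustration: the worker script name and the launcher arguments are made up for the example, but the helpers (`TestCasePlus` with `get_env`/`get_auto_remove_tmp_dir`, and `execute_subprocess_async`) are the ones from `transformers.testing_utils` that the existing distributed tests rely on:
+
+```python
+# a minimal sketch, assuming 2+ GPUs and a hypothetical worker script next to the test file
+import sys
+
+import torch
+
+from transformers.testing_utils import TestCasePlus, execute_subprocess_async, require_torch_multi_gpu
+
+
+class MyDistributedTest(TestCasePlus):
+    @require_torch_multi_gpu
+    def test_ddp_something(self):
+        # launch one worker process per GPU in a fresh subprocess
+        cmd = [
+            sys.executable,
+            "-m",
+            "torch.distributed.run",
+            f"--nproc_per_node={torch.cuda.device_count()}",
+            f"{self.test_file_dir}/my_ddp_worker.py",  # hypothetical script doing the actual distributed work
+            "--output_dir",
+            self.get_auto_remove_tmp_dir(),
+        ]
+        # streams the subprocess output and raises if it exits with an error
+        execute_subprocess_async(cmd, env=self.get_env())
+```
+
+The key point is that `pytest` itself only sees a single ordinary test; all the distributed logic happens inside the spawned subprocess.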
+ +Here are some tests that use it: + +- [test_trainer_distributed.py](https://github.com/huggingface/transformers/tree/main/tests/trainer/test_trainer_distributed.py) +- [test_deepspeed.py](https://github.com/huggingface/transformers/tree/main/tests/deepspeed/test_deepspeed.py) + +To jump right into the execution point, search for the `execute_subprocess_async` call in those tests. + +You will need at least 2 GPUs to see these tests in action: + +```bash +CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py +``` + +### Output capture + +During test execution any output sent to `stdout` and `stderr` is captured. If a test or a setup method fails, its +according captured output will usually be shown along with the failure traceback. + +To disable output capturing and to get the `stdout` and `stderr` normally, use `-s` or `--capture=no`: + +```bash +pytest -s tests/utils/test_logging.py +``` + +To send test results to JUnit format output: + +```bash +py.test tests --junitxml=result.xml +``` + +### Color control + +To have no color (e.g., yellow on white background is not readable): + +```bash +pytest --color=no tests/utils/test_logging.py +``` + +### Sending test report to online pastebin service + +Creating a URL for each test failure: + +```bash +pytest --pastebin=failed tests/utils/test_logging.py +``` + +This will submit test run information to a remote Paste service and provide a URL for each failure. You may select +tests as usual or add for example -x if you only want to send one particular failure. + +Creating a URL for a whole test session log: + +```bash +pytest --pastebin=all tests/utils/test_logging.py +``` + +## Writing tests + +🤗 transformers tests are based on `unittest`, but run by `pytest`, so most of the time features from both systems +can be used. + +You can read [here](https://docs.pytest.org/en/stable/unittest.html) which features are supported, but the important +thing to remember is that most `pytest` fixtures don't work. Neither parametrization, but we use the module +`parameterized` that works in a similar way. + + +### Parametrization + +Often, there is a need to run the same test multiple times, but with different arguments. It could be done from within +the test, but then there is no way of running that test for just one set of arguments. + +```python +# test_this1.py +import unittest +from parameterized import parameterized + + +class TestMathUnitTest(unittest.TestCase): + @parameterized.expand( + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ] + ) + def test_floor(self, name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Now, by default this test will be run 3 times, each time with the last 3 arguments of `test_floor` being assigned the +corresponding arguments in the parameter list. + +and you could run just the `negative` and `integer` sets of params with: + +```bash +pytest -k "negative and integer" tests/test_mytest.py +``` + +or all but `negative` sub-tests, with: + +```bash +pytest -k "not negative" tests/test_mytest.py +``` + +Besides using the `-k` filter that was just mentioned, you can find out the exact name of each sub-test and run any +or all of them using their exact names. 
+ +```bash +pytest test_this1.py --collect-only -q +``` + +and it will list: + +```bash +test_this1.py::TestMathUnitTest::test_floor_0_negative +test_this1.py::TestMathUnitTest::test_floor_1_integer +test_this1.py::TestMathUnitTest::test_floor_2_large_fraction +``` + +So now you can run just 2 specific sub-tests: + +```bash +pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer +``` + +The module [parameterized](https://pypi.org/project/parameterized/) which is already in the developer dependencies +of `transformers` works for both: `unittests` and `pytest` tests. + +If, however, the test is not a `unittest`, you may use `pytest.mark.parametrize` (or you may see it being used in +some existing tests, mostly under `examples`). + +Here is the same example, this time using `pytest`'s `parametrize` marker: + +```python +# test_this2.py +import pytest + + +@pytest.mark.parametrize( + "name, input, expected", + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ], +) +def test_floor(name, input, expected): + assert_equal(math.floor(input), expected) +``` + +Same as with `parameterized`, with `pytest.mark.parametrize` you can have a fine control over which sub-tests are +run, if the `-k` filter doesn't do the job. Except, this parametrization function creates a slightly different set of +names for the sub-tests. Here is what they look like: + +```bash +pytest test_this2.py --collect-only -q +``` + +and it will list: + +```bash +test_this2.py::test_floor[integer-1-1.0] +test_this2.py::test_floor[negative--1.5--2.0] +test_this2.py::test_floor[large fraction-1.6-1] +``` + +So now you can run just the specific test: + +```bash +pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] +``` + +as in the previous example. + + + +### Files and directories + +In tests often we need to know where things are relative to the current test file, and it's not trivial since the test +could be invoked from more than one directory or could reside in sub-directories with different depths. A helper class +`transformers.test_utils.TestCasePlus` solves this problem by sorting out all the basic paths and provides easy +accessors to them: + +- `pathlib` objects (all fully resolved): + + - `test_file_path` - the current test file path, i.e. `__file__` + - `test_file_dir` - the directory containing the current test file + - `tests_dir` - the directory of the `tests` test suite + - `examples_dir` - the directory of the `examples` test suite + - `repo_root_dir` - the directory of the repository + - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides) + +- stringified paths---same as above but these return paths as strings, rather than `pathlib` objects: + + - `test_file_path_str` + - `test_file_dir_str` + - `tests_dir_str` + - `examples_dir_str` + - `repo_root_dir_str` + - `src_dir_str` + +To start using those all you need is to make sure that the test resides in a subclass of +`transformers.test_utils.TestCasePlus`. For example: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_local_locations(self): + data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" +``` + +If you don't need to manipulate paths via `pathlib` or you just need a path as a string, you can always invoked +`str()` on the `pathlib` object or use the accessors ending with `_str`. 
For example: + +```python +from transformers.testing_utils import TestCasePlus + + +class PathExampleTest(TestCasePlus): + def test_something_involving_stringified_locations(self): + examples_dir = self.examples_dir_str +``` + +### Temporary files and directories + +Using unique temporary files and directories are essential for parallel test running, so that the tests won't overwrite +each other's data. Also we want to get the temporary files and directories removed at the end of each test that created +them. Therefore, using packages like `tempfile`, which address these needs is essential. + +However, when debugging tests, you need to be able to see what goes into the temporary file or directory and you want +to know it's exact path and not having it randomized on every test re-run. + +A helper class `transformers.test_utils.TestCasePlus` is best used for such purposes. It's a sub-class of +`unittest.TestCase`, so we can easily inherit from it in the test modules. + +Here is an example of its usage: + +```python +from transformers.testing_utils import TestCasePlus + + +class ExamplesTests(TestCasePlus): + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +This code creates a unique temporary directory, and sets `tmp_dir` to its location. + +- Create a unique temporary dir: + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +``` + +`tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the +test. + +- Create a temporary dir of my choice, ensure it's empty before the test starts and don't empty it after the test. + +```python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir("./xxx") +``` + +This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests didn't +leave any data in there. + +- You can override the default behavior by directly overriding the `before` and `after` args, leading to one of the + following behaviors: + + - `before=True`: the temporary dir will always be cleared at the beginning of the test. + - `before=False`: if the temporary dir already existed, any existing files will remain there. + - `after=True`: the temporary dir will always be deleted at the end of the test. + - `after=False`: the temporary dir will always be left intact at the end of the test. + + + +In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are allowed if +an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem will +get nuked. i.e. please always pass paths that start with `./`. + + + + + +Each test can register multiple temporary directories and they all will get auto-removed, unless requested +otherwise. + + + +### Temporary sys.path override + +If you need to temporary override `sys.path` to import from another test for example, you can use the +`ExtendSysPath` context manager. Example: + + +```python +import os +from transformers.testing_utils import ExtendSysPath + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa +``` + +### Skipping tests + +This is useful when a bug is found and a new test is written, yet the bug is not fixed yet. In order to be able to +commit it to the main repository we need make sure it's skipped during `make test`. 
+ +Methods: + +- A **skip** means that you expect your test to pass only if some conditions are met, otherwise pytest should skip + running the test altogether. Common examples are skipping windows-only tests on non-windows platforms, or skipping + tests that depend on an external resource which is not available at the moment (for example a database). + +- A **xfail** means that you expect a test to fail for some reason. A common example is a test for a feature not yet + implemented, or a bug not yet fixed. When a test passes despite being expected to fail (marked with + pytest.mark.xfail), it’s an xpass and will be reported in the test summary. + +One of the important differences between the two is that `skip` doesn't run the test, and `xfail` does. So if the +code that's buggy causes some bad state that will affect other tests, do not use `xfail`. + +#### Implementation + +- Here is how to skip whole test unconditionally: + +```python no-style +@unittest.skip("this bug needs to be fixed") +def test_feature_x(): +``` + +or via pytest: + +```python no-style +@pytest.mark.skip(reason="this bug needs to be fixed") +``` + +or the `xfail` way: + +```python no-style +@pytest.mark.xfail +def test_feature_x(): +``` + +- Here is how to skip a test based on some internal check inside the test: + +```python +def test_feature_x(): + if not has_something(): + pytest.skip("unsupported configuration") +``` + +or the whole module: + +```python +import pytest + +if not pytest.config.getoption("--custom-flag"): + pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) +``` + +or the `xfail` way: + +```python +def test_feature_x(): + pytest.xfail("expected to fail until bug XYZ is fixed") +``` + +- Here is how to skip all tests in a module if some import is missing: + +```python +docutils = pytest.importorskip("docutils", minversion="0.3") +``` + +- Skip a test based on a condition: + +```python no-style +@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") +def test_feature_x(): +``` + +or: + +```python no-style +@unittest.skipIf(torch_device == "cpu", "Can't do half precision") +def test_feature_x(): +``` + +or skip the whole module: + +```python no-style +@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") +class TestClass(): + def test_feature_x(self): +``` + +More details, example and ways are [here](https://docs.pytest.org/en/latest/skipping.html). + +### Slow tests + +The library of tests is ever-growing, and some of the tests take minutes to run, therefore we can't afford waiting for +an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be +marked as in the example below: + +```python no-style +from transformers.testing_utils import slow +@slow +def test_integration_foo(): +``` + +Once a test is marked as `@slow`, to run such tests set `RUN_SLOW=1` env var, e.g.: + +```bash +RUN_SLOW=1 pytest tests +``` + +Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators +`@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage: + +```python no-style +@parameteriz ed.expand(...) +@slow +def test_integration_foo(): +``` + +As explained at the beginning of this document, slow tests get to run on a scheduled basis, rather than in PRs CI +checks. So it's possible that some problems will be missed during a PR submission and get merged. 
Such problems will
+get caught during the next scheduled CI job. But it also means that it's important to run the slow tests on your
+machine before submitting the PR.
+
+Here is a rough decision-making mechanism for choosing which tests should be marked as slow:
+
+If the test is focused on one of the library's internal components (e.g., modeling files, tokenization files,
+pipelines), then we should run that test in the non-slow test suite. If it's focused on another aspect of the library,
+such as the documentation or the examples, then we should run these tests in the slow test suite. And then, to refine
+this approach we should have exceptions:
+
+- All tests that need to download a heavy set of weights or a dataset that is larger than ~50MB (e.g., model or
+  tokenizer integration tests, pipeline integration tests) should be set to slow. If you're adding a new model, you
+  should create and upload to the hub a tiny version of it (with random weights) for integration tests. This is
+  discussed in the following paragraphs.
+- All tests that need to do a training not specifically optimized to be fast should be set to slow.
+- We can introduce exceptions if some of these should-be-non-slow tests are excruciatingly slow, and set them to
+  `@slow`. Auto-modeling tests, which save and load large files to disk, are a good example of tests that are marked
+  as `@slow`.
+- If a test completes under 1 second on CI (including downloads if any) then it should be a normal test regardless.
+
+Collectively, all the non-slow tests need to cover the different internals entirely, while remaining fast. For example,
+significant coverage can be achieved by testing with specially created tiny models with random weights. Such models
+have the very minimal number of layers (e.g., 2), vocab size (e.g., 1000), etc. Then the `@slow` tests can use large
+slow models to do qualitative testing. To see the use of these simply look for *tiny* models with:
+
+```bash
+grep tiny tests examples
+```
+
+Here is an example of a [script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) that created the tiny model
+[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). You can easily adjust it to your specific
+model's architecture.
+
+It's easy to measure the run-time incorrectly if, for example, there is an overhead of downloading a huge model: when
+you test locally the downloaded files are cached, so the download time isn't measured. Hence check the
+execution speed report in CI logs instead (the output of `pytest --durations=0 tests`).
+
+That report is also useful to find slow outliers that aren't marked as such, or which need to be re-written to be fast.
+If you notice that the test suite starts getting slow on CI, the top listing of this report will show the slowest
+tests.
+
+
+### Testing the stdout/stderr output
+
+In order to test functions that write to `stdout` and/or `stderr`, the test can access those streams using
+`pytest`'s [capsys system](https://docs.pytest.org/en/latest/capture.html).
Here is how this is accomplished: + +```python +import sys + + +def print_to_stdout(s): + print(s) + + +def print_to_stderr(s): + sys.stderr.write(s) + + +def test_result_and_stdout(capsys): + msg = "Hello" + print_to_stdout(msg) + print_to_stderr(msg) + out, err = capsys.readouterr() # consume the captured output streams + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + sys.stderr.write(err) + # test: + assert msg in out + assert msg in err +``` + +And, of course, most of the time, `stderr` will come as a part of an exception, so try/except has to be used in such +a case: + +```python +def raise_exception(msg): + raise ValueError(msg) + + +def test_something_exception(): + msg = "Not a good value" + error = "" + try: + raise_exception(msg) + except Exception as e: + error = str(e) + assert msg in error, f"{msg} is in the exception:\n{error}" +``` + +Another approach to capturing stdout is via `contextlib.redirect_stdout`: + +```python +from io import StringIO +from contextlib import redirect_stdout + + +def print_to_stdout(s): + print(s) + + +def test_result_and_stdout(): + msg = "Hello" + buffer = StringIO() + with redirect_stdout(buffer): + print_to_stdout(msg) + out = buffer.getvalue() + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + # test: + assert msg in out +``` + +An important potential issue with capturing stdout is that it may contain `\r` characters that in normal `print` +reset everything that has been printed so far. There is no problem with `pytest`, but with `pytest -s` these +characters get included in the buffer, so to be able to have the test run with and without `-s`, you have to make an +extra cleanup to the captured output, using `re.sub(r'~.*\r', '', buf, 0, re.M)`. + +But, then we have a helper context manager wrapper to automatically take care of it all, regardless of whether it has +some `\r`'s in it or not, so it's a simple: + +```python +from transformers.testing_utils import CaptureStdout + +with CaptureStdout() as cs: + function_that_writes_to_stdout() +print(cs.out) +``` + +Here is a full test example: + +```python +from transformers.testing_utils import CaptureStdout + +msg = "Secret message\r" +final = "Hello World" +with CaptureStdout() as cs: + print(msg + final) +assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}" +``` + +If you'd like to capture `stderr` use the `CaptureStderr` class instead: + +```python +from transformers.testing_utils import CaptureStderr + +with CaptureStderr() as cs: + function_that_writes_to_stderr() +print(cs.err) +``` + +If you need to capture both streams at once, use the parent `CaptureStd` class: + +```python +from transformers.testing_utils import CaptureStd + +with CaptureStd() as cs: + function_that_writes_to_stdout_and_stderr() +print(cs.err, cs.out) +``` + +Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit +from the context. 
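+Conversely, if you need some output to bypass capturing entirely (say, a progress note you always want to see in the terminal), plain `pytest` offers the `capsys.disabled()` context manager. This is a generic pytest feature rather than a `transformers.testing_utils` helper; a minimal illustration:
+
+```python
+def test_disabling_capturing(capsys):
+    print("this output is captured")
+    with capsys.disabled():
+        # anything printed here goes straight to the terminal, even without `-s`
+        print("output not captured, going directly to sys.stdout")
+    print("this output is also captured")
+```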
+ + +### Capturing logger stream + +If you need to validate the output of a logger, you can use `CaptureLogger`: + +```python +from transformers import logging +from transformers.testing_utils import CaptureLogger + +msg = "Testing 1, 2, 3" +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.bart.tokenization_bart") +with CaptureLogger(logger) as cl: + logger.info(msg) +assert cl.out, msg + "\n" +``` + +### Testing with environment variables + +If you want to test the impact of environment variables for a specific test you can use a helper decorator +`transformers.testing_utils.mockenv` + +```python +from transformers.testing_utils import mockenv + + +class HfArgumentParserTest(unittest.TestCase): + @mockenv(TRANSFORMERS_VERBOSITY="error") + def test_env_override(self): + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) +``` + +At times an external program needs to be called, which requires setting `PYTHONPATH` in `os.environ` to include +multiple local paths. A helper class `transformers.test_utils.TestCasePlus` comes to help: + +```python +from transformers.testing_utils import TestCasePlus + + +class EnvExampleTest(TestCasePlus): + def test_external_prog(self): + env = self.get_env() + # now call the external program, passing `env` to it +``` + +Depending on whether the test file was under the `tests` test suite or `examples` it'll correctly set up +`env[PYTHONPATH]` to include one of these two directories, and also the `src` directory to ensure the testing is +done against the current repo, and finally with whatever `env[PYTHONPATH]` was already set to before the test was +called if anything. + +This helper method creates a copy of the `os.environ` object, so the original remains intact. + + +### Getting reproducible results + +In some situations you may want to remove randomness for your tests. To get identical reproducible results set, you +will need to fix the seed: + +```python +seed = 42 + +# python RNG +import random + +random.seed(seed) + +# pytorch RNGs +import torch + +torch.manual_seed(seed) +torch.backends.cudnn.deterministic = True +if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + +# numpy RNG +import numpy as np + +np.random.seed(seed) + +# tf RNG +tf.random.set_seed(seed) +``` + +### Debugging tests + +To start a debugger at the point of the warning, do this: + +```bash +pytest tests/utils/test_logging.py -W error::UserWarning --pdb +``` + +## Working with github actions workflows + +To trigger a self-push workflow CI job, you must: + +1. Create a new branch on `transformers` origin (not a fork!). +2. The branch name has to start with either `ci_` or `ci-` (`main` triggers it too, but we can't do PRs on + `main`). It also gets triggered only for specific paths - you can find the up-to-date definition in case it + changed since this document has been written [here](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml) under *push:* +3. Create a PR from this branch. +4. Then you can see the job appear [here](https://github.com/huggingface/transformers/actions/workflows/self-push.yml). It may not run right away if there + is a backlog. + + + + +## Testing Experimental CI Features + +Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. Therefore if a +new CI feature is to be added, it should be done as following. + +1. Create a new dedicated job that tests what needs to be tested +2. 
The new job must always succeed so that it gives us a green ✓ (details below). +3. Let it run for some days to see that a variety of different PR types get to run on it (user fork branches, + non-forked branches, branches originating from github.com UI direct file edit, various forced pushes, etc. - there + are so many) while monitoring the experimental job's logs (not the overall job green as it's purposefully always + green) +4. When it's clear that everything is solid, then merge the new changes into existing jobs. + +That way experiments on CI functionality itself won't interfere with the normal workflow. + +Now how can we make the job always succeed while the new CI feature is being developed? + +Some CIs, like TravisCI support ignore-step-failure and will report the overall job as successful, but CircleCI and +Github Actions as of this writing don't support that. + +So the following workaround can be used: + +1. `set +euo pipefail` at the beginning of the run command to suppress most potential failures in the bash script. +2. the last command must be a success: `echo "done"` or just `true` will do + +Here is an example: + +```yaml +- run: + name: run CI experiment + command: | + set +euo pipefail + echo "setting run-all-despite-any-errors-mode" + this_command_will_fail + echo "but bash continues to run" + # emulate another failure + false + # but the last command must be a success + echo "during experiment do not remove: reporting success to CI, even if there were failures" +``` + +For simple commands you could also do: + +```bash +cmd_that_may_fail || true +``` + +Of course, once satisfied with the results, integrate the experimental step or job with the rest of the normal jobs, +while removing `set +euo pipefail` or any other things you may have added to ensure that the experimental job doesn't +interfere with the normal CI functioning. + +This whole process would have been much easier if we only could set something like `allow-failure` for the +experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and +Github Actions don't support it at the moment. + +You can vote for this feature and see where it is at these CI-specific threads: + +- [Github Actions:](https://github.com/actions/toolkit/issues/399) +- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx deleted file mode 100644 index cb03a57b0413..000000000000 --- a/docs/source/en/testing.mdx +++ /dev/null @@ -1,1273 +0,0 @@ - - -# Testing - - -Let's take a look at how 🤗 Transformers models are tested and how you can write new tests and improve the existing ones. - -There are 2 test suites in the repository: - -1. `tests` -- tests for the general API -2. `examples` -- tests primarily for various applications that aren't part of the API - -## How transformers are tested - -1. Once a PR is submitted it gets tested with 9 CircleCi jobs. Every new commit to that PR gets retested. These jobs - are defined in this [config file](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml), so that if needed you can reproduce the same - environment on your machine. - - These CI jobs don't run `@slow` tests. - -2. There are 3 jobs run by [github actions](https://github.com/huggingface/transformers/actions): - - - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): checks whether torch hub - integration works. 
- - - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): runs fast tests on GPU only on commits on - `main`. It only runs if a commit on `main` has updated the code in one of the following folders: `src`, - `tests`, `.github` (to prevent running on added model cards, notebooks, etc.) - - - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): runs normal and slow tests on GPU in - `tests` and `examples`: - -```bash -RUN_SLOW=1 pytest tests/ -RUN_SLOW=1 pytest examples/ -``` - - The results can be observed [here](https://github.com/huggingface/transformers/actions). - - - -## Running tests - - - - - -### Choosing which tests to run - -This document goes into many details of how tests can be run. If after reading everything, you need even more details -you will find them [here](https://docs.pytest.org/en/latest/usage.html). - -Here are some most useful ways of running tests. - -Run all: - -```console -pytest -``` - -or: - -```bash -make test -``` - -Note that the latter is defined as: - -```bash -python -m pytest -n auto --dist=loadfile -s -v ./tests/ -``` - -which tells pytest to: - -- run as many test processes as they are CPU cores (which could be too many if you don't have a ton of RAM!) -- ensure that all tests from the same file will be run by the same test process -- do not capture output -- run in verbose mode - - - -### Getting the list of all tests - -All tests of the test suite: - -```bash -pytest --collect-only -q -``` - -All tests of a given test file: - -```bash -pytest tests/test_optimization.py --collect-only -q -``` - -### Run a specific test module - -To run an individual test module: - -```bash -pytest tests/test_logging.py -``` - -### Run specific tests - -Since unittest is used inside most of the tests, to run specific subtests you need to know the name of the unittest -class containing those tests. For example, it could be: - -```bash -pytest tests/test_optimization.py::OptimizationTest::test_adam_w -``` - -Here: - -- `tests/test_optimization.py` - the file with tests -- `OptimizationTest` - the name of the class -- `test_adam_w` - the name of the specific test function - -If the file contains multiple classes, you can choose to run only tests of a given class. For example: - -```bash -pytest tests/test_optimization.py::OptimizationTest -``` - -will run all the tests inside that class. - -As mentioned earlier you can see what tests are contained inside the `OptimizationTest` class by running: - -```bash -pytest tests/test_optimization.py::OptimizationTest --collect-only -q -``` - -You can run tests by keyword expressions. - -To run only tests whose name contains `adam`: - -```bash -pytest -k adam tests/test_optimization.py -``` - -Logical `and` and `or` can be used to indicate whether all keywords should match or either. `not` can be used to -negate. - -To run all tests except those whose name contains `adam`: - -```bash -pytest -k "not adam" tests/test_optimization.py -``` - -And you can combine the two patterns in one: - -```bash -pytest -k "ada and not adam" tests/test_optimization.py -``` - -For example to run both `test_adafactor` and `test_adam_w` you can use: - -```bash -pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py -``` - -Note that we use `or` here, since we want either of the keywords to match to include both. 
- -If you want to include only tests that include both patterns, `and` is to be used: - -```bash -pytest -k "test and ada" tests/test_optimization.py -``` -### Run documentation tests - -In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. -As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035): - -```python -r""" -Returns: - -Example: - ```python - >>> import torch - >>> from transformers import WhisperModel, WhisperFeatureExtractor - >>> from datasets import load_dataset - - >>> model = WhisperModel.from_pretrained("openai/whisper-base") - >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") - >>> input_features = inputs.input_features - >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id - >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state - >>> list(last_hidden_state.shape) - [1, 2, 512] - ```""" - -``` -3 steps are required to debug the docstring examples: -1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: -```bash -python utils/prepare_for_doc_test.py -``` - -2. Then, you can use the following line to automatically test every docstring example in the desired file: -```bash -pytest --doctest-modules -``` -3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: -```bash -python utils/prepare_for_doc_test.py --remove_new_line -``` - -### Run only modified tests - -You can run the tests related to the unstaged files or the current branch (according to Git) by using [pytest-picked](https://github.com/anapaulagomes/pytest-picked). This is a great way of quickly testing your changes didn't break -anything, since it won't run the tests related to files you didn't touch. - -```bash -pip install pytest-picked -``` - -```bash -pytest --picked -``` - -All tests will be run from files and folders which are modified, but not yet committed. - -### Automatically rerun failed tests on source modification - -[pytest-xdist](https://github.com/pytest-dev/pytest-xdist) provides a very useful feature of detecting all failed -tests, and then waiting for you to modify files and continuously re-rerun those failing tests until they pass while you -fix them. So that you don't need to re start pytest after you made the fix. This is repeated until all tests pass after -which again a full run is performed. - -```bash -pip install pytest-xdist -``` - -To enter the mode: `pytest -f` or `pytest --looponfail` - -File changes are detected by looking at `looponfailroots` root directories and all of their contents (recursively). -If the default for this value does not work for you, you can change it in your project by setting a configuration -option in `setup.cfg`: - -```ini -[tool:pytest] -looponfailroots = transformers tests -``` - -or `pytest.ini`/``tox.ini`` files: - -```ini -[pytest] -looponfailroots = transformers tests -``` - -This would lead to only looking for file changes in the respective directories, specified relatively to the ini-file’s -directory. 
- -[pytest-watch](https://github.com/joeyespo/pytest-watch) is an alternative implementation of this functionality. - - -### Skip a test module - -If you want to run all test modules, except a few you can exclude them by giving an explicit list of tests to run. For -example, to run all except `test_modeling_*.py` tests: - -```bash -pytest *ls -1 tests/*py | grep -v test_modeling* -``` - -### Clearing state - -CI builds and when isolation is important (against speed), cache should be cleared: - -```bash -pytest --cache-clear tests -``` - -### Running tests in parallel - -As mentioned earlier `make test` runs tests in parallel via `pytest-xdist` plugin (`-n X` argument, e.g. `-n 2` -to run 2 parallel jobs). - -`pytest-xdist`'s `--dist=` option allows one to control how the tests are grouped. `--dist=loadfile` puts the -tests located in one file onto the same process. - -Since the order of executed tests is different and unpredictable, if running the test suite with `pytest-xdist` -produces failures (meaning we have some undetected coupled tests), use [pytest-replay](https://github.com/ESSS/pytest-replay) to replay the tests in the same order, which should help with then somehow -reducing that failing sequence to a minimum. - -### Test order and repetition - -It's good to repeat the tests several times, in sequence, randomly, or in sets, to detect any potential -inter-dependency and state-related bugs (tear down). And the straightforward multiple repetition is just good to detect -some problems that get uncovered by randomness of DL. - - -#### Repeat tests - -- [pytest-flakefinder](https://github.com/dropbox/pytest-flakefinder): - -```bash -pip install pytest-flakefinder -``` - -And then run every test multiple times (50 by default): - -```bash -pytest --flake-finder --flake-runs=5 tests/test_failing_test.py -``` - - - -This plugin doesn't work with `-n` flag from `pytest-xdist`. - - - - - -There is another plugin `pytest-repeat`, but it doesn't work with `unittest`. - - - -#### Run tests in a random order - -```bash -pip install pytest-random-order -``` - -Important: the presence of `pytest-random-order` will automatically randomize tests, no configuration change or -command line options is required. - -As explained earlier this allows detection of coupled tests - where one test's state affects the state of another. When -`pytest-random-order` is installed it will print the random seed it used for that session, e.g: - -```bash -pytest tests -[...] -Using --random-order-bucket=module -Using --random-order-seed=573663 -``` - -So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.: - -```bash -pytest --random-order-seed=573663 -[...] -Using --random-order-bucket=module -Using --random-order-seed=573663 -``` - -It will only reproduce the exact order if you use the exact same list of tests (or no list at all). Once you start to -manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order -they failed and tell pytest to not randomize them instead using `--random-order-bucket=none`, e.g.: - -```bash -pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py -``` - -To disable the shuffling for all tests: - -```bash -pytest --random-order-bucket=none -``` - -By default `--random-order-bucket=module` is implied, which will shuffle the files on the module levels. It can also -shuffle on `class`, `package`, `global` and `none` levels. 
For the complete details please see its -[documentation](https://github.com/jbasko/pytest-random-order). - -Another randomization alternative is: [`pytest-randomly`](https://github.com/pytest-dev/pytest-randomly). This -module has a very similar functionality/interface, but it doesn't have the bucket modes available in -`pytest-random-order`. It has the same problem of imposing itself once installed. - -### Look and feel variations - -#### pytest-sugar - -[pytest-sugar](https://github.com/Frozenball/pytest-sugar) is a plugin that improves the look-n-feel, adds a -progressbar, and show tests that fail and the assert instantly. It gets activated automatically upon installation. - -```bash -pip install pytest-sugar -``` - -To run tests without it, run: - -```bash -pytest -p no:sugar -``` - -or uninstall it. - - - -#### Report each sub-test name and its progress - -For a single or a group of tests via `pytest` (after `pip install pytest-pspec`): - -```bash -pytest --pspec tests/test_optimization.py -``` - -#### Instantly shows failed tests - -[pytest-instafail](https://github.com/pytest-dev/pytest-instafail) shows failures and errors instantly instead of -waiting until the end of test session. - -```bash -pip install pytest-instafail -``` - -```bash -pytest --instafail -``` - -### To GPU or not to GPU - -On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`: - -```bash -CUDA_VISIBLE_DEVICES="" pytest tests/test_logging.py -``` - -or if you have multiple gpus, you can specify which one is to be used by `pytest`. For example, to use only the -second gpu if you have gpus `0` and `1`, you can run: - -```bash -CUDA_VISIBLE_DEVICES="1" pytest tests/test_logging.py -``` - -This is handy when you want to run different tasks on different GPUs. - -Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. The following skip -decorators are used to set the requirements of tests CPU/GPU/TPU-wise: - -- `require_torch` - this test will run only under torch -- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU -- `require_torch_multi_gpu` - as `require_torch` plus requires at least 2 GPUs -- `require_torch_non_multi_gpu` - as `require_torch` plus requires 0 or 1 GPUs -- `require_torch_up_to_2_gpus` - as `require_torch` plus requires 0 or 1 or 2 GPUs -- `require_torch_tpu` - as `require_torch` plus requires at least 1 TPU - -Let's depict the GPU requirements in the following table: - - -| n gpus | decorator | -|--------+--------------------------------| -| `>= 0` | `@require_torch` | -| `>= 1` | `@require_torch_gpu` | -| `>= 2` | `@require_torch_multi_gpu` | -| `< 2` | `@require_torch_non_multi_gpu` | -| `< 3` | `@require_torch_up_to_2_gpus` | - - -For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: - -```python no-style -@require_torch_multi_gpu -def test_example_with_multi_gpu(): -``` - -If a test requires `tensorflow` use the `require_tf` decorator. For example: - -```python no-style -@require_tf -def test_tf_thing_with_tensorflow(): -``` - -These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is -how to set it up: - -```python no-style -@require_torch_gpu -@slow -def test_example_slow_on_gpu(): -``` - -Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed -last for them to work correctly. 
Here is an example of the correct usage: - -```python no-style -@parameterized.expand(...) -@require_torch_multi_gpu -def test_integration_foo(): -``` - -This order problem doesn't exist with `@pytest.mark.parametrize`, you can put it first or last and it will still -work. But it only works with non-unittests. - -Inside tests: - -- How many GPUs are available: - -```python -from transformers.testing_utils import get_gpu_count - -n_gpu = get_gpu_count() # works with torch and tf -``` - -### Distributed training - -`pytest` can't deal with distributed training directly. If this is attempted - the sub-processes don't do the right -thing and end up thinking they are `pytest` and start running the test suite in loops. It works, however, if one -spawns a normal process that then spawns off multiple workers and manages the IO pipes. - -Here are some tests that use it: - -- [test_trainer_distributed.py](https://github.com/huggingface/transformers/tree/main/tests/trainer/test_trainer_distributed.py) -- [test_deepspeed.py](https://github.com/huggingface/transformers/tree/main/tests/deepspeed/test_deepspeed.py) - -To jump right into the execution point, search for the `execute_subprocess_async` call in those tests. - -You will need at least 2 GPUs to see these tests in action: - -```bash -CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py -``` - -### Output capture - -During test execution any output sent to `stdout` and `stderr` is captured. If a test or a setup method fails, its -according captured output will usually be shown along with the failure traceback. - -To disable output capturing and to get the `stdout` and `stderr` normally, use `-s` or `--capture=no`: - -```bash -pytest -s tests/test_logging.py -``` - -To send test results to JUnit format output: - -```bash -py.test tests --junitxml=result.xml -``` - -### Color control - -To have no color (e.g., yellow on white background is not readable): - -```bash -pytest --color=no tests/test_logging.py -``` - -### Sending test report to online pastebin service - -Creating a URL for each test failure: - -```bash -pytest --pastebin=failed tests/test_logging.py -``` - -This will submit test run information to a remote Paste service and provide a URL for each failure. You may select -tests as usual or add for example -x if you only want to send one particular failure. - -Creating a URL for a whole test session log: - -```bash -pytest --pastebin=all tests/test_logging.py -``` - -## Writing tests - -🤗 transformers tests are based on `unittest`, but run by `pytest`, so most of the time features from both systems -can be used. - -You can read [here](https://docs.pytest.org/en/stable/unittest.html) which features are supported, but the important -thing to remember is that most `pytest` fixtures don't work. Neither parametrization, but we use the module -`parameterized` that works in a similar way. - - -### Parametrization - -Often, there is a need to run the same test multiple times, but with different arguments. It could be done from within -the test, but then there is no way of running that test for just one set of arguments. 
- -```python -# test_this1.py -import unittest -from parameterized import parameterized - - -class TestMathUnitTest(unittest.TestCase): - @parameterized.expand( - [ - ("negative", -1.5, -2.0), - ("integer", 1, 1.0), - ("large fraction", 1.6, 1), - ] - ) - def test_floor(self, name, input, expected): - assert_equal(math.floor(input), expected) -``` - -Now, by default this test will be run 3 times, each time with the last 3 arguments of `test_floor` being assigned the -corresponding arguments in the parameter list. - -and you could run just the `negative` and `integer` sets of params with: - -```bash -pytest -k "negative and integer" tests/test_mytest.py -``` - -or all but `negative` sub-tests, with: - -```bash -pytest -k "not negative" tests/test_mytest.py -``` - -Besides using the `-k` filter that was just mentioned, you can find out the exact name of each sub-test and run any -or all of them using their exact names. - -```bash -pytest test_this1.py --collect-only -q -``` - -and it will list: - -```bash -test_this1.py::TestMathUnitTest::test_floor_0_negative -test_this1.py::TestMathUnitTest::test_floor_1_integer -test_this1.py::TestMathUnitTest::test_floor_2_large_fraction -``` - -So now you can run just 2 specific sub-tests: - -```bash -pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer -``` - -The module [parameterized](https://pypi.org/project/parameterized/) which is already in the developer dependencies -of `transformers` works for both: `unittests` and `pytest` tests. - -If, however, the test is not a `unittest`, you may use `pytest.mark.parametrize` (or you may see it being used in -some existing tests, mostly under `examples`). - -Here is the same example, this time using `pytest`'s `parametrize` marker: - -```python -# test_this2.py -import pytest - - -@pytest.mark.parametrize( - "name, input, expected", - [ - ("negative", -1.5, -2.0), - ("integer", 1, 1.0), - ("large fraction", 1.6, 1), - ], -) -def test_floor(name, input, expected): - assert_equal(math.floor(input), expected) -``` - -Same as with `parameterized`, with `pytest.mark.parametrize` you can have a fine control over which sub-tests are -run, if the `-k` filter doesn't do the job. Except, this parametrization function creates a slightly different set of -names for the sub-tests. Here is what they look like: - -```bash -pytest test_this2.py --collect-only -q -``` - -and it will list: - -```bash -test_this2.py::test_floor[integer-1-1.0] -test_this2.py::test_floor[negative--1.5--2.0] -test_this2.py::test_floor[large fraction-1.6-1] -``` - -So now you can run just the specific test: - -```bash -pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] -``` - -as in the previous example. - - - -### Files and directories - -In tests often we need to know where things are relative to the current test file, and it's not trivial since the test -could be invoked from more than one directory or could reside in sub-directories with different depths. A helper class -`transformers.test_utils.TestCasePlus` solves this problem by sorting out all the basic paths and provides easy -accessors to them: - -- `pathlib` objects (all fully resolved): - - - `test_file_path` - the current test file path, i.e. 
`__file__` - - `test_file_dir` - the directory containing the current test file - - `tests_dir` - the directory of the `tests` test suite - - `examples_dir` - the directory of the `examples` test suite - - `repo_root_dir` - the directory of the repository - - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides) - -- stringified paths---same as above but these return paths as strings, rather than `pathlib` objects: - - - `test_file_path_str` - - `test_file_dir_str` - - `tests_dir_str` - - `examples_dir_str` - - `repo_root_dir_str` - - `src_dir_str` - -To start using those all you need is to make sure that the test resides in a subclass of -`transformers.test_utils.TestCasePlus`. For example: - -```python -from transformers.testing_utils import TestCasePlus - - -class PathExampleTest(TestCasePlus): - def test_something_involving_local_locations(self): - data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" -``` - -If you don't need to manipulate paths via `pathlib` or you just need a path as a string, you can always invoked -`str()` on the `pathlib` object or use the accessors ending with `_str`. For example: - -```python -from transformers.testing_utils import TestCasePlus - - -class PathExampleTest(TestCasePlus): - def test_something_involving_stringified_locations(self): - examples_dir = self.examples_dir_str -``` - -### Temporary files and directories - -Using unique temporary files and directories are essential for parallel test running, so that the tests won't overwrite -each other's data. Also we want to get the temporary files and directories removed at the end of each test that created -them. Therefore, using packages like `tempfile`, which address these needs is essential. - -However, when debugging tests, you need to be able to see what goes into the temporary file or directory and you want -to know it's exact path and not having it randomized on every test re-run. - -A helper class `transformers.test_utils.TestCasePlus` is best used for such purposes. It's a sub-class of -`unittest.TestCase`, so we can easily inherit from it in the test modules. - -Here is an example of its usage: - -```python -from transformers.testing_utils import TestCasePlus - - -class ExamplesTests(TestCasePlus): - def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir() -``` - -This code creates a unique temporary directory, and sets `tmp_dir` to its location. - -- Create a unique temporary dir: - -```python -def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir() -``` - -`tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the -test. - -- Create a temporary dir of my choice, ensure it's empty before the test starts and don't empty it after the test. - -```python -def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir("./xxx") -``` - -This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests didn't -leave any data in there. - -- You can override the default behavior by directly overriding the `before` and `after` args, leading to one of the - following behaviors: - - - `before=True`: the temporary dir will always be cleared at the beginning of the test. - - `before=False`: if the temporary dir already existed, any existing files will remain there. - - `after=True`: the temporary dir will always be deleted at the end of the test. - - `after=False`: the temporary dir will always be left intact at the end of the test. 
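Combining these arguments, here is a minimal sketch (reusing the `TestCasePlus` helper shown above; `./xxx` is just an example path) that keeps a fixed directory around for inspection while making sure it starts out empty:

```python
from transformers.testing_utils import TestCasePlus


class ExamplesTests(TestCasePlus):
    def test_whatever(self):
        # fixed path, emptied before the test starts, left intact afterwards for debugging
        tmp_dir = self.get_auto_remove_tmp_dir("./xxx", before=True, after=False)
```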
- - - -In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are allowed if -an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem will -get nuked. i.e. please always pass paths that start with `./`. - - - - - -Each test can register multiple temporary directories and they all will get auto-removed, unless requested -otherwise. - - - -### Temporary sys.path override - -If you need to temporary override `sys.path` to import from another test for example, you can use the -`ExtendSysPath` context manager. Example: - - -```python -import os -from transformers.testing_utils import ExtendSysPath - -bindir = os.path.abspath(os.path.dirname(__file__)) -with ExtendSysPath(f"{bindir}/.."): - from test_trainer import TrainerIntegrationCommon # noqa -``` - -### Skipping tests - -This is useful when a bug is found and a new test is written, yet the bug is not fixed yet. In order to be able to -commit it to the main repository we need make sure it's skipped during `make test`. - -Methods: - -- A **skip** means that you expect your test to pass only if some conditions are met, otherwise pytest should skip - running the test altogether. Common examples are skipping windows-only tests on non-windows platforms, or skipping - tests that depend on an external resource which is not available at the moment (for example a database). - -- A **xfail** means that you expect a test to fail for some reason. A common example is a test for a feature not yet - implemented, or a bug not yet fixed. When a test passes despite being expected to fail (marked with - pytest.mark.xfail), it’s an xpass and will be reported in the test summary. - -One of the important differences between the two is that `skip` doesn't run the test, and `xfail` does. So if the -code that's buggy causes some bad state that will affect other tests, do not use `xfail`. - -#### Implementation - -- Here is how to skip whole test unconditionally: - -```python no-style -@unittest.skip("this bug needs to be fixed") -def test_feature_x(): -``` - -or via pytest: - -```python no-style -@pytest.mark.skip(reason="this bug needs to be fixed") -``` - -or the `xfail` way: - -```python no-style -@pytest.mark.xfail -def test_feature_x(): -``` - -- Here is how to skip a test based on some internal check inside the test: - -```python -def test_feature_x(): - if not has_something(): - pytest.skip("unsupported configuration") -``` - -or the whole module: - -```python -import pytest - -if not pytest.config.getoption("--custom-flag"): - pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) -``` - -or the `xfail` way: - -```python -def test_feature_x(): - pytest.xfail("expected to fail until bug XYZ is fixed") -``` - -- Here is how to skip all tests in a module if some import is missing: - -```python -docutils = pytest.importorskip("docutils", minversion="0.3") -``` - -- Skip a test based on a condition: - -```python no-style -@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") -def test_feature_x(): -``` - -or: - -```python no-style -@unittest.skipIf(torch_device == "cpu", "Can't do half precision") -def test_feature_x(): -``` - -or skip the whole module: - -```python no-style -@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") -class TestClass(): - def test_feature_x(self): -``` - -More details, example and ways are [here](https://docs.pytest.org/en/latest/skipping.html). 
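One more pattern worth knowing: `xfail` can also take a condition, in the same spirit as `skipif`. A small sketch (the condition and reason below are placeholders):

```python
import sys

import pytest


@pytest.mark.xfail(sys.platform == "win32", reason="bug in path handling, not fixed yet")
def test_feature_x():
    ...
```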
- -### Slow tests - -The library of tests is ever-growing, and some of the tests take minutes to run, therefore we can't afford waiting for -an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be -marked as in the example below: - -```python no-style -from transformers.testing_utils import slow -@slow -def test_integration_foo(): -``` - -Once a test is marked as `@slow`, to run such tests set `RUN_SLOW=1` env var, e.g.: - -```bash -RUN_SLOW=1 pytest tests -``` - -Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators -`@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage: - -```python no-style -@parameteriz ed.expand(...) -@slow -def test_integration_foo(): -``` - -As explained at the beginning of this document, slow tests get to run on a scheduled basis, rather than in PRs CI -checks. So it's possible that some problems will be missed during a PR submission and get merged. Such problems will -get caught during the next scheduled CI job. But it also means that it's important to run the slow tests on your -machine before submitting the PR. - -Here is a rough decision making mechanism for choosing which tests should be marked as slow: - -If the test is focused on one of the library's internal components (e.g., modeling files, tokenization files, -pipelines), then we should run that test in the non-slow test suite. If it's focused on an other aspect of the library, -such as the documentation or the examples, then we should run these tests in the slow test suite. And then, to refine -this approach we should have exceptions: - -- All tests that need to download a heavy set of weights or a dataset that is larger than ~50MB (e.g., model or - tokenizer integration tests, pipeline integration tests) should be set to slow. If you're adding a new model, you - should create and upload to the hub a tiny version of it (with random weights) for integration tests. This is - discussed in the following paragraphs. -- All tests that need to do a training not specifically optimized to be fast should be set to slow. -- We can introduce exceptions if some of these should-be-non-slow tests are excruciatingly slow, and set them to - `@slow`. Auto-modeling tests, which save and load large files to disk, are a good example of tests that are marked - as `@slow`. -- If a test completes under 1 second on CI (including downloads if any) then it should be a normal test regardless. - -Collectively, all the non-slow tests need to cover entirely the different internals, while remaining fast. For example, -a significant coverage can be achieved by testing with specially created tiny models with random weights. Such models -have the very minimal number of layers (e.g., 2), vocab size (e.g., 1000), etc. Then the `@slow` tests can use large -slow models to do qualitative testing. To see the use of these simply look for *tiny* models with: - -```bash -grep tiny tests examples -``` - -Here is a an example of a [script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) that created the tiny model -[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de). You can easily adjust it to your specific -model's architecture. 
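To make the split concrete, here is a hedged sketch of how a test class might pair a fast tiny-model test with a `@slow` full-checkpoint test. The class and test names are made up; `stas/tiny-wmt19-en-de` is the tiny model mentioned above and `facebook/wmt19-en-de` stands in for the corresponding full-size checkpoint:

```python
import unittest

from transformers import AutoModelForSeq2SeqLM
from transformers.testing_utils import require_torch, slow


@require_torch
class TranslationModelTest(unittest.TestCase):
    def test_forward_tiny(self):
        # fast: tiny checkpoint with random weights, runs in the regular CI
        model = AutoModelForSeq2SeqLM.from_pretrained("stas/tiny-wmt19-en-de")
        self.assertIsNotNone(model.config)

    @slow
    def test_generation_quality(self):
        # slow: full-size checkpoint, runs only on the scheduled CI
        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/wmt19-en-de")
        self.assertIsNotNone(model.config)
```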
- -It's easy to measure the run-time incorrectly if for example there is an overheard of downloading a huge model, but if -you test it locally the downloaded files would be cached and thus the download time not measured. Hence check the -execution speed report in CI logs instead (the output of `pytest --durations=0 tests`). - -That report is also useful to find slow outliers that aren't marked as such, or which need to be re-written to be fast. -If you notice that the test suite starts getting slow on CI, the top listing of this report will show the slowest -tests. - - -### Testing the stdout/stderr output - -In order to test functions that write to `stdout` and/or `stderr`, the test can access those streams using the -`pytest`'s [capsys system](https://docs.pytest.org/en/latest/capture.html). Here is how this is accomplished: - -```python -import sys - - -def print_to_stdout(s): - print(s) - - -def print_to_stderr(s): - sys.stderr.write(s) - - -def test_result_and_stdout(capsys): - msg = "Hello" - print_to_stdout(msg) - print_to_stderr(msg) - out, err = capsys.readouterr() # consume the captured output streams - # optional: if you want to replay the consumed streams: - sys.stdout.write(out) - sys.stderr.write(err) - # test: - assert msg in out - assert msg in err -``` - -And, of course, most of the time, `stderr` will come as a part of an exception, so try/except has to be used in such -a case: - -```python -def raise_exception(msg): - raise ValueError(msg) - - -def test_something_exception(): - msg = "Not a good value" - error = "" - try: - raise_exception(msg) - except Exception as e: - error = str(e) - assert msg in error, f"{msg} is in the exception:\n{error}" -``` - -Another approach to capturing stdout is via `contextlib.redirect_stdout`: - -```python -from io import StringIO -from contextlib import redirect_stdout - - -def print_to_stdout(s): - print(s) - - -def test_result_and_stdout(): - msg = "Hello" - buffer = StringIO() - with redirect_stdout(buffer): - print_to_stdout(msg) - out = buffer.getvalue() - # optional: if you want to replay the consumed streams: - sys.stdout.write(out) - # test: - assert msg in out -``` - -An important potential issue with capturing stdout is that it may contain `\r` characters that in normal `print` -reset everything that has been printed so far. There is no problem with `pytest`, but with `pytest -s` these -characters get included in the buffer, so to be able to have the test run with and without `-s`, you have to make an -extra cleanup to the captured output, using `re.sub(r'~.*\r', '', buf, 0, re.M)`. 
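For instance, a minimal helper applying that cleanup (the function name here is made up; the regular expression is the one quoted above):

```python
import re


def clean_captured_output(buf):
    # apply the extra cleanup suggested above so the assertion passes both with and without `-s`
    return re.sub(r"~.*\r", "", buf, 0, re.M)
```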
- -But, then we have a helper context manager wrapper to automatically take care of it all, regardless of whether it has -some `\r`'s in it or not, so it's a simple: - -```python -from transformers.testing_utils import CaptureStdout - -with CaptureStdout() as cs: - function_that_writes_to_stdout() -print(cs.out) -``` - -Here is a full test example: - -```python -from transformers.testing_utils import CaptureStdout - -msg = "Secret message\r" -final = "Hello World" -with CaptureStdout() as cs: - print(msg + final) -assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}" -``` - -If you'd like to capture `stderr` use the `CaptureStderr` class instead: - -```python -from transformers.testing_utils import CaptureStderr - -with CaptureStderr() as cs: - function_that_writes_to_stderr() -print(cs.err) -``` - -If you need to capture both streams at once, use the parent `CaptureStd` class: - -```python -from transformers.testing_utils import CaptureStd - -with CaptureStd() as cs: - function_that_writes_to_stdout_and_stderr() -print(cs.err, cs.out) -``` - -Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit -from the context. - - -### Capturing logger stream - -If you need to validate the output of a logger, you can use `CaptureLogger`: - -```python -from transformers import logging -from transformers.testing_utils import CaptureLogger - -msg = "Testing 1, 2, 3" -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.bart.tokenization_bart") -with CaptureLogger(logger) as cl: - logger.info(msg) -assert cl.out, msg + "\n" -``` - -### Testing with environment variables - -If you want to test the impact of environment variables for a specific test you can use a helper decorator -`transformers.testing_utils.mockenv` - -```python -from transformers.testing_utils import mockenv - - -class HfArgumentParserTest(unittest.TestCase): - @mockenv(TRANSFORMERS_VERBOSITY="error") - def test_env_override(self): - env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) -``` - -At times an external program needs to be called, which requires setting `PYTHONPATH` in `os.environ` to include -multiple local paths. A helper class `transformers.test_utils.TestCasePlus` comes to help: - -```python -from transformers.testing_utils import TestCasePlus - - -class EnvExampleTest(TestCasePlus): - def test_external_prog(self): - env = self.get_env() - # now call the external program, passing `env` to it -``` - -Depending on whether the test file was under the `tests` test suite or `examples` it'll correctly set up -`env[PYTHONPATH]` to include one of these two directories, and also the `src` directory to ensure the testing is -done against the current repo, and finally with whatever `env[PYTHONPATH]` was already set to before the test was -called if anything. - -This helper method creates a copy of the `os.environ` object, so the original remains intact. - - -### Getting reproducible results - -In some situations you may want to remove randomness for your tests. 
To get identical reproducible results set, you -will need to fix the seed: - -```python -seed = 42 - -# python RNG -import random - -random.seed(seed) - -# pytorch RNGs -import torch - -torch.manual_seed(seed) -torch.backends.cudnn.deterministic = True -if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - -# numpy RNG -import numpy as np - -np.random.seed(seed) - -# tf RNG -tf.random.set_seed(seed) -``` - -### Debugging tests - -To start a debugger at the point of the warning, do this: - -```bash -pytest tests/test_logging.py -W error::UserWarning --pdb -``` - -## Working with github actions workflows - -To trigger a self-push workflow CI job, you must: - -1. Create a new branch on `transformers` origin (not a fork!). -2. The branch name has to start with either `ci_` or `ci-` (`main` triggers it too, but we can't do PRs on - `main`). It also gets triggered only for specific paths - you can find the up-to-date definition in case it - changed since this document has been written [here](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml) under *push:* -3. Create a PR from this branch. -4. Then you can see the job appear [here](https://github.com/huggingface/transformers/actions/workflows/self-push.yml). It may not run right away if there - is a backlog. - - - - -## Testing Experimental CI Features - -Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. Therefore if a -new CI feature is to be added, it should be done as following. - -1. Create a new dedicated job that tests what needs to be tested -2. The new job must always succeed so that it gives us a green ✓ (details below). -3. Let it run for some days to see that a variety of different PR types get to run on it (user fork branches, - non-forked branches, branches originating from github.com UI direct file edit, various forced pushes, etc. - there - are so many) while monitoring the experimental job's logs (not the overall job green as it's purposefully always - green) -4. When it's clear that everything is solid, then merge the new changes into existing jobs. - -That way experiments on CI functionality itself won't interfere with the normal workflow. - -Now how can we make the job always succeed while the new CI feature is being developed? - -Some CIs, like TravisCI support ignore-step-failure and will report the overall job as successful, but CircleCI and -Github Actions as of this writing don't support that. - -So the following workaround can be used: - -1. `set +euo pipefail` at the beginning of the run command to suppress most potential failures in the bash script. -2. the last command must be a success: `echo "done"` or just `true` will do - -Here is an example: - -```yaml -- run: - name: run CI experiment - command: | - set +euo pipefail - echo "setting run-all-despite-any-errors-mode" - this_command_will_fail - echo "but bash continues to run" - # emulate another failure - false - # but the last command must be a success - echo "during experiment do not remove: reporting success to CI, even if there were failures" -``` - -For simple commands you could also do: - -```bash -cmd_that_may_fail || true -``` - -Of course, once satisfied with the results, integrate the experimental step or job with the rest of the normal jobs, -while removing `set +euo pipefail` or any other things you may have added to ensure that the experimental job doesn't -interfere with the normal CI functioning. 
- -This whole process would have been much easier if we only could set something like `allow-failure` for the -experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and -Github Actions don't support it at the moment. - -You can vote for this feature and see where it is at these CI-specific threads: - -- [Github Actions:](https://github.com/actions/toolkit/issues/399) -- [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) diff --git a/docs/source/en/tf_xla.md b/docs/source/en/tf_xla.md new file mode 100644 index 000000000000..5f6a360dd8d5 --- /dev/null +++ b/docs/source/en/tf_xla.md @@ -0,0 +1,174 @@ + + +# XLA Integration for TensorFlow Models + +[[open-in-colab]] + +Accelerated Linear Algebra, dubbed XLA, is a compiler for accelerating the runtime of TensorFlow Models. From the [official documentation](https://www.tensorflow.org/xla): + +XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. + +Using XLA in TensorFlow is simple – it comes packaged inside the `tensorflow` library, and it can be triggered with the `jit_compile` argument in any graph-creating function such as [`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs). When using Keras methods like `fit()` and `predict()`, you can enable XLA simply by passing the `jit_compile` argument to `model.compile()`. However, XLA is not limited to these methods - it can also be used to accelerate any arbitrary `tf.function`. + +Several TensorFlow methods in 🤗 Transformers have been rewritten to be XLA-compatible, including text generation for models such as [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2), [T5](https://huggingface.co/docs/transformers/model_doc/t5) and [OPT](https://huggingface.co/docs/transformers/model_doc/opt), as well as speech processing for models such as [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper). + +While the exact amount of speed-up is very much model-dependent, for TensorFlow text generation models inside 🤗 Transformers, we noticed a speed-up of ~100x. This document will explain how you can use XLA for these models to get the maximum amount of performance. We’ll also provide links to additional resources if you’re interested to learn more about the benchmarks and our design philosophy behind the XLA integration. + +## Running TF functions with XLA + +Let us consider the following model in TensorFlow: + +```py +import tensorflow as tf + +model = tf.keras.Sequential( + [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")] +) +``` + +The above model accepts inputs having a dimension of `(10, )`. We can use the model for running a forward pass like so: + +```py +# Generate random inputs for the model. +batch_size = 16 +input_vector_dim = 10 +random_inputs = tf.random.normal((batch_size, input_vector_dim)) + +# Run a forward pass. +_ = model(random_inputs) +``` + +In order to run the forward pass with an XLA-compiled function, we’d need to do: + +```py +xla_fn = tf.function(model, jit_compile=True) +_ = xla_fn(random_inputs) +``` + +The default `call()` function of the `model` is used for compiling the XLA graph. 
But if there’s any other model function you want to compile into XLA that’s also possible with: + +```py +my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True) +``` + +## Running a TF text generation model with XLA from 🤗 Transformers + +To enable XLA-accelerated generation within 🤗 Transformers, you need to have a recent version of `transformers` installed. You can install it by running: + +```bash +pip install transformers --upgrade +``` + +And then you can run the following code: + +```py +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +# Will error if the minimal version of Transformers is not installed. +from transformers.utils import check_min_version + +check_min_version("4.21.0") + + +tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("gpt2") +input_string = ["TensorFlow is"] + +# One line to create an XLA generation function +xla_generate = tf.function(model.generate, jit_compile=True) + +tokenized_input = tokenizer(input_string, return_tensors="tf") +generated_tokens = xla_generate(**tokenized_input, num_beams=2) + +decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) +print(f"Generated -- {decoded_text}") +# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the +``` + +As you can notice, enabling XLA on `generate()` is just a single line of code. The rest of the code remains unchanged. However, there are a couple of gotchas in the above code snippet that are specific to XLA. You need to be aware of those to realize the speed-ups that XLA can bring in. We discuss these in the following section. + +## Gotchas to be aware of + +When you are executing an XLA-enabled function (like `xla_generate()` above) for the first time, it will internally try to infer the computation graph, which is time-consuming. This process is known as [“tracing”](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing). + +You might notice that the generation time is not fast. Successive calls of `xla_generate()` (or any other XLA-enabled function) won’t have to infer the computation graph, given the inputs to the function follow the same shape with which the computation graph was initially built. While this is not a problem for modalities with fixed input shapes (e.g., images), you must pay attention if you are working with variable input shape modalities (e.g., text). + +To ensure `xla_generate()` always operates with the same input shapes, you can specify the `padding` arguments when calling the tokenizer. + +```py +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("gpt2") +input_string = ["TensorFlow is"] + +xla_generate = tf.function(model.generate, jit_compile=True) + +# Here, we call the tokenizer with padding options. +tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") + +generated_tokens = xla_generate(**tokenized_input, num_beams=2) +decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) +print(f"Generated -- {decoded_text}") +``` + +This way, you can ensure that the inputs to `xla_generate()` will always receive inputs with the shape it was traced with and thus leading to speed-ups in the generation time. 
You can verify this with the code below: + +```py +import time +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("gpt2") + +xla_generate = tf.function(model.generate, jit_compile=True) + +for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]: + tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") + start = time.time_ns() + generated_tokens = xla_generate(**tokenized_input, num_beams=2) + end = time.time_ns() + print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n") +``` + +On a Tesla T4 GPU, you can expect the outputs like so: + +```bash +Execution time -- 30819.6 ms + +Execution time -- 79.0 ms + +Execution time -- 78.9 ms +``` +The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time. + +We didn’t cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases. + +## Additional Resources + +Here, we leave you with some additional resources if you want to delve deeper into XLA in 🤗 Transformers and in general. + +* [This Colab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) provides an interactive demonstration if you want to fiddle with the XLA-compatible encoder-decoder (like [T5](https://huggingface.co/docs/transformers/model_doc/t5)) and decoder-only (like [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)) text generation models. +* [This blog post](https://huggingface.co/blog/tf-xla-generate) provides an overview of the comparison benchmarks for XLA-compatible models along with a friendly introduction to XLA in TensorFlow. +* [This blog post](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) discusses our design philosophy behind adding XLA support to the TensorFlow models in 🤗 Transformers. +* Recommended posts for learning more about XLA and TensorFlow graphs in general: + * [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla) + * [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) + * [Better performance with tf.function](https://www.tensorflow.org/guide/function) \ No newline at end of file diff --git a/docs/source/en/tflite.md b/docs/source/en/tflite.md new file mode 100644 index 000000000000..7b7735c992ea --- /dev/null +++ b/docs/source/en/tflite.md @@ -0,0 +1,62 @@ + + +# Export to TFLite + +[TensorFlow Lite](https://www.tensorflow.org/lite/guide) is a lightweight framework for deploying machine learning models +on resource-constrained devices, such as mobile phones, embedded systems, and Internet of Things (IoT) devices. +TFLite is designed to optimize and run models efficiently on these devices with limited computational power, memory, and +power consumption. +A TensorFlow Lite model is represented in a special efficient portable format identified by the `.tflite` file extension. + +🤗 Optimum offers functionality to export 🤗 Transformers models to TFLite through the `exporters.tflite` module. 
+For the list of supported model architectures, please refer to [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/tflite/overview). + +To export a model to TFLite, install the required dependencies: + +```bash +pip install optimum[exporters-tf] +``` + +To check out all available arguments, refer to the [🤗 Optimum docs](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model), +or view help in command line: + +```bash +optimum-cli export tflite --help +``` + +To export a model's checkpoint from the 🤗 Hub, for example, `bert-base-uncased`, run the following command: + +```bash +optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/ +``` + +You should see the logs indicating progress and showing where the resulting `model.tflite` is saved, like this: + +```bash +Validating TFLite model... + -[✓] TFLite model output names match reference model (logits) + - Validating TFLite Model output "logits": + -[✓] (1, 128, 30522) matches (1, 128, 30522) + -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05) +The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05: +- logits: max diff = 5.817413330078125e-05. + The exported model was saved at: bert_tflite + ``` + +The example above illustrates exporting a checkpoint from 🤗 Hub. When exporting a local model, first make sure that you +saved both the model's weights and tokenizer files in the same directory (`local_path`). When using CLI, pass the +`local_path` to the `model` argument instead of the checkpoint name on 🤗 Hub. \ No newline at end of file diff --git a/docs/source/en/tokenizer_summary.md b/docs/source/en/tokenizer_summary.md new file mode 100644 index 000000000000..5a23c7bf8473 --- /dev/null +++ b/docs/source/en/tokenizer_summary.md @@ -0,0 +1,282 @@ + + +# Summary of the tokenizers + +[[open-in-colab]] + +On this page, we will have a closer look at tokenization. + + + +As we saw in [the preprocessing tutorial](preprocessing), tokenizing a text is splitting it into words or +subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is +straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). +More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: [Byte-Pair Encoding +(BPE)](#byte-pair-encoding), [WordPiece](#wordpiece), and [SentencePiece](#sentencepiece), and show examples +of which tokenizer type is used by which model. + +Note that on each model page, you can look at the documentation of the associated tokenizer to know which tokenizer +type was used by the pretrained model. For instance, if we look at [`BertTokenizer`], we can see +that the model uses [WordPiece](#wordpiece). + +## Introduction + +Splitting a text into smaller chunks is a task that is harder than it looks, and there are multiple ways of doing so. +For instance, let's look at the sentence `"Don't you love 🤗 Transformers? 
We sure do."` + + + +A simple way of tokenizing this text is to split it by spaces, which would give: + +``` +["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."] +``` + +This is a sensible first step, but if we look at the tokens `"Transformers?"` and `"do."`, we notice that the +punctuation is attached to the words `"Transformer"` and `"do"`, which is suboptimal. We should take the +punctuation into account so that a model does not have to learn a different representation of a word and every possible +punctuation symbol that could follow it, which would explode the number of representations the model has to learn. +Taking punctuation into account, tokenizing our exemplary text would give: + +``` +["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] +``` + +Better. However, it is disadvantageous, how the tokenization dealt with the word `"Don't"`. `"Don't"` stands for +`"do not"`, so it would be better tokenized as `["Do", "n't"]`. This is where things start getting complicated, and +part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, a +different tokenized output is generated for the same text. A pretrained model only performs properly if you feed it an +input that was tokenized with the same rules that were used to tokenize its training data. + +[spaCy](https://spacy.io/) and [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) are two popular +rule-based tokenizers. Applying them on our example, *spaCy* and *Moses* would output something like: + +``` +["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] +``` + +As can be seen space and punctuation tokenization, as well as rule-based tokenization, is used here. Space and +punctuation tokenization and rule-based tokenization are both examples of word tokenization, which is loosely defined +as splitting sentences into words. While it's the most intuitive way to split texts into smaller chunks, this +tokenization method can lead to problems for massive text corpora. In this case, space and punctuation tokenization +usually generates a very big vocabulary (the set of all unique words and tokens used). *E.g.*, [Transformer XL](model_doc/transformerxl) uses space and punctuation tokenization, resulting in a vocabulary size of 267,735! + +Such a big vocabulary size forces the model to have an enormous embedding matrix as the input and output layer, which +causes both an increased memory and time complexity. In general, transformers models rarely have a vocabulary size +greater than 50,000, especially if they are pretrained only on a single language. + +So if simple space and punctuation tokenization is unsatisfactory, why not simply tokenize on characters? + + + +While character tokenization is very simple and would greatly reduce memory and time complexity it makes it much harder +for the model to learn meaningful input representations. *E.g.* learning a meaningful context-independent +representation for the letter `"t"` is much harder than learning a context-independent representation for the word +`"today"`. Therefore, character tokenization is often accompanied by a loss of performance. So to get the best of +both worlds, transformers models use a hybrid between word-level and character-level tokenization called **subword** +tokenization. 
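Before moving on, here is a tiny plain-Python illustration of the two extremes discussed above (it does not use any 🤗 Transformers API):

```py
>>> text = "Don't you love 🤗 Transformers? We sure do."
>>> text.split()  # word level: few tokens per sentence, but a huge vocabulary
["Don't", 'you', 'love', '🤗', 'Transformers?', 'We', 'sure', 'do.']
>>> list("today")  # character level: a tiny vocabulary, but much longer sequences
['t', 'o', 'd', 'a', 'y']
```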
+ +## Subword tokenization + + + +Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller +subwords, but rare words should be decomposed into meaningful subwords. For instance `"annoyingly"` might be +considered a rare word and could be decomposed into `"annoying"` and `"ly"`. Both `"annoying"` and `"ly"` as +stand-alone subwords would appear more frequently while at the same time the meaning of `"annoyingly"` is kept by the +composite meaning of `"annoying"` and `"ly"`. This is especially useful in agglutinative languages such as Turkish, +where you can form (almost) arbitrarily long complex words by stringing together subwords. + +Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful +context-independent representations. In addition, subword tokenization enables the model to process words it has never +seen before, by decomposing them into known subwords. For instance, the [`~transformers.BertTokenizer`] tokenizes +`"I have a new GPU!"` as follows: + +```py +>>> from transformers import BertTokenizer + +>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +>>> tokenizer.tokenize("I have a new GPU!") +["i", "have", "a", "new", "gp", "##u", "!"] +``` + +Because we are considering the uncased model, the sentence was lowercased first. We can see that the words `["i", "have", "a", "new"]` are present in the tokenizer's vocabulary, but the word `"gpu"` is not. Consequently, the +tokenizer splits `"gpu"` into known subwords: `["gp" and "##u"]`. `"##"` means that the rest of the token should +be attached to the previous one, without space (for decoding or reversal of the tokenization). + +As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously exemplary text as follows: + +```py +>>> from transformers import XLNetTokenizer + +>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") +>>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") +["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."] +``` + +We'll get back to the meaning of those `"▁"` when we look at [SentencePiece](#sentencepiece). As one can see, +the rare word `"Transformers"` has been split into the more frequent subwords `"Transform"` and `"ers"`. + +Let's now look at how the different subword tokenization algorithms work. Note that all of those tokenization +algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained +on. + + + +### Byte-Pair Encoding (BPE) + +Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword Units (Sennrich et +al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the training data into +words. Pretokenization can be as simple as space tokenization, e.g. [GPT-2](model_doc/gpt2), [RoBERTa](model_doc/roberta). More advanced pre-tokenization include rule-based tokenization, e.g. [XLM](model_doc/xlm), +[FlauBERT](model_doc/flaubert) which uses Moses for most languages, or [GPT](model_doc/gpt) which uses +Spacy and ftfy, to count the frequency of each word in the training corpus. + +After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the +training data has been determined. 
Next, BPE creates a base vocabulary consisting of all symbols that occur in the set +of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until +the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a hyperparameter to +define before training the tokenizer. + +As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been +determined: + +``` +("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) +``` + +Consequently, the base vocabulary is `["b", "g", "h", "n", "p", "s", "u"]`. Splitting all words into symbols of the +base vocabulary, we obtain: + +``` +("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) +``` + +BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In +the example above `"h"` followed by `"u"` is present _10 + 5 = 15_ times (10 times in the 10 occurrences of +`"hug"`, 5 times in the 5 occurrences of `"hugs"`). However, the most frequent symbol pair is `"u"` followed by +`"g"`, occurring _10 + 5 + 5 = 20_ times in total. Thus, the first merge rule the tokenizer learns is to group all +`"u"` symbols followed by a `"g"` symbol together. Next, `"ug"` is added to the vocabulary. The set of words then +becomes + +``` +("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) +``` + +BPE then identifies the next most common symbol pair. It's `"u"` followed by `"n"`, which occurs 16 times. `"u"`, +`"n"` is merged to `"un"` and added to the vocabulary. The next most frequent symbol pair is `"h"` followed by +`"ug"`, occurring 15 times. Again the pair is merged and `"hug"` can be added to the vocabulary. + +At this stage, the vocabulary is `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` and our set of unique words +is represented as + +``` +("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) +``` + +Assuming, that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied +to new words (as long as those new words do not include symbols that were not in the base vocabulary). For instance, +the word `"bug"` would be tokenized to `["b", "ug"]` but `"mug"` would be tokenized as `["", "ug"]` since +the symbol `"m"` is not in the base vocabulary. In general, single letters such as `"m"` are not replaced by the +`""` symbol because the training data usually includes at least one occurrence of each letter, but it is likely +to happen for very special characters like emojis. + +As mentioned earlier, the vocabulary size, *i.e.* the base vocabulary size + the number of merges, is a hyperparameter +to choose. For instance [GPT](model_doc/gpt) has a vocabulary size of 40,478 since they have 478 base characters +and chose to stop training after 40,000 merges. + +#### Byte-level BPE + +A base vocabulary that includes all possible base characters can be quite large if *e.g.* all unicode characters are +considered as base characters. To have a better base vocabulary, [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) uses bytes +as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that +every base character is included in the vocabulary. 
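A quick, illustrative check in plain Python (not a tokenizer API) of why bytes make such a convenient base alphabet: any character, however rare, decomposes into byte values below 256:

```py
>>> list("🤗".encode("utf-8"))
[240, 159, 164, 151]
```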
With some additional rules to deal with punctuation, the GPT2's +tokenizer can tokenize every text without the need for the symbol. [GPT-2](model_doc/gpt) has a vocabulary +size of 50,257, which corresponds to the 256 bytes base tokens, a special end-of-text token and the symbols learned +with 50,000 merges. + + + +### WordPiece + +WordPiece is the subword tokenization algorithm used for [BERT](model_doc/bert), [DistilBERT](model_doc/distilbert), and [Electra](model_doc/electra). The algorithm was outlined in [Japanese and Korean +Voice Search (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very similar to +BPE. WordPiece first initializes the vocabulary to include every character present in the training data and +progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent +symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. + +So what does this mean exactly? Referring to the previous example, maximizing the likelihood of the training data is +equivalent to finding the symbol pair, whose probability divided by the probabilities of its first symbol followed by +its second symbol is the greatest among all symbol pairs. *E.g.* `"u"`, followed by `"g"` would have only been +merged if the probability of `"ug"` divided by `"u"`, `"g"` would have been greater than for any other symbol +pair. Intuitively, WordPiece is slightly different to BPE in that it evaluates what it _loses_ by merging two symbols +to ensure it's _worth it_. + + + +### Unigram + +Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural Network Translation +Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). In contrast to BPE or +WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively trims down each +symbol to obtain a smaller vocabulary. The base vocabulary could for instance correspond to all pre-tokenized words and +the most common substrings. Unigram is not used directly for any of the models in the transformers, but it's used in +conjunction with [SentencePiece](#sentencepiece). + +At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training +data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, the algorithm +computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. Unigram then +removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, *i.e.* those +symbols that least affect the overall loss over the training data. This process is repeated until the vocabulary has +reached the desired size. The Unigram algorithm always keeps the base characters so that any word can be tokenized. + +Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of +tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary: + +``` +["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"], +``` + +`"hugs"` could be tokenized both as `["hug", "s"]`, `["h", "ug", "s"]` or `["h", "u", "g", "s"]`. So which one +to choose? 
Unigram saves the probability of each token in the training corpus on top of saving the vocabulary so that +the probability of each possible tokenization can be computed after training. The algorithm simply picks the most +likely tokenization in practice, but also offers the possibility to sample a possible tokenization according to their +probabilities. + +Those probabilities are defined by the loss the tokenizer is trained on. Assuming that the training data consists of +the words \\(x_{1}, \dots, x_{N}\\) and that the set of all possible tokenizations for a word \\(x_{i}\\) is +defined as \\(S(x_{i})\\), then the overall loss is defined as + +$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$ + + + +### SentencePiece + +All tokenization algorithms described so far have the same problem: It is assumed that the input text uses spaces to +separate words. However, not all languages use spaces to separate words. One possible solution is to use language +specific pre-tokenizers, *e.g.* [XLM](model_doc/xlm) uses a specific Chinese, Japanese, and Thai pre-tokenizer). +To solve this problem more generally, [SentencePiece: A simple and language independent subword tokenizer and +detokenizer for Neural Text Processing (Kudo et al., 2018)](https://arxiv.org/pdf/1808.06226.pdf) treats the input +as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram +algorithm to construct the appropriate vocabulary. + +The [`XLNetTokenizer`] uses SentencePiece for example, which is also why in the example earlier the +`"▁"` character was included in the vocabulary. Decoding with SentencePiece is very easy since all tokens can just be +concatenated and `"▁"` is replaced by a space. + +All transformers models in the library that use SentencePiece use it in combination with unigram. Examples of models +using SentencePiece are [ALBERT](model_doc/albert), [XLNet](model_doc/xlnet), [Marian](model_doc/marian), and [T5](model_doc/t5). diff --git a/docs/source/en/tokenizer_summary.mdx b/docs/source/en/tokenizer_summary.mdx deleted file mode 100644 index 942fe279068e..000000000000 --- a/docs/source/en/tokenizer_summary.mdx +++ /dev/null @@ -1,278 +0,0 @@ - - -# Summary of the tokenizers - -[[open-in-colab]] - -On this page, we will have a closer look at tokenization. - - - -As we saw in [the preprocessing tutorial](preprocessing), tokenizing a text is splitting it into words or -subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is -straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). -More specifically, we will look at the three main types of tokenizers used in 🤗 Transformers: [Byte-Pair Encoding -(BPE)](#byte-pair-encoding), [WordPiece](#wordpiece), and [SentencePiece](#sentencepiece), and show examples -of which tokenizer type is used by which model. - -Note that on each model page, you can look at the documentation of the associated tokenizer to know which tokenizer -type was used by the pretrained model. For instance, if we look at [`BertTokenizer`], we can see -that the model uses [WordPiece](#wordpiece). - -## Introduction - -Splitting a text into smaller chunks is a task that is harder than it looks, and there are multiple ways of doing so. -For instance, let's look at the sentence `"Don't you love 🤗 Transformers? 
We sure do."` - - - -A simple way of tokenizing this text is to split it by spaces, which would give: - -``` -["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."] -``` - -This is a sensible first step, but if we look at the tokens `"Transformers?"` and `"do."`, we notice that the -punctuation is attached to the words `"Transformer"` and `"do"`, which is suboptimal. We should take the -punctuation into account so that a model does not have to learn a different representation of a word and every possible -punctuation symbol that could follow it, which would explode the number of representations the model has to learn. -Taking punctuation into account, tokenizing our exemplary text would give: - -``` -["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] -``` - -Better. However, it is disadvantageous, how the tokenization dealt with the word `"Don't"`. `"Don't"` stands for -`"do not"`, so it would be better tokenized as `["Do", "n't"]`. This is where things start getting complicated, and -part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, a -different tokenized output is generated for the same text. A pretrained model only performs properly if you feed it an -input that was tokenized with the same rules that were used to tokenize its training data. - -[spaCy](https://spacy.io/) and [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) are two popular -rule-based tokenizers. Applying them on our example, *spaCy* and *Moses* would output something like: - -``` -["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."] -``` - -As can be seen space and punctuation tokenization, as well as rule-based tokenization, is used here. Space and -punctuation tokenization and rule-based tokenization are both examples of word tokenization, which is loosely defined -as splitting sentences into words. While it's the most intuitive way to split texts into smaller chunks, this -tokenization method can lead to problems for massive text corpora. In this case, space and punctuation tokenization -usually generates a very big vocabulary (the set of all unique words and tokens used). *E.g.*, [Transformer XL](model_doc/transformerxl) uses space and punctuation tokenization, resulting in a vocabulary size of 267,735! - -Such a big vocabulary size forces the model to have an enormous embedding matrix as the input and output layer, which -causes both an increased memory and time complexity. In general, transformers models rarely have a vocabulary size -greater than 50,000, especially if they are pretrained only on a single language. - -So if simple space and punctuation tokenization is unsatisfactory, why not simply tokenize on characters? - - - -While character tokenization is very simple and would greatly reduce memory and time complexity it makes it much harder -for the model to learn meaningful input representations. *E.g.* learning a meaningful context-independent -representation for the letter `"t"` is much harder than learning a context-independent representation for the word -`"today"`. Therefore, character tokenization is often accompanied by a loss of performance. So to get the best of -both worlds, transformers models use a hybrid between word-level and character-level tokenization called **subword** -tokenization. 
- -## Subword tokenization - - - -Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller -subwords, but rare words should be decomposed into meaningful subwords. For instance `"annoyingly"` might be -considered a rare word and could be decomposed into `"annoying"` and `"ly"`. Both `"annoying"` and `"ly"` as -stand-alone subwords would appear more frequently while at the same time the meaning of `"annoyingly"` is kept by the -composite meaning of `"annoying"` and `"ly"`. This is especially useful in agglutinative languages such as Turkish, -where you can form (almost) arbitrarily long complex words by stringing together subwords. - -Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful -context-independent representations. In addition, subword tokenization enables the model to process words it has never -seen before, by decomposing them into known subwords. For instance, the [`~transformers.BertTokenizer`] tokenizes -`"I have a new GPU!"` as follows: - -```py ->>> from transformers import BertTokenizer - ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") ->>> tokenizer.tokenize("I have a new GPU!") -["i", "have", "a", "new", "gp", "##u", "!"] -``` - -Because we are considering the uncased model, the sentence was lowercased first. We can see that the words `["i", "have", "a", "new"]` are present in the tokenizer's vocabulary, but the word `"gpu"` is not. Consequently, the -tokenizer splits `"gpu"` into known subwords: `["gp" and "##u"]`. `"##"` means that the rest of the token should -be attached to the previous one, without space (for decoding or reversal of the tokenization). - -As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously exemplary text as follows: - -```py ->>> from transformers import XLNetTokenizer - ->>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") ->>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") -["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."] -``` - -We'll get back to the meaning of those `"▁"` when we look at [SentencePiece](#sentencepiece). As one can see, -the rare word `"Transformers"` has been split into the more frequent subwords `"Transform"` and `"ers"`. - -Let's now look at how the different subword tokenization algorithms work. Note that all of those tokenization -algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained -on. - - - -### Byte-Pair Encoding (BPE) - -Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword Units (Sennrich et -al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the training data into -words. Pretokenization can be as simple as space tokenization, e.g. [GPT-2](model_doc/gpt2), [Roberta](model_doc/roberta). More advanced pre-tokenization include rule-based tokenization, e.g. [XLM](model_doc/xlm), -[FlauBERT](model_doc/flaubert) which uses Moses for most languages, or [GPT](model_doc/gpt) which uses -Spacy and ftfy, to count the frequency of each word in the training corpus. - -After pre-tokenization, a set of unique words has been created and the frequency of each word it occurred in the -training data has been determined. 
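As a toy illustration of that counting step (the corpus and the whitespace pre-tokenizer below are made up for this example, not taken from a real training setup), the word frequencies BPE starts from can be gathered with a `Counter`:

```py
from collections import Counter

corpus = "hug hug pug pun bun hugs hug pun"  # toy corpus, whitespace pre-tokenization
word_freqs = Counter(corpus.split())
print(word_freqs)
# Counter({'hug': 3, 'pun': 2, 'pug': 1, 'bun': 1, 'hugs': 1})
```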
Next, BPE creates a base vocabulary consisting of all symbols that occur in the set -of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until -the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a hyperparameter to -define before training the tokenizer. - -As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been -determined: - -``` -("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) -``` - -Consequently, the base vocabulary is `["b", "g", "h", "n", "p", "s", "u"]`. Splitting all words into symbols of the -base vocabulary, we obtain: - -``` -("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) -``` - -BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In -the example above `"h"` followed by `"u"` is present _10 + 5 = 15_ times (10 times in the 10 occurrences of -`"hug"`, 5 times in the 5 occurrences of `"hugs"`). However, the most frequent symbol pair is `"u"` followed by -`"g"`, occurring _10 + 5 + 5 = 20_ times in total. Thus, the first merge rule the tokenizer learns is to group all -`"u"` symbols followed by a `"g"` symbol together. Next, `"ug"` is added to the vocabulary. The set of words then -becomes - -``` -("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) -``` - -BPE then identifies the next most common symbol pair. It's `"u"` followed by `"n"`, which occurs 16 times. `"u"`, -`"n"` is merged to `"un"` and added to the vocabulary. The next most frequent symbol pair is `"h"` followed by -`"ug"`, occurring 15 times. Again the pair is merged and `"hug"` can be added to the vocabulary. - -At this stage, the vocabulary is `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` and our set of unique words -is represented as - -``` -("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) -``` - -Assuming, that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied -to new words (as long as those new words do not include symbols that were not in the base vocabulary). For instance, -the word `"bug"` would be tokenized to `["b", "ug"]` but `"mug"` would be tokenized as `["", "ug"]` since -the symbol `"m"` is not in the base vocabulary. In general, single letters such as `"m"` are not replaced by the -`""` symbol because the training data usually includes at least one occurrence of each letter, but it is likely -to happen for very special characters like emojis. - -As mentioned earlier, the vocabulary size, *i.e.* the base vocabulary size + the number of merges, is a hyperparameter -to choose. For instance [GPT](model_doc/gpt) has a vocabulary size of 40,478 since they have 478 base characters -and chose to stop training after 40,000 merges. - -#### Byte-level BPE - -A base vocabulary that includes all possible base characters can be quite large if *e.g.* all unicode characters are -considered as base characters. To have a better base vocabulary, [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) uses bytes -as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that -every base character is included in the vocabulary. 
With some additional rules to deal with punctuation, the GPT2's -tokenizer can tokenize every text without the need for the symbol. [GPT-2](model_doc/gpt) has a vocabulary -size of 50,257, which corresponds to the 256 bytes base tokens, a special end-of-text token and the symbols learned -with 50,000 merges. - - - -### WordPiece - -WordPiece is the subword tokenization algorithm used for [BERT](model_doc/bert), [DistilBERT](model_doc/distilbert), and [Electra](model_doc/electra). The algorithm was outlined in [Japanese and Korean -Voice Search (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very similar to -BPE. WordPiece first initializes the vocabulary to include every character present in the training data and -progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent -symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. - -So what does this mean exactly? Referring to the previous example, maximizing the likelihood of the training data is -equivalent to finding the symbol pair, whose probability divided by the probabilities of its first symbol followed by -its second symbol is the greatest among all symbol pairs. *E.g.* `"u"`, followed by `"g"` would have only been -merged if the probability of `"ug"` divided by `"u"`, `"g"` would have been greater than for any other symbol -pair. Intuitively, WordPiece is slightly different to BPE in that it evaluates what it _loses_ by merging two symbols -to ensure it's _worth it_. - - - -### Unigram - -Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural Network Translation -Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). In contrast to BPE or -WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively trims down each -symbol to obtain a smaller vocabulary. The base vocabulary could for instance correspond to all pre-tokenized words and -the most common substrings. Unigram is not used directly for any of the models in the transformers, but it's used in -conjunction with [SentencePiece](#sentencepiece). - -At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training -data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, the algorithm -computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. Unigram then -removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, *i.e.* those -symbols that least affect the overall loss over the training data. This process is repeated until the vocabulary has -reached the desired size. The Unigram algorithm always keeps the base characters so that any word can be tokenized. - -Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of -tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary: - -``` -["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"], -``` - -`"hugs"` could be tokenized both as `["hug", "s"]`, `["h", "ug", "s"]` or `["h", "u", "g", "s"]`. So which one -to choose? 
Unigram saves the probability of each token in the training corpus on top of saving the vocabulary so that -the probability of each possible tokenization can be computed after training. The algorithm simply picks the most -likely tokenization in practice, but also offers the possibility to sample a possible tokenization according to their -probabilities. - -Those probabilities are defined by the loss the tokenizer is trained on. Assuming that the training data consists of -the words \\(x_{1}, \dots, x_{N}\\) and that the set of all possible tokenizations for a word \\(x_{i}\\) is -defined as \\(S(x_{i})\\), then the overall loss is defined as - -$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$ - - - -### SentencePiece - -All tokenization algorithms described so far have the same problem: It is assumed that the input text uses spaces to -separate words. However, not all languages use spaces to separate words. One possible solution is to use language -specific pre-tokenizers, *e.g.* [XLM](model_doc/xlm) uses a specific Chinese, Japanese, and Thai pre-tokenizer). -To solve this problem more generally, [SentencePiece: A simple and language independent subword tokenizer and -detokenizer for Neural Text Processing (Kudo et al., 2018)](https://arxiv.org/pdf/1808.06226.pdf) treats the input -as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram -algorithm to construct the appropriate vocabulary. - -The [`XLNetTokenizer`] uses SentencePiece for example, which is also why in the example earlier the -`"▁"` character was included in the vocabulary. Decoding with SentencePiece is very easy since all tokens can just be -concatenated and `"▁"` is replaced by a space. - -All transformers models in the library that use SentencePiece use it in combination with unigram. Examples of models -using SentencePiece are [ALBERT](model_doc/albert), [XLNet](model_doc/xlnet), [Marian](model_doc/marian), and [T5](model_doc/t5). diff --git a/docs/source/en/torchscript.md b/docs/source/en/torchscript.md new file mode 100644 index 000000000000..adf34b2ea699 --- /dev/null +++ b/docs/source/en/torchscript.md @@ -0,0 +1,229 @@ + + +# Export to TorchScript + + + +This is the very beginning of our experiments with TorchScript and we are still +exploring its capabilities with variable-input-size models. It is a focus of interest to +us and we will deepen our analysis in upcoming releases, with more code examples, a more +flexible implementation, and benchmarks comparing Python-based codes with compiled +TorchScript. + + + +According to the [TorchScript documentation](https://pytorch.org/docs/stable/jit.html): + +> TorchScript is a way to create serializable and optimizable models from PyTorch code. + +There are two PyTorch modules, [JIT and +TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their +models to be reused in other programs like efficiency-oriented C++ programs. + +We provide an interface that allows you to export 🤗 Transformers models to TorchScript +so they can be reused in a different environment than PyTorch-based Python programs. +Here, we explain how to export and use our models using TorchScript. + +Exporting a model requires two things: + +- model instantiation with the `torchscript` flag +- a forward pass with dummy inputs + +These necessities imply several things developers should be careful about as detailed +below. 
+ +## TorchScript flag and tied weights + +The `torchscript` flag is necessary because most of the 🤗 Transformers language models +have tied weights between their `Embedding` layer and their `Decoding` layer. +TorchScript does not allow you to export models that have tied weights, so it is +necessary to untie and clone the weights beforehand. + +Models instantiated with the `torchscript` flag have their `Embedding` layer and +`Decoding` layer separated, which means that they should not be trained down the line. +Training would desynchronize the two layers, leading to unexpected results. + +This is not the case for models that do not have a language model head, as those do not +have tied weights. These models can be safely exported without the `torchscript` flag. + +## Dummy inputs and standard lengths + +The dummy inputs are used for a models forward pass. While the inputs' values are +propagated through the layers, PyTorch keeps track of the different operations executed +on each tensor. These recorded operations are then used to create the *trace* of the +model. + +The trace is created relative to the inputs' dimensions. It is therefore constrained by +the dimensions of the dummy input, and will not work for any other sequence length or +batch size. When trying with a different size, the following error is raised: + +``` +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` +``` + +We recommended you trace the model with a dummy input size at least as large as the +largest input that will be fed to the model during inference. Padding can help fill the +missing values. However, since the model is traced with a larger input size, the +dimensions of the matrix will also be large, resulting in more calculations. + +Be careful of the total number of operations done on each input and follow the +performance closely when exporting varying sequence-length models. + +## Using TorchScript in Python + +This section demonstrates how to save and load models as well as how to use the trace +for inference. + +### Saving a model + +To export a `BertModel` with TorchScript, instantiate `BertModel` from the `BertConfig` +class and then save it to disk under the filename `traced_bert.pt`: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch + +enc = BertTokenizer.from_pretrained("bert-base-uncased") + +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) + +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] + +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. 
+config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) + +# Instantiating the model +model = BertModel(config) + +# The model needs to be in evaluation mode +model.eval() + +# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) + +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") +``` + +### Loading a model + +Now you can load the previously saved `BertModel`, `traced_bert.pt`, from disk and use +it on the previously initialised `dummy_input`: + +```python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() + +all_encoder_layers, pooled_output = loaded_model(*dummy_input) +``` + +### Using a traced model for inference + +Use the traced model for inference by using its `__call__` dunder method: + +```python +traced_model(tokens_tensor, segments_tensors) +``` + +## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK + +AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) +instance family for low cost, high performance machine learning inference in the cloud. +The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware +accelerator, specializing in deep learning inferencing workloads. [AWS +Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) is the SDK for +Inferentia that supports tracing and optimizing transformers models for deployment on +Inf1. The Neuron SDK provides: + + +1. Easy-to-use API with one line of code change to trace and optimize a TorchScript + model for inference in the cloud. +2. Out of the box performance optimizations for [improved + cost-performance](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). +3. Support for Hugging Face transformers models built with either + [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) + or + [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). + +### Implications + +Transformers models based on the [BERT (Bidirectional Encoder Representations from +Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) +architecture, or its variants such as +[distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) and +[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) run best on +Inf1 for non-generative tasks such as extractive question answering, sequence +classification, and token classification. However, text generation tasks can still be +adapted to run on Inf1 according to this [AWS Neuron MarianMT +tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). +More information about models that can be converted out of the box on Inferentia can be +found in the [Model Architecture +Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) +section of the Neuron documentation. 
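Whichever target you deploy to, it can be worth sanity-checking that the traced module reproduces the eager model's outputs before shipping it. The sketch below is only such a check; it assumes the `model`, `traced_model`, `tokens_tensor`, and `segments_tensors` objects from the earlier tracing example are still in scope:

```python
import torch

model.eval()
with torch.no_grad():
    eager_output = model(tokens_tensor, segments_tensors)
    traced_output = traced_model(tokens_tensor, segments_tensors)

# With torchscript=True the model returns tuples; compare the sequence outputs
print(torch.allclose(eager_output[0], traced_output[0], atol=1e-5))
```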
+ +### Dependencies + +Using AWS Neuron to convert models requires a [Neuron SDK +environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) +which comes preconfigured on [AWS Deep Learning +AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +### Converting a model for AWS Neuron + +Convert a model for AWS NEURON using the same code from [Using TorchScript in +Python](torchscript#using-torchscript-in-python) to trace a `BertModel`. Import the +`torch.neuron` framework extension to access the components of the Neuron SDK through a +Python API: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` + +You only need to modify the following line: + +```diff +- torch.jit.trace(model, [tokens_tensor, segments_tensors]) ++ torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +This enables the Neuron SDK to trace the model and optimize it for Inf1 instances. + +To learn more about AWS Neuron SDK features, tools, example tutorials and latest +updates, please see the [AWS NeuronSDK +documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/en/torchscript.mdx b/docs/source/en/torchscript.mdx deleted file mode 100644 index 0840973ad078..000000000000 --- a/docs/source/en/torchscript.mdx +++ /dev/null @@ -1,225 +0,0 @@ - - -# Export to TorchScript - - - -This is the very beginning of our experiments with TorchScript and we are still -exploring its capabilities with variable-input-size models. It is a focus of interest to -us and we will deepen our analysis in upcoming releases, with more code examples, a more -flexible implementation, and benchmarks comparing Python-based codes with compiled -TorchScript. - - - -According to the [TorchScript documentation](https://pytorch.org/docs/stable/jit.html): - -> TorchScript is a way to create serializable and optimizable models from PyTorch code. - -There are two PyTorch modules, [JIT and -TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their -models to be reused in other programs like efficiency-oriented C++ programs. - -We provide an interface that allows you to export 🤗 Transformers models to TorchScript -so they can be reused in a different environment than PyTorch-based Python programs. -Here, we explain how to export and use our models using TorchScript. - -Exporting a model requires two things: - -- model instantiation with the `torchscript` flag -- a forward pass with dummy inputs - -These necessities imply several things developers should be careful about as detailed -below. - -## TorchScript flag and tied weights - -The `torchscript` flag is necessary because most of the 🤗 Transformers language models -have tied weights between their `Embedding` layer and their `Decoding` layer. -TorchScript does not allow you to export models that have tied weights, so it is -necessary to untie and clone the weights beforehand. - -Models instantiated with the `torchscript` flag have their `Embedding` layer and -`Decoding` layer separated, which means that they should not be trained down the line. -Training would desynchronize the two layers, leading to unexpected results. - -This is not the case for models that do not have a language model head, as those do not -have tied weights. These models can be safely exported without the `torchscript` flag. 
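If you want to see the weight tying that the `torchscript` flag works around, one rough probe is whether the input and output embeddings share storage. This is illustrative only (the exact modules involved differ between architectures), and it uses a masked-LM head because a plain `BertModel` has no decoding layer:

```python
from transformers import BertForMaskedLM

tied_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
untied_model = BertForMaskedLM.from_pretrained("bert-base-uncased", torchscript=True)


def shares_embedding_storage(model):
    # Tied weights point at the same underlying tensor storage
    return (
        model.get_input_embeddings().weight.data_ptr()
        == model.get_output_embeddings().weight.data_ptr()
    )


print(shares_embedding_storage(tied_model))  # True: the LM head reuses the embedding weights
print(shares_embedding_storage(untied_model))  # False: with torchscript=True the weights are untied and cloned
```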
- -## Dummy inputs and standard lengths - -The dummy inputs are used for a models forward pass. While the inputs' values are -propagated through the layers, PyTorch keeps track of the different operations executed -on each tensor. These recorded operations are then used to create the *trace* of the -model. - -The trace is created relative to the inputs' dimensions. It is therefore constrained by -the dimensions of the dummy input, and will not work for any other sequence length or -batch size. When trying with a different size, the following error is raised: - -``` -`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` -``` - -We recommended you trace the model with a dummy input size at least as large as the -largest input that will be fed to the model during inference. Padding can help fill the -missing values. However, since the model is traced with a larger input size, the -dimensions of the matrix will also be large, resulting in more calculations. - -Be careful of the total number of operations done on each input and follow the -performance closely when exporting varying sequence-length models. - -## Using TorchScript in Python - -This section demonstrates how to save and load models as well as how to use the trace -for inference. - -### Saving a model - -To export a `BertModel` with TorchScript, instantiate `BertModel` from the `BertConfig` -class and then save it to disk under the filename `traced_bert.pt`: - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch - -enc = BertTokenizer.from_pretrained("bert-base-uncased") - -# Tokenizing input text -text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) - -# Masking one of the input tokens -masked_index = 8 -tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - -# Creating a dummy input -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) -dummy_input = [tokens_tensor, segments_tensors] - -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. 
-config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, -) - -# Instantiating the model -model = BertModel(config) - -# The model needs to be in evaluation mode -model.eval() - -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag -model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) - -# Creating the trace -traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) -torch.jit.save(traced_model, "traced_bert.pt") -``` - -### Loading a model - -Now you can load the previously saved `BertModel`, `traced_bert.pt`, from disk and use -it on the previously initialised `dummy_input`: - -```python -loaded_model = torch.jit.load("traced_bert.pt") -loaded_model.eval() - -all_encoder_layers, pooled_output = loaded_model(*dummy_input) -``` - -### Using a traced model for inference - -Use the traced model for inference by using its `__call__` dunder method: - -```python -traced_model(tokens_tensor, segments_tensors) -``` - -## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK - -AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) -instance family for low cost, high performance machine learning inference in the cloud. -The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware -accelerator, specializing in deep learning inferencing workloads. [AWS -Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) is the SDK for -Inferentia that supports tracing and optimizing transformers models for deployment on -Inf1. The Neuron SDK provides: - - -1. Easy-to-use API with one line of code change to trace and optimize a TorchScript - model for inference in the cloud. -2. Out of the box performance optimizations for [improved - cost-performance](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). -3. Support for Hugging Face transformers models built with either - [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) - or - [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). - -### Implications - -Transformers models based on the [BERT (Bidirectional Encoder Representations from -Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) -architecture, or its variants such as -[distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) and -[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) run best on -Inf1 for non-generative tasks such as extractive question answering, sequence -classification, and token classification. However, text generation tasks can still be -adapted to run on Inf1 according to this [AWS Neuron MarianMT -tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -More information about models that can be converted out of the box on Inferentia can be -found in the [Model Architecture -Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) -section of the Neuron documentation. 
- -### Dependencies - -Using AWS Neuron to convert models requires a [Neuron SDK -environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) -which comes preconfigured on [AWS Deep Learning -AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). - -### Converting a model for AWS Neuron - -Convert a model for AWS NEURON using the same code from [Using TorchScript in -Python](serialization#using-torchscript-in-python) to trace a `BertModel`. Import the -`torch.neuron` framework extension to access the components of the Neuron SDK through a -Python API: - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch -import torch.neuron -``` - -You only need to modify the following line: - -```diff -- torch.jit.trace(model, [tokens_tensor, segments_tensors]) -+ torch.neuron.trace(model, [token_tensor, segments_tensors]) -``` - -This enables the Neuron SDK to trace the model and optimize it for Inf1 instances. - -To learn more about AWS Neuron SDK features, tools, example tutorials and latest -updates, please see the [AWS NeuronSDK -documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/en/training.md b/docs/source/en/training.md new file mode 100644 index 000000000000..fb4a0b6a279e --- /dev/null +++ b/docs/source/en/training.md @@ -0,0 +1,434 @@ + + +# Fine-tune a pretrained model + +[[open-in-colab]] + +There are significant benefits to using a pretrained model. It reduces computation costs, your carbon footprint, and allows you to use state-of-the-art models without having to train one from scratch. 🤗 Transformers provides access to thousands of pretrained models for a wide range of tasks. When you use a pretrained model, you train it on a dataset specific to your task. This is known as fine-tuning, an incredibly powerful training technique. In this tutorial, you will fine-tune a pretrained model with a deep learning framework of your choice: + +* Fine-tune a pretrained model with 🤗 Transformers [`Trainer`]. +* Fine-tune a pretrained model in TensorFlow with Keras. +* Fine-tune a pretrained model in native PyTorch. + + + +## Prepare a dataset + + + +Before you can fine-tune a pretrained model, download a dataset and prepare it for training. The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test! + +Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full) dataset: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. 
She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Train + +At this point, you should follow the section corresponding to the framework you want to use. You can use the links +in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework, +just use the button at the top-right of that framework's block! + + + + + +## Train with PyTorch Trainer + +🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. + +Start by loading your model and specify the number of expected labels. From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +You will see a warning about some of the pretrained weights not being used and some weights being randomly +initialized. Don't worry, this is completely normal! The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it. + + + +### Training hyperparameters + +Next, create a [`TrainingArguments`] class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings. 
+ +Specify where to save the checkpoints from your training: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Evaluate + +[`Trainer`] does not automatically evaluate model performance during training. You'll need to pass [`Trainer`] a function to compute and report metrics. The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [`evaluate.load`] (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function: + +```py +>>> import numpy as np +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +``` + +Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits): + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: + +```py +>>> from transformers import TrainingArguments, Trainer + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +Create a [`Trainer`] object with your model, training arguments, training and test datasets, and evaluation function: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Then fine-tune your model by calling [`~transformers.Trainer.train`]: + +```py +>>> trainer.train() +``` + + + + + + +## Train a TensorFlow model with Keras + +You can also train 🤗 Transformers models in TensorFlow with the Keras API! + +### Loading data for Keras + +When you want to train a 🤗 Transformers model with the Keras API, you need to convert your dataset to a format that +Keras understands. If your dataset is small, you can just convert the whole thing to NumPy arrays and pass it to Keras. +Let's try that first before we do anything more complicated. + +First, load a dataset. We'll use the CoLA dataset from the [GLUE benchmark](https://huggingface.co/datasets/glue), +since it's a simple binary text classification task, and just take the training split for now. + +```py +from datasets import load_dataset + +dataset = load_dataset("glue", "cola") +dataset = dataset["train"] # Just take the training split for now +``` + +Next, load a tokenizer and tokenize the data as NumPy arrays. Note that the labels are already a list of 0 and 1s, +so we can just convert that directly to a NumPy array without tokenization! 
+ +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) +# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras +tokenized_data = dict(tokenized_data) + +labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 +``` + +Finally, load, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method), and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) the model. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +```py +from transformers import TFAutoModelForSequenceClassification +from tensorflow.keras.optimizers import Adam + +# Load and compile our model +model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") +# Lower learning rates are often better for fine-tuning transformers +model.compile(optimizer=Adam(3e-5)) # No loss argument! + +model.fit(tokenized_data, labels) +``` + + + +You don't have to pass a loss argument to your models when you `compile()` them! Hugging Face models automatically +choose a loss that is appropriate for their task and model architecture if this argument is left blank. You can always +override this by specifying a loss yourself if you want to! + + + +This approach works great for smaller datasets, but for larger datasets, you might find it starts to become a problem. Why? +Because the tokenized array and labels would have to be fully loaded into memory, and because NumPy doesn’t handle +“jagged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole +dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! + +### Loading data as a tf.data.Dataset + +If you want to avoid slowing down training, you can load your data as a `tf.data.Dataset` instead. Although you can write your own +`tf.data` pipeline if you want, we have two convenience methods for doing this: + +- [`~TFPreTrainedModel.prepare_tf_dataset`]: This is the method we recommend in most cases. Because it is a method +on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and +discard the others to make a simpler, more performant dataset. +- [`~datasets.Dataset.to_tf_dataset`]: This method is more low-level, and is useful when you want to exactly control how +your dataset is created, by specifying exactly which `columns` and `label_cols` to include. + +Before you can use [`~TFPreTrainedModel.prepare_tf_dataset`], you will need to add the tokenizer outputs to your dataset as columns, as shown in +the following code sample: + +```py +def tokenize_dataset(data): + # Keys of the returned dictionary will be added to the dataset as columns + return tokenizer(data["text"]) + + +dataset = dataset.map(tokenize_dataset) +``` + +Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the +columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly +reduces the number of padding tokens compared to padding the entire dataset. 
+ + +```py +>>> tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer) +``` + +Note that in the code sample above, you need to pass the tokenizer to `prepare_tf_dataset` so it can correctly pad batches as they're loaded. +If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument. +If you need to do something more complex than just padding samples (e.g. corrupting tokens for masked language +modelling), you can use the `collate_fn` argument instead to pass a function that will be called to transform the +list of samples into a batch and apply any preprocessing you want. See our +[examples](https://github.com/huggingface/transformers/tree/main/examples) or +[notebooks](https://huggingface.co/docs/transformers/notebooks) to see this approach in action. + +Once you've created a `tf.data.Dataset`, you can compile and fit the model as before: + +```py +model.compile(optimizer=Adam(3e-5)) # No loss argument! + +model.fit(tf_dataset) +``` + + + + + + +## Train in native PyTorch + + + + + +[`Trainer`] takes care of the training loop and allows you to fine-tune a model in a single line of code. For users who prefer to write their own training loop, you can also fine-tune a 🤗 Transformers model in native PyTorch. + +At this point, you may need to restart your notebook or execute the following code to free some memory: + +```py +del model +del trainer +torch.cuda.empty_cache() +``` + +Next, manually postprocess `tokenized_dataset` to prepare it for training. + +1. Remove the `text` column because the model does not accept raw text as an input: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Rename the `label` column to `labels` because the model expects the argument to be named `labels`: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. Set the format of the dataset to return PyTorch tensors instead of lists: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader + +Create a `DataLoader` for your training and test datasets so you can iterate over batches of data: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +Load your model with the number of expected labels: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### Optimizer and learning rate scheduler + +Create an optimizer and learning rate scheduler to fine-tune the model. Let's use the [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +Create the default learning rate scheduler from [`Trainer`]: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... 
name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +... ) +``` + +Lastly, specify `device` to use a GPU if you have access to one. Otherwise, training on a CPU may take several hours instead of a couple of minutes. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +Get free access to a cloud GPU if you don't have one with a hosted notebook like [Colaboratory](https://colab.research.google.com/) or [SageMaker StudioLab](https://studiolab.sagemaker.aws/). + + + +Great, now you are ready to train! 🥳 + +### Training loop + +To keep track of your training progress, use the [tqdm](https://tqdm.github.io/) library to add a progress bar over the number of training steps: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### Evaluate + +Just like how you added an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [`~evaluate.add_batch`] and calculate the metric at the very end. + +```py +>>> import evaluate + +>>> metric = evaluate.load("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## Additional resources + +For more fine-tuning examples, refer to: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) includes scripts + to train common NLP tasks in PyTorch and TensorFlow. + +- [🤗 Transformers Notebooks](notebooks) contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow. diff --git a/docs/source/en/training.mdx b/docs/source/en/training.mdx deleted file mode 100644 index 336ce05b83c0..000000000000 --- a/docs/source/en/training.mdx +++ /dev/null @@ -1,430 +0,0 @@ - - -# Fine-tune a pretrained model - -[[open-in-colab]] - -There are significant benefits to using a pretrained model. It reduces computation costs, your carbon footprint, and allows you to use state-of-the-art models without having to train one from scratch. 🤗 Transformers provides access to thousands of pretrained models for a wide range of tasks. When you use a pretrained model, you train it on a dataset specific to your task. This is known as fine-tuning, an incredibly powerful training technique. In this tutorial, you will fine-tune a pretrained model with a deep learning framework of your choice: - -* Fine-tune a pretrained model with 🤗 Transformers [`Trainer`]. -* Fine-tune a pretrained model in TensorFlow with Keras. -* Fine-tune a pretrained model in native PyTorch. - - - -## Prepare a dataset - - - -Before you can fine-tune a pretrained model, download a dataset and prepare it for training. 
The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test! - -Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full) dataset: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Train - -At this point, you should follow the section corresponding to the framework you want to use. You can use the links -in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework, -just use the button at the top-right of that framework's block! - - - - - -## Train with PyTorch Trainer - -🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. - -Start by loading your model and specify the number of expected labels. 
From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -You will see a warning about some of the pretrained weights not being used and some weights being randomly -initialized. Don't worry, this is completely normal! The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it. - - - -### Training hyperparameters - -Next, create a [`TrainingArguments`] class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings. - -Specify where to save the checkpoints from your training: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Evaluate - -[`Trainer`] does not automatically evaluate model performance during training. You'll need to pass [`Trainer`] a function to compute and report metrics. The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [`evaluate.load`] (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function: - -```py ->>> import numpy as np ->>> import evaluate - ->>> metric = evaluate.load("accuracy") -``` - -Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits): - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Create a [`Trainer`] object with your model, training arguments, training and test datasets, and evaluation function: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Then fine-tune your model by calling [`~transformers.Trainer.train`]: - -```py ->>> trainer.train() -``` - - - - - - -## Train a TensorFlow model with Keras - -You can also train 🤗 Transformers models in TensorFlow with the Keras API! - -### Loading data for Keras - -When you want to train a 🤗 Transformers model with the Keras API, you need to convert your dataset to a format that -Keras understands. 
If your dataset is small, you can just convert the whole thing to NumPy arrays and pass it to Keras. -Let's try that first before we do anything more complicated. - -First, load a dataset. We'll use the CoLA dataset from the [GLUE benchmark](https://huggingface.co/datasets/glue), -since it's a simple binary text classification task, and just take the training split for now. - -```py -from datasets import load_dataset - -dataset = load_dataset("glue", "cola") -dataset = dataset["train"] # Just take the training split for now -``` - -Next, load a tokenizer and tokenize the data as NumPy arrays. Note that the labels are already a list of 0 and 1s, -so we can just convert that directly to a NumPy array without tokenization! - -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True) -# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras -tokenized_data = dict(tokenized_data) - -labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 -``` - -Finally, load, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method), and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) the model: - -```py -from transformers import TFAutoModelForSequenceClassification -from tensorflow.keras.optimizers import Adam - -# Load and compile our model -model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") -# Lower learning rates are often better for fine-tuning transformers -model.compile(optimizer=Adam(3e-5)) - -model.fit(tokenized_data, labels) -``` - - - -You don't have to pass a loss argument to your models when you `compile()` them! Hugging Face models automatically -choose a loss that is appropriate for their task and model architecture if this argument is left blank. You can always -override this by specifying a loss yourself if you want to! - - - -This approach works great for smaller datasets, but for larger datasets, you might find it starts to become a problem. Why? -Because the tokenized array and labels would have to be fully loaded into memory, and because NumPy doesn’t handle -“jagged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole -dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! - -### Loading data as a tf.data.Dataset - -If you want to avoid slowing down training, you can load your data as a `tf.data.Dataset` instead. Although you can write your own -`tf.data` pipeline if you want, we have two convenience methods for doing this: - -- [`~TFPreTrainedModel.prepare_tf_dataset`]: This is the method we recommend in most cases. Because it is a method -on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and -discard the others to make a simpler, more performant dataset. -- [`~datasets.Dataset.to_tf_dataset`]: This method is more low-level, and is useful when you want to exactly control how -your dataset is created, by specifying exactly which `columns` and `label_cols` to include. 
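For the lower-level route, a [`~datasets.Dataset.to_tf_dataset`] call might look like the sketch below; the column names and the data collator are assumptions for a typical tokenized text-classification dataset rather than something this tutorial sets up:

```py
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_dataset = tokenized_dataset.to_tf_dataset(
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=["label"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)
```

In most cases, though, [`~TFPreTrainedModel.prepare_tf_dataset`] described below is the simpler option.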
- -Before you can use [`~TFPreTrainedModel.prepare_tf_dataset`], you will need to add the tokenizer outputs to your dataset as columns, as shown in -the following code sample: - -```py -def tokenize_dataset(data): - # Keys of the returned dictionary will be added to the dataset as columns - return tokenizer(data["text"]) - - -dataset = dataset.map(tokenize_dataset) -``` - -Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the -columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly -reduces the number of padding tokens compared to padding the entire dataset. - - -```py ->>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) -``` - -Note that in the code sample above, you need to pass the tokenizer to `prepare_tf_dataset` so it can correctly pad batches as they're loaded. -If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument. -If you need to do something more complex than just padding samples (e.g. corrupting tokens for masked language -modelling), you can use the `collate_fn` argument instead to pass a function that will be called to transform the -list of samples into a batch and apply any preprocessing you want. See our -[examples](https://github.com/huggingface/transformers/tree/main/examples) or -[notebooks](https://huggingface.co/docs/transformers/notebooks) to see this approach in action. - -Once you've created a `tf.data.Dataset`, you can compile and fit the model as before: - -```py -model.compile(optimizer=Adam(3e-5)) - -model.fit(tf_dataset) -``` - - - - - - -## Train in native PyTorch - - - - - -[`Trainer`] takes care of the training loop and allows you to fine-tune a model in a single line of code. For users who prefer to write their own training loop, you can also fine-tune a 🤗 Transformers model in native PyTorch. - -At this point, you may need to restart your notebook or execute the following code to free some memory: - -```py -del model -del trainer -torch.cuda.empty_cache() -``` - -Next, manually postprocess `tokenized_dataset` to prepare it for training. - -1. Remove the `text` column because the model does not accept raw text as an input: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Rename the `label` column to `labels` because the model expects the argument to be named `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. 
Set the format of the dataset to return PyTorch tensors instead of lists: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Create a `DataLoader` for your training and test datasets so you can iterate over batches of data: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Load your model with the number of expected labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Optimizer and learning rate scheduler - -Create an optimizer and learning rate scheduler to fine-tune the model. Let's use the [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Create the default learning rate scheduler from [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Lastly, specify `device` to use a GPU if you have access to one. Otherwise, training on a CPU may take several hours instead of a couple of minutes. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Get free access to a cloud GPU if you don't have one with a hosted notebook like [Colaboratory](https://colab.research.google.com/) or [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Great, now you are ready to train! 🥳 - -### Training loop - -To keep track of your training progress, use the [tqdm](https://tqdm.github.io/) library to add a progress bar over the number of training steps: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Evaluate - -Just like how you added an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [`~evaluate.add_batch`] and calculate the metric at the very end. - -```py ->>> import evaluate - ->>> metric = evaluate.load("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... 
metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Additional resources - -For more fine-tuning examples, refer to: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) includes scripts - to train common NLP tasks in PyTorch and TensorFlow. - -- [🤗 Transformers Notebooks](notebooks) contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow. diff --git a/docs/source/en/transformers_agents.md b/docs/source/en/transformers_agents.md new file mode 100644 index 000000000000..424f5b15f5dd --- /dev/null +++ b/docs/source/en/transformers_agents.md @@ -0,0 +1,323 @@ + + +# Transformers Agents + + + +Transformers Agents is an experimental API which is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +Transformers version v4.29.0, building on the concept of *tools* and *agents*. You can play with in +[this colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj). + +In short, it provides a natural language API on top of transformers: we define a set of curated tools and design an +agent to interpret natural language and to use these tools. It is extensible by design; we curated some relevant tools, +but we'll show you how the system can be extended easily to use any tool developed by the community. + +Let's start with a few examples of what can be achieved with this new API. It is particularly powerful when it comes +to multimodal tasks, so let's take it for a spin to generate images and read text out loud. + +```py +agent.run("Caption the following image", image=image) +``` + +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------| +| | A beaver is swimming in the water | + +--- + +```py +agent.run("Read the following text out loud", text=text) +``` +| **Input** | **Output** | +|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------| +| A beaver is swimming in the water | + +--- + +```py +agent.run( + "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?", + document=document, +) +``` +| **Input** | **Output** | +|-----------------------------------------------------------------------------------------------------------------------------|----------------| +| | ballroom foyer | + +## Quickstart + +Before being able to use `agent.run`, you will need to instantiate an agent, which is a large language model (LLM). +We provide support for openAI models as well as opensource alternatives from BigCode and OpenAssistant. The openAI +models perform better (but require you to have an openAI API key, so cannot be used for free); Hugging Face is +providing free access to endpoints for BigCode and OpenAssistant models. + +To start with, please install the `agents` extras in order to install all default dependencies. 
+```bash +pip install transformers[agents] +``` + +To use openAI models, you instantiate an [`OpenAiAgent`] after installing the `openai` dependency: + +```bash +pip install openai +``` + + +```py +from transformers import OpenAiAgent + +agent = OpenAiAgent(model="text-davinci-003", api_key="") +``` + +To use BigCode or OpenAssistant, start by logging in to have access to the Inference API: + +```py +from huggingface_hub import login + +login("") +``` + +Then, instantiate the agent + +```py +from transformers import HfAgent + +# Starcoder +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +# StarcoderBase +# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase") +# OpenAssistant +# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5") +``` + +This is using the inference API that Hugging Face provides for free at the moment. If you have your own inference +endpoint for this model (or another one) you can replace the URL above with your URL endpoint. + + + +StarCoder and OpenAssistant are free to use and perform admirably well on simple tasks. However, the checkpoints +don't hold up when handling more complex prompts. If you're facing such an issue, we recommend trying out the OpenAI +model which, while sadly not open-source, performs better at this given time. + + + +You're now good to go! Let's dive into the two APIs that you now have at your disposal. + +### Single execution (run) + +The single execution method is when using the [`~Agent.run`] method of the agent: + +```py +agent.run("Draw me a picture of rivers and lakes.") +``` + + + +It automatically selects the tool (or tools) appropriate for the task you want to perform and runs them appropriately. It +can perform one or several tasks in the same instruction (though the more complex your instruction, the more likely +the agent is to fail). + +```py +agent.run("Draw me a picture of the sea then transform the picture to add an island") +``` + + + +
+ + +Every [`~Agent.run`] operation is independent, so you can run it several times in a row with different tasks. + +Note that your `agent` is just a large-language model, so small variations in your prompt might yield completely +different results. It's important to explain as clearly as possible the task you want to perform. We go more in-depth +on how to write good prompts [here](custom_tools#writing-good-user-inputs). + +If you'd like to keep a state across executions or to pass non-text objects to the agent, you can do so by specifying +variables that you would like the agent to use. For example, you could generate the first image of rivers and lakes, +and ask the model to update that picture to add an island by doing the following: + +```python +picture = agent.run("Generate a picture of rivers and lakes.") +updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture) +``` + + + +This can be helpful when the model is unable to understand your request and mixes tools. An example would be: + +```py +agent.run("Draw me the picture of a capybara swimming in the sea") +``` + +Here, the model could interpret in two ways: +- Have the `text-to-image` generate a capybara swimming in the sea +- Or, have the `text-to-image` generate capybara, then use the `image-transformation` tool to have it swim in the sea + +In case you would like to force the first scenario, you could do so by passing it the prompt as an argument: + +```py +agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea") +``` + + + + +### Chat-based execution (chat) + +The agent also has a chat-based approach, using the [`~Agent.chat`] method: + +```py +agent.chat("Generate a picture of rivers and lakes") +``` + + + +```py +agent.chat("Transform the picture so that there is a rock in there") +``` + + + +
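+As with [`~Agent.run`], you can hand non-text objects to `chat` by passing them as keyword arguments. A minimal
+sketch (the `picture` variable is just illustrative, reusing an image generated earlier):
+
+```py
+picture = agent.run("Generate a picture of rivers and lakes.")
+agent.chat("Transform the image in `picture` so that there is a rock in there", picture=picture)
+```
+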
+
+This is an interesting approach when you want to keep the state across instructions. It is better suited for
+experimentation, but it tends to handle single instructions much better than complex, multi-step instructions
+(which the [`~Agent.run`] method is better at handling).
+
+The `chat` method can also take arguments if you would like to pass non-text types or specific prompts, as sketched
+above.
+
+### ⚠️ Remote execution
+
+For demonstration purposes, and so that it could be used with all setups, we created remote executors for several
+of the default tools the agent has access to for the release. These are created using
+[inference endpoints](https://huggingface.co/inference-endpoints).
+
+We have turned these off for now, but to see how to set up remote executor tools yourself,
+we recommend reading the [custom tool guide](./custom_tools).
+
+### What's happening here? What are tools, and what are agents?
+
+#### Agents
+
+The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools.
+
+LLMs are pretty good at generating small samples of code, so this API takes advantage of that by prompting the
+LLM to give a small sample of code that performs a task with a set of tools. This prompt is then completed by the
+task you give your agent and the description of the tools you give it. This way it gets access to the documentation
+of the tools you are using, especially their expected inputs and outputs, and can generate the relevant code.
+
+#### Tools
+
+Tools are very simple: each one is a single function, with a name and a description. We then use these tools'
+descriptions to prompt the agent. Through the prompt, we show the agent how it would leverage tools to perform what
+was requested in the query.
+
+This uses brand-new tools and not pipelines, because the agent writes better code with very atomic tools.
+Pipelines are more refactored and often combine several tasks into one. Tools are meant to be focused on
+one very simple task only.
+
+#### Code-execution?!
+
+This code is then executed with our small Python interpreter on the set of inputs passed along with your tools.
+We hear you screaming "Arbitrary code execution!" in the back, but let us explain why that is not the case.
+
+The only functions that can be called are the tools you provided and the print function, so you're already
+limited in what can be executed. You should be safe if it's limited to Hugging Face tools.
+
+Then, we don't allow any attribute lookup or imports (which shouldn't be needed anyway for passing along
+inputs/outputs to a small set of functions), so all the most obvious attacks (and you'd need to prompt the LLM
+to output them anyway) shouldn't be an issue. If you want to be on the super safe side, you can execute the
+`run()` method with the additional argument `return_code=True`, in which case the agent will just return the code
+to execute and you can decide whether to run it or not.
+
+The execution will stop at any line trying to perform an illegal operation, or if there is a regular Python error
+with the code generated by the agent.
+
+### A curated set of tools
+
+We identify a set of tools that can empower such agents.
Here is an updated list of the tools we have integrated +in `transformers`: + +- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut)) +- **Text question answering**: given a long text and a question, answer the question in the text ([Flan-T5](./model_doc/flan-t5)) +- **Unconditional image captioning**: Caption the image! ([BLIP](./model_doc/blip)) +- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt)) +- **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt ([CLIPSeg](./model_doc/clipseg)) +- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper)) +- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5)) +- **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most ([BART](./model_doc/bart)) +- **Text summarization**: summarize a long text in one or a few sentences ([BART](./model_doc/bart)) +- **Translation**: translate the text into a given language ([NLLB](./model_doc/nllb)) + +These tools have an integration in transformers, and can be used manually as well, for example: + +```py +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + +### Custom tools + +While we identify a curated set of tools, we strongly believe that the main value provided by this implementation is +the ability to quickly create and share custom tools. + +By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool +directly with the agent. We've added a few +**transformers-agnostic** tools to the [`huggingface-tools` organization](https://huggingface.co/huggingface-tools): + +- **Text downloader**: to download a text from a web URL +- **Text to image**: generate an image according to a prompt, leveraging stable diffusion +- **Image transformation**: modify an image given an initial image and a prompt, leveraging instruct pix2pix stable diffusion +- **Text to video**: generate a small video according to a prompt, leveraging damo-vilab + +The text-to-image tool we have been using since the beginning is a remote tool that lives in +[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! We will +continue releasing such tools on this and other organizations, to further supercharge this implementation. + +The agents have by default access to tools that reside on [`huggingface-tools`](https://huggingface.co/huggingface-tools). +We explain how to you can write and share your tools as well as leverage any custom tool that resides on the Hub in [following guide](custom_tools). + +### Code generation + +So far we have shown how to use the agents to perform actions for you. However, the agent is only generating code +that we then execute using a very restricted Python interpreter. In case you would like to use the code generated in +a different setting, the agent can be prompted to return the code, along with tool definition and accurate imports. 
+ +For example, the following instruction +```python +agent.run("Draw me a picture of rivers and lakes", return_code=True) +``` + +returns the following code + +```python +from transformers import load_tool + +image_generator = load_tool("huggingface-tools/text-to-image") + +image = image_generator(prompt="rivers and lakes") +``` + +that you can then modify and execute yourself. diff --git a/docs/source/en/troubleshooting.md b/docs/source/en/troubleshooting.md new file mode 100644 index 000000000000..29b032dd2799 --- /dev/null +++ b/docs/source/en/troubleshooting.md @@ -0,0 +1,198 @@ + + +# Troubleshoot + +Sometimes errors occur, but we are here to help! This guide covers some of the most common issues we've seen and how you can resolve them. However, this guide isn't meant to be a comprehensive collection of every 🤗 Transformers issue. For more help with troubleshooting your issue, try: + + + +1. Asking for help on the [forums](https://discuss.huggingface.co/). There are specific categories you can post your question to, like [Beginners](https://discuss.huggingface.co/c/beginners/5) or [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). Make sure you write a good descriptive forum post with some reproducible code to maximize the likelihood that your problem is solved! + + + +2. Create an [Issue](https://github.com/huggingface/transformers/issues/new/choose) on the 🤗 Transformers repository if it is a bug related to the library. Try to include as much information describing the bug as possible to help us better figure out what's wrong and how we can fix it. + +3. Check the [Migration](migration) guide if you use an older version of 🤗 Transformers since some important changes have been introduced between versions. + +For more details about troubleshooting and getting help, take a look at [Chapter 8](https://huggingface.co/course/chapter8/1?fw=pt) of the Hugging Face course. + + +## Firewalled environments + +Some GPU instances on cloud and intranet setups are firewalled to external connections, resulting in a connection error. When your script attempts to download model weights or datasets, the download will hang and then timeout with the following message: + +``` +ValueError: Connection error, and we cannot find the requested files in the cached path. +Please try again or make sure your Internet connection is on. +``` + +In this case, you should try to run 🤗 Transformers on [offline mode](installation#offline-mode) to avoid the connection error. + +## CUDA out of memory + +Training large models with millions of parameters can be challenging without the appropriate hardware. A common error you may encounter when the GPU runs out of memory is: + +``` +CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) +``` + +Here are some potential solutions you can try to lessen memory use: + +- Reduce the [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) value in [`TrainingArguments`]. +- Try using [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) in [`TrainingArguments`] to effectively increase overall batch size. + + + +Refer to the Performance [guide](performance) for more details about memory-saving techniques. 
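+As an illustration, both suggestions above map directly to [`TrainingArguments`] parameters. A minimal sketch
+(the values shown are placeholders to illustrate the trade-off, not recommendations):
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> # Lowering the per-device batch size while raising gradient accumulation keeps the effective
+>>> # batch size (per_device_train_batch_size * gradient_accumulation_steps) roughly constant
+>>> # while reducing peak GPU memory usage.
+>>> training_args = TrainingArguments(
+...     output_dir="test_trainer",
+...     per_device_train_batch_size=4,  # placeholder: lower until the OOM error disappears
+...     gradient_accumulation_steps=4,  # placeholder: raise to preserve the effective batch size
+... )
+```
+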
+ + + +## Unable to load a saved TensorFlow model + +TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) method will save the entire model - architecture, weights, training configuration - in a single file. However, when you load the model file again, you may run into an error because 🤗 Transformers may not load all the TensorFlow-related objects in the model file. To avoid issues with saving and loading TensorFlow models, we recommend you: + +- Save the model weights as a `h5` file extension with [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) and then reload the model with [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> from transformers import TFPreTrainedModel +>>> from tensorflow import keras + +>>> model.save_weights("some_folder/tf_model.h5") +>>> model = TFPreTrainedModel.from_pretrained("some_folder") +``` + +- Save the model with [`~TFPretrainedModel.save_pretrained`] and load it again with [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> from transformers import TFPreTrainedModel + +>>> model.save_pretrained("path_to/model") +>>> model = TFPreTrainedModel.from_pretrained("path_to/model") +``` + +## ImportError + +Another common error you may encounter, especially if it is a newly released model, is `ImportError`: + +``` +ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) +``` + +For these error types, check to make sure you have the latest version of 🤗 Transformers installed to access the most recent models: + +```bash +pip install transformers --upgrade +``` + +## CUDA error: device-side assert triggered + +Sometimes you may run into a generic CUDA error about an error in the device code. + +``` +RuntimeError: CUDA error: device-side assert triggered +``` + +You should try to run the code on a CPU first to get a more descriptive error message. Add the following environment variable to the beginning of your code to switch to a CPU: + +```py +>>> import os + +>>> os.environ["CUDA_VISIBLE_DEVICES"] = "" +``` + +Another option is to get a better traceback from the GPU. Add the following environment variable to the beginning of your code to get the traceback to point to the source of the error: + +```py +>>> import os + +>>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1" +``` + +## Incorrect output when padding tokens aren't masked + +In some cases, the output `hidden_state` may be incorrect if the `input_ids` include padding tokens. To demonstrate, load a model and tokenizer. You can access a model's `pad_token_id` to see its value. The `pad_token_id` may be `None` for some models, but you can always manually set it. 
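+For example, a common way to set it manually is to reuse an existing special token; this is a sketch that assumes
+your tokenizer defines an EOS token:
+
+```py
+>>> tokenizer.pad_token = tokenizer.eos_token  # assumes the tokenizer has an EOS token
+>>> model.config.pad_token_id = tokenizer.pad_token_id
+```
+
+The `bert-base-uncased` checkpoint used below already comes with `pad_token_id = 0`, so no manual step is needed here:
+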
+ +```py +>>> from transformers import AutoModelForSequenceClassification +>>> import torch + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") +>>> model.config.pad_token_id +0 +``` + +The following example shows the output without masking the padding tokens: + +```py +>>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], + [ 0.1317, -0.1683]], grad_fn=) +``` + +Here is the actual output of the second sequence: + +```py +>>> input_ids = torch.tensor([[7592]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[-0.1008, -0.4061]], grad_fn=) +``` + +Most of the time, you should provide an `attention_mask` to your model to ignore the padding tokens to avoid this silent error. Now the output of the second sequence matches its actual output: + + + +By default, the tokenizer creates an `attention_mask` for you based on your specific tokenizer's defaults. + + + +```py +>>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids, attention_mask=attention_mask) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], + [-0.1008, -0.4061]], grad_fn=) +``` + +🤗 Transformers doesn't automatically create an `attention_mask` to mask a padding token if it is provided because: + +- Some models don't have a padding token. +- For some use-cases, users want a model to attend to a padding token. + +## ValueError: Unrecognized configuration class XYZ for this kind of AutoModel + +Generally, we recommend using the [`AutoModel`] class to load pretrained instances of models. This class +can automatically infer and load the correct architecture from a given checkpoint based on the configuration. If you see +this `ValueError` when loading a model from a checkpoint, this means the Auto class couldn't find a mapping from +the configuration in the given checkpoint to the kind of model you are trying to load. Most commonly, this happens when a +checkpoint doesn't support a given task. +For instance, you'll see this error in the following example because there is no GPT2 for question answering: + +```py +>>> from transformers import AutoProcessor, AutoModelForQuestionAnswering + +>>> processor = AutoProcessor.from_pretrained("gpt2-medium") +>>> model = AutoModelForQuestionAnswering.from_pretrained("gpt2-medium") +ValueError: Unrecognized configuration class for this kind of AutoModel: AutoModelForQuestionAnswering. +Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ... +``` diff --git a/docs/source/en/troubleshooting.mdx b/docs/source/en/troubleshooting.mdx deleted file mode 100644 index 74346bccef97..000000000000 --- a/docs/source/en/troubleshooting.mdx +++ /dev/null @@ -1,176 +0,0 @@ - - -# Troubleshoot - -Sometimes errors occur, but we are here to help! This guide covers some of the most common issues we've seen and how you can resolve them. However, this guide isn't meant to be a comprehensive collection of every 🤗 Transformers issue. For more help with troubleshooting your issue, try: - - - -1. Asking for help on the [forums](https://discuss.huggingface.co/). There are specific categories you can post your question to, like [Beginners](https://discuss.huggingface.co/c/beginners/5) or [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). 
Make sure you write a good descriptive forum post with some reproducible code to maximize the likelihood that your problem is solved! - - - -2. Create an [Issue](https://github.com/huggingface/transformers/issues/new/choose) on the 🤗 Transformers repository if it is a bug related to the library. Try to include as much information describing the bug as possible to help us better figure out what's wrong and how we can fix it. - -3. Check the [Migration](migration) guide if you use an older version of 🤗 Transformers since some important changes have been introduced between versions. - -For more details about troubleshooting and getting help, take a look at [Chapter 8](https://huggingface.co/course/chapter8/1?fw=pt) of the Hugging Face course. - - -## Firewalled environments - -Some GPU instances on cloud and intranet setups are firewalled to external connections, resulting in a connection error. When your script attempts to download model weights or datasets, the download will hang and then timeout with the following message: - -``` -ValueError: Connection error, and we cannot find the requested files in the cached path. -Please try again or make sure your Internet connection is on. -``` - -In this case, you should try to run 🤗 Transformers on [offline mode](installation#offline-mode) to avoid the connection error. - -## CUDA out of memory - -Training large models with millions of parameters can be challenging without the appropriate hardware. A common error you may encounter when the GPU runs out of memory is: - -``` -CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) -``` - -Here are some potential solutions you can try to lessen memory use: - -- Reduce the [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) value in [`TrainingArguments`]. -- Try using [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) in [`TrainingArguments`] to effectively increase overall batch size. - - - -Refer to the Performance [guide](performance) for more details about memory-saving techniques. - - - -## Unable to load a saved TensorFlow model - -TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) method will save the entire model - architecture, weights, training configuration - in a single file. However, when you load the model file again, you may run into an error because 🤗 Transformers may not load all the TensorFlow-related objects in the model file. 
To avoid issues with saving and loading TensorFlow models, we recommend you: - -- Save the model weights as a `h5` file extension with [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) and then reload the model with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> from transformers import TFPreTrainedModel ->>> from tensorflow import keras - ->>> model.save_weights("some_folder/tf_model.h5") ->>> model = TFPreTrainedModel.from_pretrained("some_folder") -``` - -- Save the model with [`~TFPretrainedModel.save_pretrained`] and load it again with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> from transformers import TFPreTrainedModel - ->>> model.save_pretrained("path_to/model") ->>> model = TFPreTrainedModel.from_pretrained("path_to/model") -``` - -## ImportError - -Another common error you may encounter, especially if it is a newly released model, is `ImportError`: - -``` -ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) -``` - -For these error types, check to make sure you have the latest version of 🤗 Transformers installed to access the most recent models: - -```bash -pip install transformers --upgrade -``` - -## CUDA error: device-side assert triggered - -Sometimes you may run into a generic CUDA error about an error in the device code. - -``` -RuntimeError: CUDA error: device-side assert triggered -``` - -You should try to run the code on a CPU first to get a more descriptive error message. Add the following environment variable to the beginning of your code to switch to a CPU: - -```py ->>> import os - ->>> os.environ["CUDA_VISIBLE_DEVICES"] = "" -``` - -Another option is to get a better traceback from the GPU. Add the following environment variable to the beginning of your code to get the traceback to point to the source of the error: - -```py ->>> import os - ->>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1" -``` - -## Incorrect output when padding tokens aren't masked - -In some cases, the output `hidden_state` may be incorrect if the `input_ids` include padding tokens. To demonstrate, load a model and tokenizer. You can access a model's `pad_token_id` to see its value. The `pad_token_id` may be `None` for some models, but you can always manually set it. - -```py ->>> from transformers import AutoModelForSequenceClassification ->>> import torch - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") ->>> model.config.pad_token_id -0 -``` - -The following example shows the output without masking the padding tokens: - -```py ->>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) ->>> output = model(input_ids) ->>> print(output.logits) -tensor([[ 0.0082, -0.2307], - [ 0.1317, -0.1683]], grad_fn=) -``` - -Here is the actual output of the second sequence: - -```py ->>> input_ids = torch.tensor([[7592]]) ->>> output = model(input_ids) ->>> print(output.logits) -tensor([[-0.1008, -0.4061]], grad_fn=) -``` - -Most of the time, you should provide an `attention_mask` to your model to ignore the padding tokens to avoid this silent error. Now the output of the second sequence matches its actual output: - - - -By default, the tokenizer creates an `attention_mask` for you based on your specific tokenizer's defaults. 
- - - -```py ->>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) ->>> output = model(input_ids, attention_mask=attention_mask) ->>> print(output.logits) -tensor([[ 0.0082, -0.2307], - [-0.1008, -0.4061]], grad_fn=) -``` - -🤗 Transformers doesn't automatically create an `attention_mask` to mask a padding token if it is provided because: - -- Some models don't have a padding token. -- For some use-cases, users want a model to attend to a padding token. \ No newline at end of file diff --git a/docs/source/es/_toctree.yml b/docs/source/es/_toctree.yml index e107915dc121..dd110b746c6e 100644 --- a/docs/source/es/_toctree.yml +++ b/docs/source/es/_toctree.yml @@ -62,13 +62,15 @@ - sections: - local: debugging title: Debugging - title: Rendimiento y escalabilidad + title: Rendimiento y escalabilidad - sections: - local: add_new_pipeline title: ¿Cómo puedo añadir un pipeline a 🤗 Transformers? - local: pr_checks title: Verificaciones en un Pull Request title: Contribuir + - local: community + title: Los recursos de la comunidad title: Guías prácticas - sections: - local: philosophy diff --git a/docs/source/es/accelerate.md b/docs/source/es/accelerate.md new file mode 100644 index 000000000000..2c4063b7ca3b --- /dev/null +++ b/docs/source/es/accelerate.md @@ -0,0 +1,136 @@ + + +# Entrenamiento distribuido con 🤗 Accelerate + +El paralelismo ha emergido como una estrategia para entrenar modelos grandes en hardware limitado e incrementar la velocidad de entrenamiento en varios órdenes de magnitud. En Hugging Face creamos la biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate) para ayudar a los usuarios a entrenar modelos 🤗 Transformers en cualquier tipo de configuración distribuida, ya sea en una máquina con múltiples GPUs o en múltiples GPUs distribuidas entre muchas máquinas. En este tutorial aprenderás cómo personalizar tu bucle de entrenamiento de PyTorch nativo para poder entrenar en entornos distribuidos. + +## Configuración + +Empecemos por instalar 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` detectará automáticamente el tipo de configuración distribuida que tengas disponible e inicializará todos los componentes necesarios para el entrenamiento. No necesitas especificar el dispositivo en donde se debe colocar tu modelo. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Prepárate para acelerar + +Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Esto incluye los DataLoaders de entrenamiento y evaluación, un modelo y un optimizador: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +Por último, reemplaza el típico `loss.backward()` en tu bucle de entrenamiento con el método [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) de 🤗 Accelerate: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... 
progress_bar.update(1) +``` + +Como se puede ver en el siguiente código, ¡solo necesitas adicionar cuatro líneas de código a tu bucle de entrenamiento para habilitar el entrenamiento distribuido! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Entrenamiento + +Una vez que hayas añadido las líneas de código relevantes, inicia el entrenamiento desde un script o notebook como Colaboratory. + +### Entrenar con un script + +Si estás corriendo tu entrenamiento desde un script ejecuta el siguiente comando para crear y guardar un archivo de configuración: + +```bash +accelerate config +``` + +Comienza el entrenamiento con: + +```bash +accelerate launch train.py +``` + +### Entrenar con un notebook + +🤗 Accelerate puede correr en un notebook si, por ejemplo, estás planeando utilizar las TPUs de Colaboratory. Encierra el código responsable del entrenamiento en una función y pásalo a `notebook_launcher`: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +Para obtener más información sobre 🤗 Accelerate y sus numerosas funciones, consulta la [documentación](https://huggingface.co/docs/accelerate). diff --git a/docs/source/es/accelerate.mdx b/docs/source/es/accelerate.mdx deleted file mode 100644 index 6065bc110a1d..000000000000 --- a/docs/source/es/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Entrenamiento distribuido con 🤗 Accelerate - -El paralelismo ha emergido como una estrategia para entrenar modelos grandes en hardware limitado e incrementar la velocidad de entrenamiento en varios órdenes de magnitud. En Hugging Face creamos la biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate) para ayudar a los usuarios a entrenar modelos 🤗 Transformers en cualquier tipo de configuración distribuida, ya sea en una máquina con múltiples GPUs o en múltiples GPUs distribuidas entre muchas máquinas. En este tutorial aprenderás cómo personalizar tu bucle de entrenamiento de PyTorch nativo para poder entrenar en entornos distribuidos. - -## Configuración - -Empecemos por instalar 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` detectará automáticamente el tipo de configuración distribuida que tengas disponible e inicializará todos los componentes necesarios para el entrenamiento. 
No necesitas especificar el dispositivo en donde se debe colocar tu modelo. - -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Prepárate para acelerar - -Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Esto incluye los DataLoaders de entrenamiento y evaluación, un modelo y un optimizador: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Backward - -Por último, reemplaza el típico `loss.backward()` en tu bucle de entrenamiento con el método [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) de 🤗 Accelerate: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -Como se puede ver en el siguiente código, ¡solo necesitas adicionar cuatro líneas de código a tu bucle de entrenamiento para habilitar el entrenamiento distribuido! - -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Entrenamiento - -Una vez que hayas añadido las líneas de código relevantes, inicia el entrenamiento desde un script o notebook como Colaboratory. - -### Entrenar con un script - -Si estás corriendo tu entrenamiento desde un script ejecuta el siguiente comando para crear y guardar un archivo de configuración: - -```bash -accelerate config -``` - -Comienza el entrenamiento con: - -```bash -accelerate launch train.py -``` - -### Entrenar con un notebook - -🤗 Accelerate puede correr en un notebook si, por ejemplo, estás planeando utilizar las TPUs de Colaboratory. Encierra el código responsable del entrenamiento en una función y pásalo a `notebook_launcher`: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -Para obtener más información sobre 🤗 Accelerate y sus numerosas funciones, consulta la [documentación](https://huggingface.co/docs/accelerate). 
diff --git a/docs/source/es/add_new_pipeline.md b/docs/source/es/add_new_pipeline.md new file mode 100644 index 000000000000..5e64c435ab98 --- /dev/null +++ b/docs/source/es/add_new_pipeline.md @@ -0,0 +1,264 @@ + + +# ¿Cómo puedo crear un pipeline personalizado? + +En esta guía, veremos cómo crear un pipeline personalizado y cómo compartirlo en el [Hub](hf.co/models) o añadirlo +a la biblioteca 🤗 Transformers. + +En primer lugar, debes decidir las entradas que tu pipeline podrá recibir. Pueden ser strings, bytes, +diccionarios o lo que te parezca que vaya a ser la entrada más apropiada. Intenta mantener estas entradas en un +formato que sea tan Python puro como sea posible, puesto que esto facilita la compatibilidad (incluso con otros +lenguajes de programación por medio de JSON). Estos serán los `inputs` (entradas) del pipeline (`preprocess`). + +Ahora debes definir los `outputs` (salidas). Al igual que con los `inputs`, entre más simple el formato, mejor. +Estas serán las salidas del método `postprocess` (posprocesamiento). + +Empieza heredando la clase base `Pipeline` con los 4 métodos que debemos implementar: `preprocess` (preprocesamiento), +`_forward` (ejecución), `postprocess` (posprocesamiento) y `_sanitize_parameters` (verificar parámetros). + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Quizá {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +La estructura de este desglose es así para garantizar una compatibilidad más o menos transparente con el uso de +CPU/GPU y el pre/posprocesamiento en CPU en varios hilos. + +`preprocess` tomará las entradas definidas originalmente y las convertirá en algo que se le pueda pasar al modelo. +Podría contener más información y a menudo es un objeto `Dict` (diccionario). + +`_forward` contiene los detalles de la implementación y no debería ser invocado de forma directa. `forward` es el +método preferido a utilizar pues contiene verificaciones para asegurar que todo funcione en el dispositivo correcto. +Cualquier cosa que esté relacionada con un modelo real debería ir en el método `_forward`, todo lo demás va en +los métodos de preprocesamiento y posprocesamiento. + +Los métodos `postprocess` reciben la salida `_forward` y la convierten en la salida final que decidimos +anteriormente. + +`_sanitize_parameters` existe para permitir a los usuarios pasar cualesquiera parámetros cuando lo deseen, ya +sea al momento de inicializar el pipeline `pipeline(...., maybe_arg=4)` o al momento de invocarlo +`pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. + + +El método `_sanitize_parameters` devuelve 3 diccionarios de kwargs que serán pasados directamente a `preprocess`, +`_forward` y `postprocess`. No ingreses nada si el caller no se va a invocar con parámetros adicionales. +Esto permite mantener los parámetros por defecto de la definición de la función, lo que es más "natural". 
+ +Un ejemplo clásico sería un argumento `top_k` en el posprocesamiento de una tarea de clasificación. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +Para lograrlo, actualizaremos nuestro método `postprocess` con un valor por defecto de `5` y modificaremos +`_sanitize_parameters` para permitir este nuevo parámetro. + + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Añade la lógica para manejar el top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Intenta que las entradas y salidas sean muy simples e, idealmente, que puedan serializarse como JSON, pues esto +hace el uso del pipeline muy sencillo sin que el usuario tenga que preocuparse por conocer nuevos tipos de objetos. +También es relativamente común tener compatibilidad con muchos tipos diferentes de argumentos por facilidad de uso +(por ejemplo, los archivos de audio pueden ser nombres de archivo, URLs o bytes). + + +## Añadirlo a la lista de tareas + +Para registrar tu `new-task` (nueva tarea) en la lista de tareas, debes añadirla al +`PIPELINE_REGISTRY` (registro de pipelines): + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +Puedes especificar un modelo por defecto si lo deseas, en cuyo caso debe venir con una versión específica (que puede ser el nombre de un branch o hash de commit, en este caso usamos `"abcdef"`), así como el tipo: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # tipo de datos que maneja: texto, audio, imagen, multi-modalidad +) +``` + +## Comparte tu pipeline en el Hub + +Para compartir tu pipeline personalizado en el Hub, solo tienes que guardar el código personalizado de tu sub-clase +`Pipeline` en un archivo de Python. 
Por ejemplo, digamos que queremos usar un pipeline personalizado para la +clasificación de duplas de oraciones de esta forma: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +La implementación es independiente del framework y funcionará con modelos de PyTorch y TensorFlow. Si guardamos +esto en un archivo llamado `pair_classification.py`, podemos importarlo y registrarlo de la siguiente manera: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Una vez hecho esto, podemos usarlo con un modelo pre-entrenado. Por ejemplo, al modelo `sgugger/finetuned-bert-mrpc` +se le hizo fine-tuning con el dataset MRPC, en el cual se clasifican duplas de oraciones como paráfrasis o no. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Ahora podemos compartirlo en el Hub usando el método `save_pretrained` (guardar pre-entrenado) en un `Repository`: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +Esto copiará el archivo donde definiste `PairClassificationPipeline` dentro de la carpeta `"test-dynamic-pipeline"`, +y además guardará el modelo y el tokenizer del pipeline, antes de enviar todo al repositorio +`{your_username}/test-dynamic-pipeline`. Después de esto, cualquier persona puede usarlo siempre que usen la opción +`trust_remote_code=True` (confiar en código remoto): + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Añadir el pipeline a 🤗 Transformers + +Si quieres contribuir tu pipeline a la biblioteca 🤗 Transformers, tendrás que añadirlo a un nuevo módulo en el +sub-módulo `pipelines` con el código de tu pipeline. Luego, debes añadirlo a la lista de tareas definidas en +`pipelines/__init__.py`. + +A continuación tienes que añadir las pruebas. Crea un nuevo archivo llamado `tests/test_pipelines_MY_PIPELINE.py` +basándote en las pruebas existentes. 
+ +La función `run_pipeline_test` será muy genérica y se correrá sobre modelos pequeños escogidos al azar sobre todas las +arquitecturas posibles definidas en `model_mapping` y `tf_model_mapping`. + +Esto es muy importante para probar compatibilidades a futuro, lo que significa que si alguien añade un nuevo modelo +para `XXXForQuestionAnswering` entonces el pipeline intentará ejecutarse con ese modelo. Ya que los modelos son aleatorios, +es imposible verificar los valores como tales, y es por eso que hay un helper `ANY` que simplemente intentará que la +salida tenga el mismo tipo que la salida esperada del pipeline. + +También *debes* implementar 2 (preferiblemente 4) pruebas: + +- `test_small_model_pt` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) +y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_tf`. +- `test_small_model_tf` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) +y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_pt`. +- `test_large_model_pt` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. +Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que +no haya divergencias en versiones futuras. +- `test_large_model_tf` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. +Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que +no haya divergencias en versiones futuras. diff --git a/docs/source/es/add_new_pipeline.mdx b/docs/source/es/add_new_pipeline.mdx deleted file mode 100644 index 8e022077972f..000000000000 --- a/docs/source/es/add_new_pipeline.mdx +++ /dev/null @@ -1,260 +0,0 @@ - - -# ¿Cómo puedo crear un pipeline personalizado? - -En esta guía, veremos cómo crear un pipeline personalizado y cómo compartirlo en el [Hub](hf.co/models) o añadirlo -a la biblioteca 🤗 Transformers. - -En primer lugar, debes decidir las entradas que tu pipeline podrá recibir. Pueden ser strings, bytes, -diccionarios o lo que te parezca que vaya a ser la entrada más apropiada. Intenta mantener estas entradas en un -formato que sea tan Python puro como sea posible, puesto que esto facilita la compatibilidad (incluso con otros -lenguajes de programación por medio de JSON). Estos serán los `inputs` (entradas) del pipeline (`preprocess`). - -Ahora debes definir los `outputs` (salidas). Al igual que con los `inputs`, entre más simple el formato, mejor. -Estas serán las salidas del método `postprocess` (posprocesamiento). - -Empieza heredando la clase base `Pipeline` con los 4 métodos que debemos implementar: `preprocess` (preprocesamiento), -`_forward` (ejecución), `postprocess` (posprocesamiento) y `_sanitize_parameters` (verificar parámetros). 
- -```python -from transformers import Pipeline - - -class MyPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} - - def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Quizá {"logits": Tensor(...)} - return outputs - - def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class -``` - -La estructura de este desglose es así para garantizar una compatibilidad más o menos transparente con el uso de -CPU/GPU y el pre/posprocesamiento en CPU en varios hilos. - -`preprocess` tomará las entradas definidas originalmente y las convertirá en algo que se le pueda pasar al modelo. -Podría contener más información y a menudo es un objeto `Dict` (diccionario). - -`_forward` contiene los detalles de la implementación y no debería ser invocado de forma directa. `forward` es el -método preferido a utilizar pues contiene verificaciones para asegurar que todo funcione en el dispositivo correcto. -Cualquier cosa que esté relacionada con un modelo real debería ir en el método `_forward`, todo lo demás va en -los métodos de preprocesamiento y posprocesamiento. - -Los métodos `postprocess` reciben la salida `_forward` y la convierten en la salida final que decidimos -anteriormente. - -`_sanitize_parameters` existe para permitir a los usuarios pasar cualesquiera parámetros cuando lo deseen, ya -sea al momento de inicializar el pipeline `pipeline(...., maybe_arg=4)` o al momento de invocarlo -`pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - - -El método `_sanitize_parameters` devuelve 3 diccionarios de kwargs que serán pasados directamente a `preprocess`, -`_forward` y `postprocess`. No ingreses nada si el caller no se va a invocar con parámetros adicionales. -Esto permite mantener los parámetros por defecto de la definición de la función, lo que es más "natural". - -Un ejemplo clásico sería un argumento `top_k` en el posprocesamiento de una tarea de clasificación. - -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] - ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] -``` - -Para lograrlo, actualizaremos nuestro método `postprocess` con un valor por defecto de `5` y modificaremos -`_sanitize_parameters` para permitir este nuevo parámetro. 
- - -```python -def postprocess(self, model_outputs, top_k=5): - best_class = model_outputs["logits"].softmax(-1) - # Añade la lógica para manejar el top_k - return best_class - - -def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - - postprocess_kwargs = {} - if "top_k" in kwargs: - postprocess_kwargs["top_k"] = kwargs["top_k"] - return preprocess_kwargs, {}, postprocess_kwargs -``` - -Intenta que las entradas y salidas sean muy simples e, idealmente, que puedan serializarse como JSON, pues esto -hace el uso del pipeline muy sencillo sin que el usuario tenga que preocuparse por conocer nuevos tipos de objetos. -También es relativamente común tener compatibilidad con muchos tipos diferentes de argumentos por facilidad de uso -(por ejemplo, los archivos de audio pueden ser nombres de archivo, URLs o bytes). - - -## Añadirlo a la lista de tareas - -Para registrar tu `new-task` (nueva tarea) en la lista de tareas, debes añadirla al -`PIPELINE_REGISTRY` (registro de pipelines): - -```python -from transformers.pipelines import PIPELINE_REGISTRY - -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, -) -``` - -Puedes especificar un modelo por defecto si lo deseas, en cuyo caso debe venir con una versión específica (que puede ser el nombre de un branch o hash de commit, en este caso usamos `"abcdef"`), así como el tipo: - -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # tipo de datos que maneja: texto, audio, imagen, multi-modalidad -) -``` - -## Comparte tu pipeline en el Hub - -Para compartir tu pipeline personalizado en el Hub, solo tienes que guardar el código personalizado de tu sub-clase -`Pipeline` en un archivo de Python. Por ejemplo, digamos que queremos usar un pipeline personalizado para la -clasificación de duplas de oraciones de esta forma: - -```py -import numpy as np - -from transformers import Pipeline - - -def softmax(outputs): - maxes = np.max(outputs, axis=-1, keepdims=True) - shifted_exp = np.exp(outputs - maxes) - return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - - -class PairClassificationPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "second_text" in kwargs: - preprocess_kwargs["second_text"] = kwargs["second_text"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, second_text=None): - return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) - - def _forward(self, model_inputs): - return self.model(**model_inputs) - - def postprocess(self, model_outputs): - logits = model_outputs.logits[0].numpy() - probabilities = softmax(logits) - - best_class = np.argmax(probabilities) - label = self.model.config.id2label[best_class] - score = probabilities[best_class].item() - logits = logits.tolist() - return {"label": label, "score": score, "logits": logits} -``` - -La implementación es independiente del framework y funcionará con modelos de PyTorch y TensorFlow. 
Si guardamos -esto en un archivo llamado `pair_classification.py`, podemos importarlo y registrarlo de la siguiente manera: - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -Una vez hecho esto, podemos usarlo con un modelo pre-entrenado. Por ejemplo, al modelo `sgugger/finetuned-bert-mrpc` -se le hizo fine-tuning con el dataset MRPC, en el cual se clasifican duplas de oraciones como paráfrasis o no. - -```py -from transformers import pipeline - -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Ahora podemos compartirlo en el Hub usando el método `save_pretrained` (guardar pre-entrenado) en un `Repository`: - -```py -from huggingface_hub import Repository - -repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") -classifier.save_pretrained("test-dynamic-pipeline") -repo.push_to_hub() -``` - -Esto copiará el archivo donde definiste `PairClassificationPipeline` dentro de la carpeta `"test-dynamic-pipeline"`, -y además guardará el modelo y el tokenizer del pipeline, antes de enviar todo al repositorio -`{your_username}/test-dynamic-pipeline`. Después de esto, cualquier persona puede usarlo siempre que usen la opción -`trust_remote_code=True` (confiar en código remoto): - -```py -from transformers import pipeline - -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) -``` - -## Añadir el pipeline a 🤗 Transformers - -Si quieres contribuir tu pipeline a la biblioteca 🤗 Transformers, tendrás que añadirlo a un nuevo módulo en el -sub-módulo `pipelines` con el código de tu pipeline. Luego, debes añadirlo a la lista de tareas definidas en -`pipelines/__init__.py`. - -A continuación tienes que añadir las pruebas. Crea un nuevo archivo llamado `tests/test_pipelines_MY_PIPELINE.py` -basándote en las pruebas existentes. - -La función `run_pipeline_test` será muy genérica y se correrá sobre modelos pequeños escogidos al azar sobre todas las -arquitecturas posibles definidas en `model_mapping` y `tf_model_mapping`. - -Esto es muy importante para probar compatibilidades a futuro, lo que significa que si alguien añade un nuevo modelo -para `XXXForQuestionAnswering` entonces el pipeline intentará ejecutarse con ese modelo. Ya que los modelos son aleatorios, -es imposible verificar los valores como tales, y es por eso que hay un helper `ANY` que simplemente intentará que la -salida tenga el mismo tipo que la salida esperada del pipeline. - -También *debes* implementar 2 (preferiblemente 4) pruebas: - -- `test_small_model_pt` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) -y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_tf`. -- `test_small_model_tf` : Define un (1) modelo pequeño para este pipeline (no importa si los resultados no tienen sentido) -y prueba las salidas del pipeline. Los resultados deberían ser los mismos que en `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. 
-Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que -no haya divergencias en versiones futuras. -- `test_large_model_tf` (`optional`): Prueba el pipeline en una tarea real en la que los resultados deben tener sentido. -Estas pruebas son lentas y deben marcarse como tales. El objetivo de esto es ejemplificar el pipeline y asegurarse de que -no haya divergencias en versiones futuras. diff --git a/docs/source/es/autoclass_tutorial.md b/docs/source/es/autoclass_tutorial.md new file mode 100644 index 000000000000..8b3ddd230b6b --- /dev/null +++ b/docs/source/es/autoclass_tutorial.md @@ -0,0 +1,123 @@ + + +# Carga instancias preentrenadas con un AutoClass + +Con tantas arquitecturas diferentes de Transformer puede ser retador crear una para tu checkpoint. Como parte de la filosofía central de 🤗 Transformers para hacer que la biblioteca sea fácil, simple y flexible de usar; una `AutoClass` automáticamente infiere y carga la arquitectura correcta desde un checkpoint dado. El método `from_pretrained` te permite cargar rápidamente un modelo preentrenado para cualquier arquitectura, por lo que no tendrás que dedicar tiempo y recursos para entrenar uno desde cero. Producir este tipo de código con checkpoint implica que si funciona con uno, funcionará también con otro (siempre que haya sido entrenado para una tarea similar) incluso si la arquitectura es distinta. + + + +Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/bert-base-uncased) es una arquitectura, mientras que `bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint. + + + +En este tutorial, aprenderás a: + +* Cargar un tokenizador pre-entrenado. +* Cargar un extractor de características (feature extractor en inglés) pre-entrenado. +* Cargar un procesador pre-entrenado. +* Cargar un modelo pre-entrenado. + +## AutoTokenizer + +Casi cualquier tarea de Procesamiento de Lenguaje Natural comienza con un tokenizador. Un tokenizador convierte tu input a un formato que puede ser procesado por el modelo. + +Carga un tokenizador con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +Luego tokeniza tu input como lo mostrado a continuación: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoFeatureExtractor + +Para tareas de audio y visión, un extractor de características procesa la señal de audio o imagen al formato de input correcto. + +Carga un extractor de características con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor + +Las tareas multimodales requieren un procesador que combine dos tipos de herramientas de preprocesamiento. 
Por ejemplo, el modelo [LayoutLMV2](model_doc/layoutlmv2) requiere que un extractor de características maneje las imágenes y que un tokenizador maneje el texto; un procesador combina ambas. + +Carga un procesador con [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel + + + +Finalmente, las clases `AutoModelFor` te permiten cargar un modelo preentrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`AutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Reutiliza fácilmente el mismo checkpoint para cargar una arquitectura para alguna tarea diferente: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente recomendamos utilizar las clases `AutoTokenizer` y `AutoModelFor` para cargar instancias pre-entrenadas de modelos. Esto asegurará que cargues la arquitectura correcta en cada ocasión. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. + + +Finalmente, la clase `TFAutoModelFor` te permite cargar tu modelo pre-entrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Reutiliza fácilmente el mismo checkpoint para cargar una arquitectura para alguna tarea diferente: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente recomendamos utilizar las clases `AutoTokenizer` y `TFAutoModelFor` para cargar instancias de modelos pre-entrenados. Esto asegurará que cargues la arquitectura correcta cada vez. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. + + diff --git a/docs/source/es/autoclass_tutorial.mdx b/docs/source/es/autoclass_tutorial.mdx deleted file mode 100644 index e04a639422bb..000000000000 --- a/docs/source/es/autoclass_tutorial.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - -# Carga instancias preentrenadas con un AutoClass - -Con tantas arquitecturas diferentes de Transformer puede ser retador crear una para tu checkpoint. Como parte de la filosofía central de 🤗 Transformers para hacer que la biblioteca sea fácil, simple y flexible de usar; una `AutoClass` automáticamente infiere y carga la arquitectura correcta desde un checkpoint dado. El método `from_pretrained` te permite cargar rápidamente un modelo preentrenado para cualquier arquitectura, por lo que no tendrás que dedicar tiempo y recursos para entrenar uno desde cero.
Producir este tipo de código con checkpoint implica que si funciona con uno, funcionará también con otro (siempre que haya sido entrenado para una tarea similar) incluso si la arquitectura es distinta. - - - -Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/bert-base-uncased) es una arquitectura, mientras que `bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint. - - - -En este tutorial, aprenderás a: - -* Cargar un tokenizador pre-entrenado. -* Cargar un extractor de características (feature extractor en inglés) pre-entrenado. -* Cargar un procesador pre-entrenado. -* Cargar un modelo pre-entrenado. - -## AutoTokenizer - -Casi cualquier tarea de Procesamiento de Lenguaje Natural comienza con un tokenizador. Un tokenizador convierte tu input a un formato que puede ser procesado por el modelo. - -Carga un tokenizador con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -Luego tokeniza tu input como lo mostrado a continuación: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoFeatureExtractor - -Para tareas de audio y visión, un extractor de características procesa la señal de audio o imagen al formato de input correcto. - -Carga un extractor de características con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Las tareas multimodales requieren un procesador que combine dos tipos de herramientas de preprocesamiento. Por ejemplo, el modelo [LayoutLMV2](model_doc/layoutlmv2) requiere que un extractor de características maneje las imágenes y que un tokenizador maneje el texto; un procesador combina ambas. - -Carga un procesador con [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Finalmente, las clases `AutoModelFor` te permiten cargar un modelo preentrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, cargue un modelo para clasificación de secuencias con [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente recomendamos utilizar las clases `AutoTokenizer` y `AutoModelFor` para cargar instancias pre-entrenadas de modelos. Ésto asegurará que cargues la arquitectura correcta en cada ocasión. 
En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. - - -Finalmente, la clase `TFAutoModelFor` te permite cargar tu modelo pre-entrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente recomendamos utilizar las clases `AutoTokenizer` y `TFAutoModelFor` para cargar instancias de modelos pre-entrenados. Ésto asegurará que cargues la arquitectura correcta cada vez. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning. - - diff --git a/docs/source/es/bertology.md b/docs/source/es/bertology.md new file mode 100644 index 000000000000..ed4e12a8d59c --- /dev/null +++ b/docs/source/es/bertology.md @@ -0,0 +1,41 @@ + + +# BERTología + +Hay un creciente campo de estudio empeñado en la investigación del funcionamiento interno de los transformers de gran escala como BERT +(que algunos llaman "BERTología"). Algunos buenos ejemplos de este campo son: + + +- BERT Rediscovers the Classical NLP Pipeline por Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? por Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention por Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +Para asistir al desarrollo de este nuevo campo, hemos incluido algunas features adicionales en los modelos BERT/GPT/GPT-2 para +ayudar a acceder a las representaciones internas, principalmente adaptado de la gran obra de Paul Michel +(https://arxiv.org/abs/1905.10650): + + +- accediendo a todos los hidden-states de BERT/GPT/GPT-2, +- accediendo a todos los pesos de atención para cada head de BERT/GPT/GPT-2, +- adquiriendo los valores de salida y gradientes de las heads para poder computar la métrica de importancia de las heads y realizar la poda de heads como se explica + en https://arxiv.org/abs/1905.10650. + +Para ayudarte a entender y usar estas features, hemos añadido un script específico de ejemplo: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) que extrae información y poda un modelo pre-entrenado en +GLUE.
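
Como referencia rápida, este es un boceto mínimo (una ilustración con suposiciones, no el contenido de `bertology.py`) de cómo acceder a los hidden-states, a los pesos de atención y a la poda de heads usando la API pública de la biblioteca:

```py
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Activamos la salida de hidden-states y de pesos de atención
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True, output_attentions=True)

inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(len(outputs.hidden_states))   # embeddings + una entrada por capa
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)

# Poda de heads: un diccionario {índice_de_capa: [índices_de_heads_a_podar]}
model.prune_heads({0: [0, 1]})
```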
diff --git a/docs/source/es/bertology.mdx b/docs/source/es/bertology.mdx deleted file mode 100644 index 4a3a1e551bcf..000000000000 --- a/docs/source/es/bertology.mdx +++ /dev/null @@ -1,36 +0,0 @@ - - -# BERTología - -Hay un creciente campo de estudio empeñado en la investigación del funcionamiento interno de los transformers de gran escala como BERT -(que algunos llaman "BERTología"). Algunos buenos ejemplos de este campo son: - - -- BERT Rediscovers the Classical NLP Pipeline por Ian Tenney, Dipanjan Das, Ellie Pavlick: - https://arxiv.org/abs/1905.05950 -- Are Sixteen Heads Really Better than One? por Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 -- What Does BERT Look At? An Analysis of BERT's Attention por Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. - Manning: https://arxiv.org/abs/1906.04341 - -Para asistir al desarrollo de este nuevo campo, hemos incluido algunas features adicionales en los modelos BERT/GPT/GPT-2 para -ayudar a acceder a las representaciones internas, principalmente adaptado de la gran obra de Paul Michel -(https://arxiv.org/abs/1905.10650): - - -- accediendo a todos los hidden-states de BERT/GPT/GPT-2, -- accediendo a todos los pesos de atención para cada head de BERT/GPT/GPT-2, -- adquiriendo los valores de salida y gradientes de las heads para poder computar la métrica de importancia de las heads y realizar la poda de heads como se explica - en https://arxiv.org/abs/1905.10650. - -Para ayudarte a entender y usar estas features, hemos añadido un script específico de ejemplo: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) mientras extraes información y cortas un modelo pre-entrenado en -GLUE. diff --git a/docs/source/es/community.md b/docs/source/es/community.md new file mode 100644 index 000000000000..261970e6fe7d --- /dev/null +++ b/docs/source/es/community.md @@ -0,0 +1,69 @@ + + +# Comunidad + +Esta página agrupa los recursos de 🤗 Transformers desarrollados por la comunidad. + +## Los recursos de la comunidad: + +| Recurso | Descripción | Autor | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | Un conjunto de flashcards basadas en el [Glosario de documentos de Transformers] (glosario) que se ha puesto en un formato que se puede aprender/revisar fácilmente usando [Anki] (https://apps.ankiweb.net/) una fuente abierta, aplicación de multiplataforma diseñada específicamente para la retención de conocimientos a largo plazo. Ve este [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## Los cuadernos de la comunidad: + +| Cuaderno | Descripción | Autor | | +|:----------|:-------------|:-------------|------:| +| [Ajustar un transformador preentrenado para generar letras](https://github.com/AlekseyKorshuk/huggingartists) | Cómo generar letras al estilo de tu artista favorito ajustando un modelo GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Entrenar T5 en Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | Cómo entrenar a T5 para cualquier tarea usando Tensorflow 2. 
Este cuaderno demuestra una tarea de preguntas y respuestas implementada en Tensorflow 2 usando SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Entrenar T5 en TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | Cómo entrenar a T5 en SQUAD con Transformers y Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Ajustar T5 para Clasificación y Opción Múltiple](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | Cómo ajustar T5 para clasificación y tareas de opción múltiple usando un formato de texto a texto con PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Ajustar DialoGPT en nuevos conjuntos de datos e idiomas](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | Cómo ajustar el modelo DialoGPT en un nuevo conjunto de datos para chatbots conversacionales de diálogo abierto | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Modelado de secuencias largas con Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | Cómo entrenar en secuencias de hasta 500,000 tokens con Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Ajustar BART para resumir](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | Cómo ajustar BART para resumir con fastai usando blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | +| [Ajustar un Transformador previamente entrenado en los tweets de cualquier persona](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | Cómo generar tweets al estilo de tu cuenta de Twitter favorita ajustando un modelo GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimizar 🤗 modelos de Hugging Face con pesos y sesgos](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | Un tutorial completo que muestra la integración de W&B con Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Preentrenar Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | Cómo construir una versión "larga" de modelos preentrenados existentes | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Ajustar Longformer para control de calidad](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | Cómo ajustar el modelo antiguo para la tarea de control de calidad | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluar modelo con 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | Cómo evaluar longformer en TriviaQA con `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Ajustar fino de T5 para la extracción de amplitud de opinión](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | Cómo ajustar T5 para la extracción de intervalos de opiniones mediante un formato de texto a texto con PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Ajustar fino de DistilBert para la clasificación multiclase](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | Cómo ajustar DistilBert para la clasificación multiclase con PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Ajustar BERT para la clasificación de etiquetas múltiples](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| Cómo ajustar BERT para la clasificación de múltiples etiquetas usando PyTorch |[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Ajustar T5 para resumir](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| Cómo ajustar T5 para resumir en PyTorch y realizar un seguimiento de los experimentos con WandB |[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Acelerar el ajuste fino en transformadores con Dynamic Padding/Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)| Cómo acelerar el ajuste fino en un factor de 2 usando relleno dinámico/cubetas |[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Preentrenar Reformer para modelado de lenguaje enmascarado](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| Cómo entrenar un modelo Reformer con capas de autoatención bidireccionales | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Ampliar y ajustar Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| Cómo aumentar el vocabulario de un modelo SciBERT preentrenado de AllenAI en el conjunto de datos CORD y canalizarlo. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Ajustar fino de BlenderBotSmall para resúmenes usando la API de Entrenador](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| Cómo ajustar BlenderBotSmall para resumir en un conjunto de datos personalizado, utilizando la API de Entrenador. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Ajustar Electra e interpreta con gradientes integrados](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | Cómo ajustar Electra para el análisis de sentimientos e interpretar predicciones con Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[ajustar un modelo GPT-2 que no está en inglés con la clase Trainer](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | Cómo ajustar un modelo GPT-2 que no está en inglés con la clase Trainer | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Ajustar un modelo DistilBERT para la tarea de clasificación de múltiples etiquetas](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | Cómo ajustar un modelo DistilBERT para la tarea de clasificación de múltiples etiquetas | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Ajustar ALBERT para la clasificación de pares de oraciones](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | Cómo ajustar un modelo ALBERT u otro modelo basado en BERT para la tarea de clasificación de pares de oraciones | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Ajustar a Roberta para el análisis de sentimientos](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | Cómo ajustar un modelo de Roberta para el análisis de sentimientos | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluación de modelos de generación de preguntas](https://github.com/flexudy-pipe/qugeev) | ¿Qué tan precisas son las respuestas a las preguntas generadas por tu modelo de transformador seq2seq? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Clasificar texto con DistilBERT y Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | Cómo ajustar DistilBERT para la clasificación de texto en TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Aprovechar BERT para el resumen de codificador y decodificador en CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* con un punto de control *bert-base-uncased* para resumir en CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Aprovechar RoBERTa para el resumen de codificador-decodificador en BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* compartido con un punto de control *roberta-base* para resumir en BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Ajustar TAPAS en Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | Cómo ajustar *TapasForQuestionAnswering* con un punto de control *tapas-base* en el conjunto de datos del Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluar TAPAS en Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | Cómo evaluar un *TapasForSequenceClassification* ajustado con un punto de control *tapas-base-finetuned-tabfact* usando una combinación de 🤗 conjuntos de datos y 🤗 bibliotecas de transformadores | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Ajustar de mBART para traducción](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Cómo ajustar mBART utilizando Seq2SeqTrainer para la traducción del hindi al inglés | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Ajustar LayoutLM en FUNSD (a form understanding 
dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | Cómo ajustar *LayoutLMForTokenClassification* en el conjunto de datos de FUNSD para la extracción de información de documentos escaneados | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Ajustar DistilGPT2 y genere texto](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | Cómo ajustar DistilGPT2 y generar texto | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Ajustar LED en tokens de hasta 8K](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | Cómo ajustar LED en pubmed para resúmenes de largo alcance | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluar LED en Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | Cómo evaluar efectivamente LED en resúmenes de largo alcance | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Ajustar fino de LayoutLM en RVL-CDIP (un conjunto de datos de clasificación de imágenes de documentos)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | Cómo ajustar *LayoutLMForSequenceClassification* en el conjunto de datos RVL-CDIP para la clasificación de documentos escaneados | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Decodificación Wav2Vec2 CTC con ajuste GPT2](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | Cómo decodificar la secuencia CTC con el ajuste del modelo de lenguaje | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Ajustar BART para resúmenes en dos idiomas con la clase Trainer](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | Cómo ajustar BART para resúmenes en dos idiomas con la clase Trainer | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluar Big Bird en Trivia 
QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | Cómo evaluar BigBird en respuesta a preguntas de documentos largos en Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Crear subtítulos de video usando Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | Cómo crear subtítulos de YouTube a partir de cualquier vídeo transcribiendo el audio con Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Ajustar el transformador de visión en CIFAR-10 usando PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | Cómo ajustar el transformador de visión (ViT) en CIFAR-10 usando transformadores HuggingFace, conjuntos de datos y PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Ajustar el Transformador de visión en CIFAR-10 usando el 🤗 Entrenador](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | Cómo ajustar el Vision Transformer (ViT) en CIFAR-10 usando HuggingFace Transformers, Datasets y el 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluar LUKE en Open Entity, un conjunto de datos de tipificación de entidades](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | Cómo evaluar *LukeForEntityClassification* en el conjunto de datos de entidad abierta | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluar LUKE en TACRED, un conjunto de datos de extracción de relaciones](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | Cómo evaluar *LukeForEntityPairClassification* en el conjunto de datos TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluar LUKE en CoNLL-2003, un punto de referencia importante de NER](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | Cómo evaluar *LukeForEntitySpanClassification* en el conjunto de datos CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluar BigBird-Pegasus en el conjunto de datos de PubMed](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | Cómo evaluar *BigBirdPegasusForConditionalGeneration* en el conjunto de datos de PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Clasificación de emociones del habla con Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | Cómo aprovechar un modelo Wav2Vec2 preentrenado para la clasificación de emociones en el conjunto de datos MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detectar objetos en una imagen con DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | Cómo usar un modelo entrenado *DetrForObjectDetection* para detectar objetos en una imagen y visualizar la atención | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Ajustar el DETR en un conjunto de datos de detección de objetos personalizados](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | Cómo ajustar *DetrForObjectDetection* en un conjunto de datos de detección de objetos personalizados | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Ajustar T5 para el reconocimiento de entidades nombradas](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | Cómo ajustar *T5* en una tarea de reconocimiento de entidad nombrada | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/es/converting_tensorflow_models.md b/docs/source/es/converting_tensorflow_models.md new file mode 100644 index 000000000000..c7e22bddac70 --- /dev/null +++ b/docs/source/es/converting_tensorflow_models.md @@ -0,0 +1,153 @@ + + +# Convertir checkpoints de Tensorflow + +Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca. 
+ + + +Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0. + +La siguiente documentación refleja el formato para el comando **transformers-cli convert**. + + + +## BERT + +Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). + +Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). + +Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch. + +Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch. + +Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado: + +```bash +export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 + +transformers-cli convert --model_type bert \ + --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ + --config $BERT_BASE_DIR/bert_config.json \ + --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin +``` + +Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models). + +## ALBERT + +Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). + +La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch. + +Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado: + +```bash +export ALBERT_BASE_DIR=/path/to/albert/albert_base + +transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ + --config $ALBERT_BASE_DIR/albert_config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin +``` + +Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models). 
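
Una vez convertido el checkpoint, el modelo resultante se carga con `from_pretrained()`. El siguiente boceto es solo ilustrativo y asume que la carpeta contiene `pytorch_model.bin`, el archivo de vocabulario y que la configuración se guardó (o renombró) como `config.json`:

```py
from transformers import BertModel, BertTokenizer

# Rutas ilustrativas: la misma carpeta usada arriba como $BERT_BASE_DIR
model = BertModel.from_pretrained("/path/to/bert/uncased_L-12_H-768_A-12")
tokenizer = BertTokenizer.from_pretrained("/path/to/bert/uncased_L-12_H-768_A-12")
```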
+ +## OpenAI GPT + +Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)): + +```bash +export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights + +transformers-cli convert --model_type gpt \ + --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT_CONFIG] \ + [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ +``` + +## OpenAI GPT-2 + +Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)): + +```bash +export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights + +transformers-cli convert --model_type gpt2 \ + --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT2_CONFIG] \ + [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] +``` + +## Transformer-XL + +Aquí hay un ejemplo del proceso para convertir un modelo Transformer-XL pre-entrenado (más información [aquí](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): + +```bash +export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint + +transformers-cli convert --model_type transfo_xl \ + --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config TRANSFO_XL_CONFIG] \ + [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] +``` + +## XLNet + +Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado: + +```bash +export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint +export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config + +transformers-cli convert --model_type xlnet \ + --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ + --config $TRANSFO_XL_CONFIG_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--finetuning_task_name XLNET_FINETUNED_TASK] \ +``` + +## XLM + +Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado: + +```bash +export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint + +transformers-cli convert --model_type xlm \ + --tf_checkpoint $XLM_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT + [--config XML_CONFIG] \ + [--finetuning_task_name XML_FINETUNED_TASK] +``` + +## T5 + +Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado: + +```bash +export T5=/path/to/t5/uncased_L-12_H-768_A-12 + +transformers-cli convert --model_type t5 \ + --tf_checkpoint $T5/t5_model.ckpt \ + --config $T5/t5_config.json \ + --pytorch_dump_output $T5/pytorch_model.bin +``` diff --git a/docs/source/es/converting_tensorflow_models.mdx b/docs/source/es/converting_tensorflow_models.mdx deleted file mode 100644 index 2ab15e81b250..000000000000 --- a/docs/source/es/converting_tensorflow_models.mdx +++ /dev/null @@ -1,149 +0,0 @@ - - -# Convertir checkpoints de Tensorflow - -Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca. - - - -Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0. 
- -La siguiente documentación refleja el formato para el comando **transformers-cli convert**. - - - -## BERT - -Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). - -Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). - -Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch. - -Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch. - -Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado: - -```bash -export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin -``` - -Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models). - -## ALBERT - -Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). - -La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch. - -Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado: - -```bash -export ALBERT_BASE_DIR=/path/to/albert/albert_base - -transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin -``` - -Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models). 
- -## OpenAI GPT - -Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)): - -```bash -export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights - -transformers-cli convert --model_type gpt \ - --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT_CONFIG] \ - [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ -``` - -## OpenAI GPT-2 - -Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)): - -```bash -export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights - -transformers-cli convert --model_type gpt2 \ - --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT2_CONFIG] \ - [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] -``` - -## Transformer-XL - -Aquí hay un ejemplo del proceso para convertir un modelo Transformer-XL pre-entrenado (más información [aquí](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): - -```bash -export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint - -transformers-cli convert --model_type transfo_xl \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config TRANSFO_XL_CONFIG] \ - [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] -``` - -## XLNet - -Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado: - -```bash -export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint -export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config - -transformers-cli convert --model_type xlnet \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ - --config $TRANSFO_XL_CONFIG_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--finetuning_task_name XLNET_FINETUNED_TASK] \ -``` - -## XLM - -Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado: - -```bash -export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint - -transformers-cli convert --model_type xlm \ - --tf_checkpoint $XLM_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT - [--config XML_CONFIG] \ - [--finetuning_task_name XML_FINETUNED_TASK] -``` - -## T5 - -Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado: - -```bash -export T5=/path/to/t5/uncased_L-12_H-768_A-12 - -transformers-cli convert --model_type t5 \ - --tf_checkpoint $T5/t5_model.ckpt \ - --config $T5/t5_config.json \ - --pytorch_dump_output $T5/pytorch_model.bin -``` diff --git a/docs/source/es/create_a_model.md b/docs/source/es/create_a_model.md new file mode 100644 index 000000000000..04014a7b6a70 --- /dev/null +++ b/docs/source/es/create_a_model.md @@ -0,0 +1,371 @@ + + +# Crea una arquitectura personalizada + +Una [`AutoClass`](model_doc/auto) infiere, automáticamente, la arquitectura del modelo y descarga la configuración y los pesos del modelo preentrenado. Normalmente, recomendamos usar una `AutoClass` para producir un código agnóstico a puntos de guardado o checkpoints. Sin embargo, los usuarios que quieran más control sobre los parámetros específicos de los modelos pueden crear su propio modelo 🤗 Transformers personalizado a partir de varias clases base. 
Esto puede ser particularmente útil para alguien que esté interesado en estudiar, entrenar o experimentar con modelos 🤗 Transformers. En esta guía vamos a profundizar en la creación de modelos personalizados sin usar `AutoClass`. Aprenderemos a: + +- Cargar y personalizar una configuración para un modelo. +- Crear una arquitectura para un modelo. +- Crear tokenizadores rápidos y lentos para textos. +- Crear un extractor de propiedades para tareas de audio o imágenes. +- Crear un procesador para tareas multimodales. + +## Configuración + +Una [configuración](main_classes/configuration) es un conjunto de atributos específicos de un modelo. Cada configuración de modelo tiene atributos diferentes. Por ejemplo, todos los modelos de PLN tienen los atributos `hidden_size`, `num_attention_heads`, `num_hidden_layers` y `vocab_size` en común. Estos atributos especifican el número de cabezas de atención o de capas ocultas con las que se construyen los modelos. + +Puedes echarle un vistazo a [DistilBERT](model_doc/distilbert) y sus atributos accediendo a [`DistilBertConfig`]: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`] muestra todos los atributos por defecto que se han usado para construir un modelo [`DistilBertModel`] base. Todos ellos son personalizables, lo que deja espacio para poder experimentar. Por ejemplo, puedes personalizar un modelo predeterminado para: + +- Probar una función de activación diferente, usando el parámetro `activation`. +- Usar un valor de abandono (también conocido como _dropout_) más alto para las probabilidades de las capas de atención, usando el parámetro `attention_dropout`. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +Los atributos de los modelos preentrenados pueden ser modificados con la función [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +Cuando estés satisfecho con la configuración de tu modelo, puedes guardarlo con la función [`~PretrainedConfig.save_pretrained`]. Tu configuración se guardará en un archivo JSON dentro del directorio que le especifiques como parámetro. 
+ +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +Para volver a usar el archivo de configuración, puedes cargarlo usando [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +``` + + + +También puedes guardar los archivos de configuración como un diccionario; o incluso guardar solo la diferencia entre tu archivo personalizado y la configuración por defecto. Consulta la [documentación sobre configuración](main_classes/configuration) para ver más detalles. + + + +## Modelo + +El siguiente paso será crear un [modelo](main_classes/models). El modelo, al que a veces también nos referimos como arquitectura, es el encargado de definir cada capa y qué operaciones se realizan. Los atributos como `num_hidden_layers` de la configuración se usan para definir la arquitectura. Todos los modelos comparten una clase base, [`PreTrainedModel`], y algunos métodos comunes que se pueden usar para redimensionar los _embeddings_ o para recortar cabezas de auto-atención (también llamadas _self-attention heads_). Además, todos los modelos son subclases de [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module), lo que significa que son compatibles con su respectivo framework. + + + + +Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> model = DistilBertModel(my_config) +``` + +Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo una fracción de los recursos que un entrenamiento completo hubiera requerido. + +Puedes crear un modelo preentrenado con [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo solo una fracción de los recursos que un entrenamiento completo hubiera requerido. 
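Por ejemplo, un pequeño bosquejo (con ids de tokens arbitrarios, solo a modo ilustrativo) para comprobar que un modelo recién inicializado ya acepta entradas, aunque sus pesos sean aleatorios:

```py
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertModel

my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
tf_model = TFDistilBertModel(my_config)

# ids de tokens arbitrarios: la salida tendrá la forma (batch, longitud, dim)
dummy_ids = tf.constant([[101, 2023, 2003, 1037, 7953, 102]])
outputs = tf_model(dummy_ids)
print(outputs.last_hidden_state.shape)  # (1, 6, 768)
```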
+ +Puedes crear un modelo preentrenado con [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si este nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +### Cabezas de modelo + +En este punto del tutorial, tenemos un modelo DistilBERT base que devuelve los *hidden states* o estados ocultos. Los *hidden states* se pasan como parámetros de entrada a la cabeza del modelo para producir la salida. 🤗 Transformers ofrece una cabeza de modelo diferente para cada tarea, siempre y cuando el modelo sea compatible para la tarea (por ejemplo, no puedes usar DistilBERT para una tarea secuencia a secuencia como la traducción). + + + + + +Por ejemplo, [`DistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`DistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. + + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +Por ejemplo, [`TFDistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`TFDistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. + + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## Tokenizer + +La ultima clase base que debes conocer antes de usar un modelo con datos textuales es la clase [tokenizer](main_classes/tokenizer), que convierte el texto bruto en tensores. Hay dos tipos de *tokenizers* que puedes usar con 🤗 Transformers: + +- [`PreTrainedTokenizer`]: una implementación de un *tokenizer* hecha en Python. +- [`PreTrainedTokenizerFast`]: un *tokenizer* de nuestra librería [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/), basada en Rust. 
Este tipo de *tokenizer* es bastante más rápido, especialmente durante la tokenización por lotes, gracias a estar implementado en Rust. Esta rápida tokenización también ofrece métodos adicionales como el *offset mapping*, que relaciona los tokens con sus palabras o caracteres originales. + +Ambos *tokenizers* son compatibles con los métodos comunes, como los de encodificación y decodificación, los métodos para añadir tokens y aquellos que manejan tokens especiales. + + + +No todos los modelos son compatibles con un *tokenizer* rápido. Échale un vistazo a esta [tabla](index#supported-frameworks) para comprobar si un modelo específico es compatible con un *tokenizer* rápido. + + + +Si has entrenado tu propio *tokenizer*, puedes crear uno desde tu archivo de “vocabulario”: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +Es importante recordar que los vocabularios que provienen de un *tokenizer* personalizado serán diferentes a los vocabularios generados por el *tokenizer* de un modelo preentrenado. Debes usar el vocabulario de un *tokenizer* preentrenado si vas a usar un modelo preentrenado, de lo contrario las entradas no tendrán sentido. Crea un *tokenizer* con el vocabulario de un modelo preentrenado usando la clase [`DistilBertTokenizer`]: + + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]: + + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +Por defecto, el [`AutoTokenizer`] intentará cargar un *tokenizer* rápido. Puedes desactivar este comportamiento cambiando el parámetro `use_fast=False` de `from_pretrained`. + + + + +## Extractor de Características + +Un extractor de características procesa entradas de audio e imagen. Hereda de la clase base [`~feature_extraction_utils.FeatureExtractionMixin`] y también puede heredar de la clase [`ImageFeatureExtractionMixin`] para el procesamiento de características de las imágenes o de la clase [`SequenceFeatureExtractor`] para el procesamiento de entradas de audio. + +Dependiendo de si trabajas en una tarea de audio o de video, puedes crear un extractor de características asociado al modelo que estés usando. Por ejemplo, podrías crear un [`ViTFeatureExtractor`] por defecto si estás usando [ViT](model_doc/vit) para clasificación de imágenes: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> vit_extractor = ViTFeatureExtractor() +>>> print(vit_extractor) +ViTFeatureExtractor { + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +Si no estás buscando ninguna personalización en específico, usa el método `from_pretrained` para cargar los parámetros del extractor de características por defecto del modelo. 
+ + + +Puedes modificar cualquier parámetro de [`ViTFeatureExtractor`] para crear tu extractor de características personalizado: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTFeatureExtractor { + "do_normalize": false, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +Para las entradas de audio, puedes crear un [`Wav2Vec2FeatureExtractor`] y personalizar los parámetros de una forma similar: + + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + +## Procesador + +Para modelos que son compatibles con tareas multimodales, 🤗 Transformers ofrece una clase *procesador* que agrupa un extractor de características y un *tokenizer* en el mismo objeto. Por ejemplo, probemos a usar el procesador [`Wav2Vec2Processor`] para una tarea de reconocimiento de voz (ASR). Un ASR transcribe el audio a texto, por lo que necesitaremos un extractor de características y un *tokenizer*. + +Crea un extractor de características para manejar la entrada de audio: + + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +Crea un *tokenizer* para manejar la entrada de texto: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +Puedes combinar el extractor de características y el *tokenizer* en el [`Wav2Vec2Processor`]: + + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` +Con dos clases base (la configuración y el modelo) y una clase de preprocesamiento adicional (*tokenizer*, extractor de características o procesador), puedes crear cualquiera de los modelos compatibles con 🤗 Transformers. Cada una de estas clases son configurables, permitiéndote usar sus atributos específicos. Puedes crear un modelo para entrenarlo de una forma fácil, o modificar un modelo preentrenado disponible para especializarlo. diff --git a/docs/source/es/create_a_model.mdx b/docs/source/es/create_a_model.mdx deleted file mode 100644 index 99ded53ee653..000000000000 --- a/docs/source/es/create_a_model.mdx +++ /dev/null @@ -1,367 +0,0 @@ - - -# Crea una arquitectura personalizada - -Una [`AutoClass`](model_doc/auto) infiere, automáticamente, la arquitectura del modelo y descarga la configuración y los pesos del modelo preentrenado. Normalmente, recomendamos usar una `AutoClass` para producir un código agnóstico a puntos de guardado o checkpoints. Sin embargo, los usuarios que quieran más control sobre los parámetros específicos de los modelos pueden crear su propio modelo 🤗 Transformers personalizado a partir de varias clases base. Esto puede ser particularmente útil para alguien que esté interesado en estudiar, entrenar o experimentar con modelos 🤗 Transformers. 
En esta guía vamos a profundizar en la creación de modelos personalizados sin usar `AutoClass`. Aprenderemos a: - -- Cargar y personalizar una configuración para un modelo. -- Crear una arquitectura para un modelo. -- Crear tokenizadores rápidos y lentos para textos. -- Crear un extractor de propiedades para tareas de audio o imágenes. -- Crear un procesador para tareas multimodales. - -## Configuración - -Una [configuración](main_classes/configuration) es un conjunto de atributos específicos de un modelo. Cada configuración de modelo tiene atributos diferentes. Por ejemplo, todos los modelos de PLN tienen los atributos `hidden_size`, `num_attention_heads`, `num_hidden_layers` y `vocab_size` en común. Estos atributos especifican el número de cabezas de atención o de capas ocultas con las que se construyen los modelos. - -Puedes echarle un vistazo a [DistilBERT](model_doc/distilbert) y sus atributos accediendo a [`DistilBertConfig`]: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] muestra todos los atributos por defecto que se han usado para construir un modelo [`DistilBertModel`] base. Todos ellos son personalizables, lo que deja espacio para poder experimentar. Por ejemplo, puedes personalizar un modelo predeterminado para: - -- Probar una función de activación diferente, usando el parámetro `activation`. -- Usar un valor de abandono (también conocido como _dropout_) más alto para las probabilidades de las capas de atención, usando el parámetro `attention_dropout`. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Los atributos de los modelos preentrenados pueden ser modificados con la función [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Cuando estés satisfecho con la configuración de tu modelo, puedes guardarlo con la función [`~PretrainedConfig.save_pretrained`]. Tu configuración se guardará en un archivo JSON dentro del directorio que le especifiques como parámetro. - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -Para volver a usar el archivo de configuración, puedes cargarlo usando [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") -``` - - - -También puedes guardar los archivos de configuración como un diccionario; o incluso guardar solo la diferencia entre tu archivo personalizado y la configuración por defecto. 
Consulta la [documentación sobre configuración](main_classes/configuration) para ver más detalles. - - - -## Modelo - -El siguiente paso será crear un [modelo](main_classes/models). El modelo, al que a veces también nos referimos como arquitectura, es el encargado de definir cada capa y qué operaciones se realizan. Los atributos como `num_hidden_layers` de la configuración se usan para definir la arquitectura. Todos los modelos comparten una clase base, [`PreTrainedModel`], y algunos métodos comunes que se pueden usar para redimensionar los _embeddings_ o para recortar cabezas de auto-atención (también llamadas _self-attention heads_). Además, todos los modelos son subclases de [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module), lo que significa que son compatibles con su respectivo framework. - - - - -Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> model = DistilBertModel(my_config) -``` - -Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo una fracción de los recursos que un entrenamiento completo hubiera requerido. - -Puedes crear un modelo preentrenado con [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - -Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - -Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentrenamiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo solo una fracción de los recursos que un entrenamiento completo hubiera requerido. - -Puedes crear un modelo preentrenado con [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si este nos lo proporciona 🤗 Transformers. 
Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - -### Cabezas de modelo - -En este punto del tutorial, tenemos un modelo DistilBERT base que devuelve los *hidden states* o estados ocultos. Los *hidden states* se pasan como parámetros de entrada a la cabeza del modelo para producir la salida. 🤗 Transformers ofrece una cabeza de modelo diferente para cada tarea, siempre y cuando el modelo sea compatible para la tarea (por ejemplo, no puedes usar DistilBERT para una tarea secuencia a secuencia como la traducción). - - - - - -Por ejemplo, [`DistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`DistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. - - -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -Por ejemplo, [`TFDistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`TFDistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. - - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -## Tokenizer - -La ultima clase base que debes conocer antes de usar un modelo con datos textuales es la clase [tokenizer](main_classes/tokenizer), que convierte el texto bruto en tensores. Hay dos tipos de *tokenizers* que puedes usar con 🤗 Transformers: - -- [`PreTrainedTokenizer`]: una implementación de un *tokenizer* hecha en Python. -- [`PreTrainedTokenizerFast`]: un *tokenizer* de nuestra librería [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/), basada en Rust. Este tipo de *tokenizer* es bastante más rápido, especialmente durante la tokenización por lotes, gracias a estar implementado en Rust. Esta rápida tokenización también ofrece métodos adicionales como el *offset mapping*, que relaciona los tokens con sus palabras o caracteres originales. 
- -Ambos *tokenizers* son compatibles con los métodos comunes, como los de encodificación y decodificación, los métodos para añadir tokens y aquellos que manejan tokens especiales. - - - -No todos los modelos son compatibles con un *tokenizer* rápido. Échale un vistazo a esta [tabla](index#supported-frameworks) para comprobar si un modelo específico es compatible con un *tokenizer* rápido. - - - -Si has entrenado tu propio *tokenizer*, puedes crear uno desde tu archivo de “vocabulario”: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -Es importante recordar que los vocabularios que provienen de un *tokenizer* personalizado serán diferentes a los vocabularios generados por el *tokenizer* de un modelo preentrenado. Debes usar el vocabulario de un *tokenizer* preentrenado si vas a usar un modelo preentrenado, de lo contrario las entradas no tendrán sentido. Crea un *tokenizer* con el vocabulario de un modelo preentrenado usando la clase [`DistilBertTokenizer`]: - - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]: - - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") -``` - - - -Por defecto, el [`AutoTokenizer`] intentará cargar un *tokenizer* rápido. Puedes desactivar este comportamiento cambiando el parámetro `use_fast=False` de `from_pretrained`. - - - - -## Extractor de Características - -Un extractor de características procesa entradas de audio e imagen. Hereda de la clase base [`~feature_extraction_utils.FeatureExtractionMixin`] y también puede heredar de la clase [`ImageFeatureExtractionMixin`] para el procesamiento de características de las imágenes o de la clase [`SequenceFeatureExtractor`] para el procesamiento de entradas de audio. - -Dependiendo de si trabajas en una tarea de audio o de video, puedes crear un extractor de características asociado al modelo que estés usando. Por ejemplo, podrías crear un [`ViTFeatureExtractor`] por defecto si estás usando [ViT](model_doc/vit) para clasificación de imágenes: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> vit_extractor = ViTFeatureExtractor() ->>> print(vit_extractor) -ViTFeatureExtractor { - "do_normalize": true, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -Si no estás buscando ninguna personalización en específico, usa el método `from_pretrained` para cargar los parámetros del extractor de características por defecto del modelo. 
- - - -Puedes modificar cualquier parámetro de [`ViTFeatureExtractor`] para crear tu extractor de características personalizado: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTFeatureExtractor { - "do_normalize": false, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -Para las entradas de audio, puedes crear un [`Wav2Vec2FeatureExtractor`] y personalizar los parámetros de una forma similar: - - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - -## Procesador - -Para modelos que son compatibles con tareas multimodales, 🤗 Transformers ofrece una clase *procesador* que agrupa un extractor de características y un *tokenizer* en el mismo objeto. Por ejemplo, probemos a usar el procesador [`Wav2Vec2Processor`] para una tarea de reconocimiento de voz (ASR). Un ASR transcribe el audio a texto, por lo que necesitaremos un extractor de características y un *tokenizer*. - -Crea un extractor de características para manejar la entrada de audio: - - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Crea un *tokenizer* para manejar la entrada de texto: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Puedes combinar el extractor de características y el *tokenizer* en el [`Wav2Vec2Processor`]: - - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` -Con dos clases base (la configuración y el modelo) y una clase de preprocesamiento adicional (*tokenizer*, extractor de características o procesador), puedes crear cualquiera de los modelos compatibles con 🤗 Transformers. Cada una de estas clases son configurables, permitiéndote usar sus atributos específicos. Puedes crear un modelo para entrenarlo de una forma fácil, o modificar un modelo preentrenado disponible para especializarlo. diff --git a/docs/source/es/custom_models.md b/docs/source/es/custom_models.md new file mode 100644 index 000000000000..e616a056055e --- /dev/null +++ b/docs/source/es/custom_models.md @@ -0,0 +1,358 @@ + + +# Compartir modelos personalizados + +La biblioteca 🤗 Transformers está diseñada para ser fácilmente ampliable. Cada modelo está completamente codificado +sin abstracción en una subcarpeta determinada del repositorio, por lo que puedes copiar fácilmente un archivo del modelo +y ajustarlo según tus necesidades. + +Si estás escribiendo un modelo completamente nuevo, podría ser más fácil comenzar desde cero. 
En este tutorial, te mostraremos +cómo escribir un modelo personalizado y su configuración para que pueda usarse dentro de Transformers, y cómo puedes compartirlo +con la comunidad (con el código en el que se basa) para que cualquiera pueda usarlo, incluso si no está presente en la biblioteca +🤗 Transformers. + +Ilustraremos todo esto con un modelo ResNet, envolviendo la clase ResNet de la [biblioteca timm](https://github.com/rwightman/pytorch-image-models) en un [`PreTrainedModel`]. + +## Escribir una configuración personalizada + +Antes de adentrarnos en el modelo, primero escribamos su configuración. La configuración de un modelo es un objeto que +contendrá toda la información necesaria para construir el modelo. Como veremos en la siguiente sección, el modelo solo puede +tomar un `config` para ser inicializado, por lo que realmente necesitamos que ese objeto esté lo más completo posible. + +En nuestro ejemplo, tomaremos un par de argumentos de la clase ResNet que tal vez queramos modificar. Las diferentes +configuraciones nos darán los diferentes tipos de ResNet que son posibles. Luego simplemente almacenamos esos argumentos +después de verificar la validez de algunos de ellos. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +Las tres cosas importantes que debes recordar al escribir tu propia configuración son las siguientes: +- tienes que heredar de `PretrainedConfig`, +- el `__init__` de tu `PretrainedConfig` debe aceptar cualquier `kwargs`, +- esos `kwargs` deben pasarse a la superclase `__init__`. + +La herencia es para asegurarte de obtener toda la funcionalidad de la biblioteca 🤗 Transformers, mientras que las otras dos +restricciones provienen del hecho de que una `PretrainedConfig` tiene más campos que los que estás configurando. Al recargar una +`config` con el método `from_pretrained`, esos campos deben ser aceptados por tu `config` y luego enviados a la superclase. + +Definir un `model_type` para tu configuración (en este caso `model_type="resnet"`) no es obligatorio, a menos que quieras +registrar tu modelo con las clases automáticas (ver la última sección). + +Una vez hecho esto, puedes crear y guardar fácilmente tu configuración como lo harías con cualquier otra configuración de un +modelo de la biblioteca. 
Así es como podemos crear una configuración resnet50d y guardarla:
+
+```py
+resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
+resnet50d_config.save_pretrained("custom-resnet")
+```
+
+Esto guardará un archivo llamado `config.json` dentro de la carpeta `custom-resnet`. Luego puedes volver a cargar tu configuración con el método `from_pretrained`:
+
+```py
+resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
+```
+
+También puedes usar cualquier otro método de la clase [`PretrainedConfig`], como [`~PretrainedConfig.push_to_hub`], para subir directamente tu configuración al Hub.
+
+## Escribir un modelo personalizado
+
+Ahora que tenemos nuestra configuración de ResNet, podemos seguir escribiendo el modelo. En realidad escribiremos dos: uno que extrae las características ocultas de un grupo de imágenes (como [`BertModel`]) y otro que es adecuado para clasificación de imágenes (como [`BertForSequenceClassification`]).
+
+Como mencionamos antes, solo escribiremos una envoltura (_wrapper_) ligera del modelo para simplificar este ejemplo. Lo único que debemos hacer antes de escribir esta clase es definir un mapeo entre los tipos de bloques y las clases de bloques reales. Luego se define el modelo desde la configuración pasando todo a la clase `ResNet`:
+
+```py
+from transformers import PreTrainedModel
+from timm.models.resnet import BasicBlock, Bottleneck, ResNet
+from .configuration_resnet import ResnetConfig
+
+
+BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
+
+
+class ResnetModel(PreTrainedModel):
+    config_class = ResnetConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        block_layer = BLOCK_MAPPING[config.block_type]
+        self.model = ResNet(
+            block_layer,
+            config.layers,
+            num_classes=config.num_classes,
+            in_chans=config.input_channels,
+            cardinality=config.cardinality,
+            base_width=config.base_width,
+            stem_width=config.stem_width,
+            stem_type=config.stem_type,
+            avg_down=config.avg_down,
+        )
+
+    def forward(self, tensor):
+        return self.model.forward_features(tensor)
+```
+
+Para el modelo que clasificará las imágenes, solo cambiamos el método de avance (es decir, el método `forward`):
+
+```py
+import torch
+
+
+class ResnetModelForImageClassification(PreTrainedModel):
+    config_class = ResnetConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        block_layer = BLOCK_MAPPING[config.block_type]
+        self.model = ResNet(
+            block_layer,
+            config.layers,
+            num_classes=config.num_classes,
+            in_chans=config.input_channels,
+            cardinality=config.cardinality,
+            base_width=config.base_width,
+            stem_width=config.stem_width,
+            stem_type=config.stem_type,
+            avg_down=config.avg_down,
+        )
+
+    def forward(self, tensor, labels=None):
+        logits = self.model(tensor)
+        if labels is not None:
+            # pérdida estándar de clasificación (entropía cruzada)
+            loss = torch.nn.functional.cross_entropy(logits, labels)
+            return {"loss": loss, "logits": logits}
+        return {"logits": logits}
+```
+
+En ambos casos, observa cómo heredamos de `PreTrainedModel` y llamamos a la inicialización de la superclase con `config` (un poco como cuando escribes `torch.nn.Module`). La línea que establece `config_class` no es obligatoria, a menos que quieras registrar tu modelo con las clases automáticas (consulta la última sección).
+
+
+
+Si tu modelo es muy similar a un modelo dentro de la biblioteca, puedes reutilizar la misma configuración de ese modelo.
+ + + +Puedes hacer que tu modelo devuelva lo que quieras, pero devolver un diccionario como lo hicimos para +`ResnetModelForImageClassification`, con el `loss` incluido cuando se pasan las etiquetas, hará que tu modelo se pueda +usar directamente dentro de la clase [`Trainer`]. Usar otro formato de salida está bien, siempre y cuando estés planeando usar +tu propio bucle de entrenamiento u otra biblioteca para el entrenamiento. + +Ahora que tenemos nuestra clase, vamos a crear un modelo: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +Nuevamente, puedes usar cualquiera de los métodos de [`PreTrainedModel`], como [`~PreTrainedModel.save_pretrained`] o +[`~PreTrainedModel.push_to_hub`]. Usaremos el segundo en la siguiente sección y veremos cómo pasar los pesos del modelo +con el código de nuestro modelo. Pero primero, carguemos algunos pesos previamente entrenados dentro de nuestro modelo. + +En tu caso de uso, probablemente estarás entrenando tu modelo personalizado con tus propios datos. Para ir rápido en este +tutorial, usaremos la versión preentrenada de resnet50d. Dado que nuestro modelo es solo un envoltorio alrededor del resnet50d +original, será fácil transferir esos pesos: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Ahora veamos cómo asegurarnos de que cuando hacemos [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], +se guarda el código del modelo. + +## Enviar el código al _Hub_ + + + +Esta _API_ es experimental y puede tener algunos cambios leves en las próximas versiones. + + + +Primero, asegúrate de que tu modelo esté completamente definido en un archivo `.py`. Puedes basarte en importaciones +relativas a otros archivos, siempre que todos los archivos estén en el mismo directorio (aún no admitimos submódulos +para esta característica). Para nuestro ejemplo, definiremos un archivo `modeling_resnet.py` y un archivo +`configuration_resnet.py` en una carpeta del directorio de trabajo actual llamado `resnet_model`. El archivo de configuración +contiene el código de `ResnetConfig` y el archivo del modelo contiene el código de `ResnetModel` y +`ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +El `__init__.py` puede estar vacío, solo está ahí para que Python detecte que `resnet_model` se puede usar como un módulo. + + + +Si copias archivos del modelo desde la biblioteca, deberás reemplazar todas las importaciones relativas en la parte superior +del archivo para importarlos desde el paquete `transformers`. + + + +Ten en cuenta que puedes reutilizar (o subclasificar) una configuración o modelo existente. 
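Como bosquejo puramente ilustrativo de esa idea (los nombres `MyBertConfig` y `extra_dropout` son hipotéticos), subclasificar una configuración existente de la biblioteca sigue las mismas tres reglas descritas arriba:

```py
from transformers import BertConfig


class MyBertConfig(BertConfig):
    # un model_type propio evita chocar con los tipos ya registrados en la biblioteca
    model_type = "my-bert"

    def __init__(self, extra_dropout: float = 0.1, **kwargs):
        self.extra_dropout = extra_dropout
        super().__init__(**kwargs)


my_config = MyBertConfig(extra_dropout=0.2, num_hidden_layers=6)
my_config.save_pretrained("my-bert-config")  # se puede recargar con MyBertConfig.from_pretrained
```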
+ +Para compartir tu modelo con la comunidad, sigue estos pasos: primero importa el modelo y la configuración de ResNet desde +los archivos recién creados: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +Luego, debes decirle a la biblioteca que deseas copiar el código de esos objetos cuando usas el método `save_pretrained` +y registrarlos correctamente con una determinada clase automática (especialmente para modelos), simplemente ejecuta: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +Ten en cuenta que no es necesario especificar una clase automática para la configuración (solo hay una clase automática +para ellos, [`AutoConfig`]), pero es diferente para los modelos. Tu modelo personalizado podría ser adecuado para muchas +tareas diferentes, por lo que debes especificar cuál de las clases automáticas es la correcta para tu modelo. + +A continuación, vamos a crear la configuración y los modelos como lo hicimos antes: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Ahora, para enviar el modelo al Hub, asegúrate de haber iniciado sesión. Ejecuta en tu terminal: + +```bash +huggingface-cli login +``` + +o desde un _notebook_: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +Luego puedes ingresar a tu propio espacio (o una organización de la que seas miembro) de esta manera: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +Además de los pesos del modelo y la configuración en formato json, esto también copió los archivos `.py` del modelo y la +configuración en la carpeta `custom-resnet50d` y subió el resultado al Hub. Puedes verificar el resultado en este +[repositorio de modelos](https://huggingface.co/sgugger/custom-resnet50d). + +Consulta el tutorial sobre cómo [compartir modelos](model_sharing) para obtener más información sobre el método para subir modelos al Hub. + +## Usar un modelo con código personalizado + +Puedes usar cualquier configuración, modelo o _tokenizador_ con archivos de código personalizado en tu repositorio con las +clases automáticas y el método `from_pretrained`. Todos los archivos y códigos cargados en el Hub se analizan en busca de +malware (consulta la documentación de [seguridad del Hub](https://huggingface.co/docs/hub/security#malware-scanning) para +obtener más información), pero aún debes revisar el código del modelo y el autor para evitar la ejecución de código malicioso +en tu computadora. Configura `trust_remote_code=True` para usar un modelo con código personalizado: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +También se recomienda encarecidamente pasar un _hash_ de confirmación como una "revisión" para asegurarte de que el autor +de los modelos no actualizó el código con algunas líneas nuevas maliciosas (a menos que confíes plenamente en los autores +de los modelos). 
+ +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Ten en cuenta que al navegar por el historial de confirmaciones del repositorio del modelo en Hub, hay un botón para copiar +fácilmente el hash de confirmación de cualquier _commit_. + +## Registrar un model con código personalizado a las clases automáticas + +Si estás escribiendo una biblioteca que amplía 🤗 Transformers, es posible que quieras ampliar las clases automáticas para +incluir tu propio modelo. Esto es diferente de enviar el código al Hub en el sentido de que los usuarios necesitarán importar +tu biblioteca para obtener los modelos personalizados (al contrario de descargar automáticamente el código del modelo desde Hub). + +Siempre que tu configuración tenga un atributo `model_type` que sea diferente de los tipos de modelos existentes, y que tus +clases modelo tengan los atributos `config_class` correctos, puedes agregarlos a las clases automáticas de la siguiente manera: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +Ten en cuenta que el primer argumento utilizado al registrar tu configuración personalizada en [`AutoConfig`] debe coincidir +con el `model_type` de tu configuración personalizada, y el primer argumento utilizado al registrar tus modelos personalizados +en cualquier clase del modelo automático debe coincidir con el `config_class ` de esos modelos. diff --git a/docs/source/es/custom_models.mdx b/docs/source/es/custom_models.mdx deleted file mode 100644 index 434d59f87dae..000000000000 --- a/docs/source/es/custom_models.mdx +++ /dev/null @@ -1,354 +0,0 @@ - - -# Compartir modelos personalizados - -La biblioteca 🤗 Transformers está diseñada para ser fácilmente ampliable. Cada modelo está completamente codificado -sin abstracción en una subcarpeta determinada del repositorio, por lo que puedes copiar fácilmente un archivo del modelo -y ajustarlo según tus necesidades. - -Si estás escribiendo un modelo completamente nuevo, podría ser más fácil comenzar desde cero. En este tutorial, te mostraremos -cómo escribir un modelo personalizado y su configuración para que pueda usarse dentro de Transformers, y cómo puedes compartirlo -con la comunidad (con el código en el que se basa) para que cualquiera pueda usarlo, incluso si no está presente en la biblioteca -🤗 Transformers. - -Ilustraremos todo esto con un modelo ResNet, envolviendo la clase ResNet de la [biblioteca timm](https://github.com/rwightman/pytorch-image-models) en un [`PreTrainedModel`]. - -## Escribir una configuración personalizada - -Antes de adentrarnos en el modelo, primero escribamos su configuración. La configuración de un modelo es un objeto que -contendrá toda la información necesaria para construir el modelo. Como veremos en la siguiente sección, el modelo solo puede -tomar un `config` para ser inicializado, por lo que realmente necesitamos que ese objeto esté lo más completo posible. - -En nuestro ejemplo, tomaremos un par de argumentos de la clase ResNet que tal vez queramos modificar. Las diferentes -configuraciones nos darán los diferentes tipos de ResNet que son posibles. 
Luego simplemente almacenamos esos argumentos -después de verificar la validez de algunos de ellos. - -```python -from transformers import PretrainedConfig -from typing import List - - -class ResnetConfig(PretrainedConfig): - model_type = "resnet" - - def __init__( - self, - block_type="bottleneck", - layers: List[int] = [3, 4, 6, 3], - num_classes: int = 1000, - input_channels: int = 3, - cardinality: int = 1, - base_width: int = 64, - stem_width: int = 64, - stem_type: str = "", - avg_down: bool = False, - **kwargs, - ): - if block_type not in ["basic", "bottleneck"]: - raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") - if stem_type not in ["", "deep", "deep-tiered"]: - raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") - - self.block_type = block_type - self.layers = layers - self.num_classes = num_classes - self.input_channels = input_channels - self.cardinality = cardinality - self.base_width = base_width - self.stem_width = stem_width - self.stem_type = stem_type - self.avg_down = avg_down - super().__init__(**kwargs) -``` - -Las tres cosas importantes que debes recordar al escribir tu propia configuración son las siguientes: -- tienes que heredar de `PretrainedConfig`, -- el `__init__` de tu `PretrainedConfig` debe aceptar cualquier `kwargs`, -- esos `kwargs` deben pasarse a la superclase `__init__`. - -La herencia es para asegurarte de obtener toda la funcionalidad de la biblioteca 🤗 Transformers, mientras que las otras dos -restricciones provienen del hecho de que una `PretrainedConfig` tiene más campos que los que estás configurando. Al recargar una -`config` con el método `from_pretrained`, esos campos deben ser aceptados por tu `config` y luego enviados a la superclase. - -Definir un `model_type` para tu configuración (en este caso `model_type="resnet"`) no es obligatorio, a menos que quieras -registrar tu modelo con las clases automáticas (ver la última sección). - -Una vez hecho esto, puedes crear y guardar fácilmente tu configuración como lo harías con cualquier otra configuración de un -modelo de la biblioteca. Así es como podemos crear una configuración resnet50d y guardarla: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d_config.save_pretrained("custom-resnet") -``` - -Esto guardará un archivo llamado `config.json` dentro de la carpeta `custom-resnet`. Luego puedes volver a cargar tu configuración -con el método `from_pretrained`: - -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` - -También puedes usar cualquier otro método de la clase [`PretrainedConfig`], como [`~PretrainedConfig.push_to_hub`], para cargar -directamente tu configuración en el Hub. - -## Escribir un modelo personalizado - -Ahora que tenemos nuestra configuración de ResNet, podemos seguir escribiendo el modelo. En realidad escribiremos dos: una que -extrae las características ocultas de un grupo de imágenes (como [`BertModel`]) y una que es adecuada para clasificación de -imagenes (como [`BertForSequenceClassification`]). - -Como mencionamos antes, solo escribiremos un envoltura (_wrapper_) libre del modelo para simplificar este ejemplo. Lo único que debemos -hacer antes de escribir esta clase es un mapeo entre los tipos de bloques y las clases de bloques reales. 
Luego se define el -modelo desde la configuración pasando todo a la clase `ResNet`: - -```py -from transformers import PreTrainedModel -from timm.models.resnet import BasicBlock, Bottleneck, ResNet -from .configuration_resnet import ResnetConfig - - -BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - - -class ResnetModel(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor): - return self.model.forward_features(tensor) -``` - -Para el modelo que clasificará las imágenes, solo cambiamos el método de avance (es decir, el método `forward`): - -```py -import torch - - -class ResnetModelForImageClassification(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor, labels=None): - logits = self.model(tensor) - if labels is not None: - loss = torch.nn.cross_entropy(logits, labels) - return {"loss": loss, "logits": logits} - return {"logits": logits} -``` - -En ambos casos, observa cómo heredamos de `PreTrainedModel` y llamamos a la inicialización de la superclase con `config` -(un poco como cuando escribes `torch.nn.Module`). La línea que establece `config_class` no es obligatoria, a menos -que quieras registrar tu modelo con las clases automáticas (consulta la última sección). - - - -Si tu modelo es muy similar a un modelo dentro de la biblioteca, puedes reutilizar la misma configuración de ese modelo. - - - -Puedes hacer que tu modelo devuelva lo que quieras, pero devolver un diccionario como lo hicimos para -`ResnetModelForImageClassification`, con el `loss` incluido cuando se pasan las etiquetas, hará que tu modelo se pueda -usar directamente dentro de la clase [`Trainer`]. Usar otro formato de salida está bien, siempre y cuando estés planeando usar -tu propio bucle de entrenamiento u otra biblioteca para el entrenamiento. - -Ahora que tenemos nuestra clase, vamos a crear un modelo: - -```py -resnet50d = ResnetModelForImageClassification(resnet50d_config) -``` - -Nuevamente, puedes usar cualquiera de los métodos de [`PreTrainedModel`], como [`~PreTrainedModel.save_pretrained`] o -[`~PreTrainedModel.push_to_hub`]. Usaremos el segundo en la siguiente sección y veremos cómo pasar los pesos del modelo -con el código de nuestro modelo. Pero primero, carguemos algunos pesos previamente entrenados dentro de nuestro modelo. - -En tu caso de uso, probablemente estarás entrenando tu modelo personalizado con tus propios datos. Para ir rápido en este -tutorial, usaremos la versión preentrenada de resnet50d. 
Dado que nuestro modelo es solo un envoltorio alrededor del resnet50d -original, será fácil transferir esos pesos: - -```py -import timm - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Ahora veamos cómo asegurarnos de que cuando hacemos [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], -se guarda el código del modelo. - -## Enviar el código al _Hub_ - - - -Esta _API_ es experimental y puede tener algunos cambios leves en las próximas versiones. - - - -Primero, asegúrate de que tu modelo esté completamente definido en un archivo `.py`. Puedes basarte en importaciones -relativas a otros archivos, siempre que todos los archivos estén en el mismo directorio (aún no admitimos submódulos -para esta característica). Para nuestro ejemplo, definiremos un archivo `modeling_resnet.py` y un archivo -`configuration_resnet.py` en una carpeta del directorio de trabajo actual llamado `resnet_model`. El archivo de configuración -contiene el código de `ResnetConfig` y el archivo del modelo contiene el código de `ResnetModel` y -`ResnetModelForImageClassification`. - -``` -. -└── resnet_model - ├── __init__.py - ├── configuration_resnet.py - └── modeling_resnet.py -``` - -El `__init__.py` puede estar vacío, solo está ahí para que Python detecte que `resnet_model` se puede usar como un módulo. - - - -Si copias archivos del modelo desde la biblioteca, deberás reemplazar todas las importaciones relativas en la parte superior -del archivo para importarlos desde el paquete `transformers`. - - - -Ten en cuenta que puedes reutilizar (o subclasificar) una configuración o modelo existente. - -Para compartir tu modelo con la comunidad, sigue estos pasos: primero importa el modelo y la configuración de ResNet desde -los archivos recién creados: - -```py -from resnet_model.configuration_resnet import ResnetConfig -from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification -``` - -Luego, debes decirle a la biblioteca que deseas copiar el código de esos objetos cuando usas el método `save_pretrained` -y registrarlos correctamente con una determinada clase automática (especialmente para modelos), simplemente ejecuta: - -```py -ResnetConfig.register_for_auto_class() -ResnetModel.register_for_auto_class("AutoModel") -ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") -``` - -Ten en cuenta que no es necesario especificar una clase automática para la configuración (solo hay una clase automática -para ellos, [`AutoConfig`]), pero es diferente para los modelos. Tu modelo personalizado podría ser adecuado para muchas -tareas diferentes, por lo que debes especificar cuál de las clases automáticas es la correcta para tu modelo. - -A continuación, vamos a crear la configuración y los modelos como lo hicimos antes: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d = ResnetModelForImageClassification(resnet50d_config) - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Ahora, para enviar el modelo al Hub, asegúrate de haber iniciado sesión. 
Ejecuta en tu terminal: - -```bash -huggingface-cli login -``` - -o desde un _notebook_: - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -Luego puedes ingresar a tu propio espacio (o una organización de la que seas miembro) de esta manera: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -Además de los pesos del modelo y la configuración en formato json, esto también copió los archivos `.py` del modelo y la -configuración en la carpeta `custom-resnet50d` y subió el resultado al Hub. Puedes verificar el resultado en este -[repositorio de modelos](https://huggingface.co/sgugger/custom-resnet50d). - -Consulta el tutorial sobre cómo [compartir modelos](model_sharing) para obtener más información sobre el método para subir modelos al Hub. - -## Usar un modelo con código personalizado - -Puedes usar cualquier configuración, modelo o _tokenizador_ con archivos de código personalizado en tu repositorio con las -clases automáticas y el método `from_pretrained`. Todos los archivos y códigos cargados en el Hub se analizan en busca de -malware (consulta la documentación de [seguridad del Hub](https://huggingface.co/docs/hub/security#malware-scanning) para -obtener más información), pero aún debes revisar el código del modelo y el autor para evitar la ejecución de código malicioso -en tu computadora. Configura `trust_remote_code=True` para usar un modelo con código personalizado: - -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -También se recomienda encarecidamente pasar un _hash_ de confirmación como una "revisión" para asegurarte de que el autor -de los modelos no actualizó el código con algunas líneas nuevas maliciosas (a menos que confíes plenamente en los autores -de los modelos). - -```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) -``` - -Ten en cuenta que al navegar por el historial de confirmaciones del repositorio del modelo en Hub, hay un botón para copiar -fácilmente el hash de confirmación de cualquier _commit_. - -## Registrar un model con código personalizado a las clases automáticas - -Si estás escribiendo una biblioteca que amplía 🤗 Transformers, es posible que quieras ampliar las clases automáticas para -incluir tu propio modelo. Esto es diferente de enviar el código al Hub en el sentido de que los usuarios necesitarán importar -tu biblioteca para obtener los modelos personalizados (al contrario de descargar automáticamente el código del modelo desde Hub). 
- -Siempre que tu configuración tenga un atributo `model_type` que sea diferente de los tipos de modelos existentes, y que tus -clases modelo tengan los atributos `config_class` correctos, puedes agregarlos a las clases automáticas de la siguiente manera: - -```py -from transformers import AutoConfig, AutoModel, AutoModelForImageClassification - -AutoConfig.register("resnet", ResnetConfig) -AutoModel.register(ResnetConfig, ResnetModel) -AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) -``` - -Ten en cuenta que el primer argumento utilizado al registrar tu configuración personalizada en [`AutoConfig`] debe coincidir -con el `model_type` de tu configuración personalizada, y el primer argumento utilizado al registrar tus modelos personalizados -en cualquier clase del modelo automático debe coincidir con el `config_class ` de esos modelos. diff --git a/docs/source/es/debugging.md b/docs/source/es/debugging.md new file mode 100644 index 000000000000..313566753052 --- /dev/null +++ b/docs/source/es/debugging.md @@ -0,0 +1,335 @@ + + +# Debugging + +## Debug de problemas de Network multi-GPU + +Cuando entrenas o infieres con `DistributedDataParallel` y varias GPUs, si encuentras problemas de intercomunicación entre procesos y/o nodos, puedes usar el siguiente script para diagnosticar problemas de red. + +```bash +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py +``` + +Por ejemplo, para probar cómo interactúan 2 GPUs, haz lo siguiente: + +```bash +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` +Si ambos procesos pueden hablar entre sí y asignar la memoria de la GPU, cada uno imprimirá un status OK. + +Para más GPUs o nodos, ajusta los argumentos en el script. + +Encontrarás muchos más detalles dentro del script de diagnóstico e incluso una receta de cómo ejecutarlo en un entorno SLURM. + +Un nivel adicional de debug es agregar la variable de entorno `NCCL_DEBUG=INFO` de la siguiente manera: + +```bash +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +Esto mostrará mucha información de debug relacionada con NCCL, que luego puedes buscar online si encuentras que reporta algún problema. O si no estás seguro de cómo interpretar el output, puedes compartir el archivo de log en un Issue. + + +## Detección de Underflow y Overflow + + + +Esta función está disponible actualmente sólo para PyTorch. + + + + + +Para el entrenamiento multi-GPU, requiere DDP (`torch.distributed.launch`). + + + + + +Esta función puede utilizarse con cualquier modelo basado en `nn.Module`. + + + +Si empiezas a obtener `loss=NaN` o el modelo muestra algún otro comportamiento anormal debido a `inf` o `nan` en +activations o weights hay que descubrir dónde se produce el primer underflow o overflow y qué lo ha provocado. Por suerte +puedes lograrlo fácilmente activando un módulo especial que hará la detección automáticamente. + +Si estás usando [`Trainer`], solo necesitas añadir: + +```bash +--debug underflow_overflow +``` + +a los argumentos normales de la línea de comandos, o pasar `debug="underflow_overflow"` al crear el objeto [`TrainingArguments`]. 
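+
+A modo de referencia, este es un boceto mínimo de la segunda opción (el directorio de salida es solo un nombre ilustrativo):
+
+```python
+from transformers import TrainingArguments
+
+# Equivalente a pasar --debug underflow_overflow por la línea de comandos.
+# "salida" es únicamente un directorio de ejemplo.
+training_args = TrainingArguments(output_dir="salida", debug="underflow_overflow")
+```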
+ +Si estás usando tu propio bucle de entrenamiento u otro Trainer puedes lograr lo mismo con: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model) +``` + +[`~debug_utils.DebugUnderflowOverflow`] inserta hooks en el modelo que inmediatamente después de cada forward +testeará las variables de input y output y también los weights del módulo correspondiente. Tan pronto como se detecte `inf` o +`nan` se detecta en al menos un elemento de las activations o weights, el programa afirmará e imprimirá un informe +como este (esto fue capturado con `google/mt5-small` bajo fp16 mixed precision): + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout +0.00e+00 2.57e+02 input[0] +0.00e+00 2.85e+02 output +[...] + encoder.block.2.layer.0 T5LayerSelfAttention +6.78e-04 3.15e+03 input[0] +2.65e-04 3.42e+03 output[0] + None output[1] +2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout +0.00e+00 8.76e+03 input[0] +0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +El output del ejemplo se ha recortado en el centro por razones de brevedad. + +La segunda columna muestra el valor del elemento más grande en términos absolutos, por lo que si observas con detenimiento los últimos fotogramas, +los inputs y outputs estaban en el rango de `1e4`. Así que cuando este entrenamiento se hizo con fp16 mixed precision, +el último paso sufrió overflow (ya que bajo `fp16` el mayor número antes de `inf` es `64e3`). Para evitar overflows en +`fp16` las activations deben permanecer muy por debajo de `1e4`, porque `1e4 * 1e4 = 1e8` por lo que cualquier matrix multiplication con +grandes activations va a llevar a una condición de overflow numérico. + +Al principio del output puedes descubrir en qué número de batch se produjo el problema (aquí `Detected inf/nan during batch_number=0` significa que el problema se produjo en el primer batch). + +Cada frame del informe comienza declarando la entrada completamente calificada para el módulo correspondiente que este frame está reportando. +Si nos fijamos sólo en este frame: + +``` + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output +``` + +Aquí, `encoder.block.2.layer.1.layer_norm` indica que era una layer norm para la primera capa, del segundo +block del encoder. Y la call específica del `forward` es `T5LayerNorm`. + +Veamos los últimos frames de ese informe: + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata +[...] 
+ encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +El último frame informa para la función `Dropout.forward` con la primera entrada para el único input y la segunda para el +único output. Puedes ver que fue llamada desde un atributo `dropout` dentro de la clase `DenseReluDense`. Podemos ver +que ocurrió durante la primera capa, del segundo block, durante el primer batch. Por último, el mayor absoluto +elementos de input fue `6.27e+04` y el mismo para el output fue `inf`. + +Puedes ver aquí, que `T5DenseGatedGeluDense.forward` resultó en output activations, cuyo valor máximo absoluto fue +alrededor de 62.7K, que está muy cerca del límite máximo de fp16 de 64K. En el siguiente frame tenemos `Dropout`, el cual renormaliza +los weights, después de poner a cero algunos de los elementos, lo que empuja el valor máximo absoluto a más de 64K, y obtenemos un +overflow (`inf`). + +Como puedes ver son los frames anteriores los que tenemos que mirar cuando los números empiezan a ser muy grandes para números fp16. + +Combinemos el informe con el código de `models/t5/modeling_t5.py`: + +```python +class T5DenseGatedGeluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states +``` + +Ahora es fácil ver la call `dropout`, y también todas las calls anteriores. + +Dado que la detección se produce en un forward hook, estos informes se imprimen inmediatamente después de que cada `forward` +responda. + +Volviendo al informe completo, para actuar sobre él y arreglar el problema, tenemos que subir unos cuantos frames donde los números +empezaron a subir y probablemente cambiar al modo `fp32` aquí, para que los números no sufran overflow cuando se multipliquen +o al sumarlos. Por supuesto, puede haber otras soluciones. 
Por ejemplo, podríamos desactivar `amp` temporalmente si está +activado, después de mover el original `forward` dentro de un helper wrapper, así: + +```python +def _forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +import torch + + +def forward(self, hidden_states): + if torch.is_autocast_enabled(): + with torch.cuda.amp.autocast(enabled=False): + return self._forward(hidden_states) + else: + return self._forward(hidden_states) +``` + +Como el detector automático sólo informa de los inputs y outputs de los frames completos, una vez que sepas dónde buscar, puedes +analizar también las etapas intermedias de una función específica de `forward`. En este caso, puede utilizar la función +función de ayuda `detect_overflow` para inyectar el detector donde quieras, por ejemplo: + +```python +from debug_utils import detect_overflow + + +class T5LayerFF(nn.Module): + [...] + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) +``` + +Puedes ver que hemos añadido 2 de estos y ahora se trackea si `inf` o `nan` para `forwarded_states` fue detectado +en algún punto intermedio. + +De hecho, el detector ya informa de esto porque cada una de las llamadas en el ejemplo anterior es un `nn.Module`, pero +digamos que si tuvieras algunos cálculos directos locales, así es como lo harías. + +Además, si estás instanciando el debugger en tu propio código, puedes ajustar el número de frames impresos de +su valor por defecto, por ejemplo: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) +``` + +### Rastreo de valores mínimos y máximos absolutos de batches específicos + +La misma clase de debugging se puede utilizar para el rastreo por batches con la función de detección de underflow/overflow desactivada. + +Digamos que quieres ver los valores mínimos y máximos absolutos de todos los ingredientes de cada call `forward` de un determinado +batch, y sólo hacerlo para los batches 1 y 3. Entonces instancias esta clase como: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) +``` + +Y ahora los batches 1 y 3 completos serán rastreados usando el mismo formato que el detector de underflow/overflow. + +Los batches son 0-index. + +Esto es muy útil si sabes que el programa empieza a comportarse mal después de un determinado número de batch, para que puedas avanzar rápidamente +hasta esa área. Aquí hay un ejemplo de output recortado para tal configuración: + +``` + *** Starting batch number=1 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.47e+04 input[0] +5.36e-05 7.92e+02 output +[...] 
+ decoder.dropout Dropout +1.60e-07 2.27e+01 input[0] +0.00e+00 2.52e+01 output + decoder T5Stack + not a tensor output + lm_head Linear +1.01e-06 7.92e+02 weight +0.00e+00 1.11e+00 input[0] +6.06e-02 8.39e+01 output + T5ForConditionalGeneration + not a tensor output + + *** Starting batch number=3 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.78e+04 input[0] +5.36e-05 7.92e+02 output +[...] +``` + +Aquí obtendrás un gran número de frames mostrados - tantos como forward calls haya en tu modelo, por lo que puede o no ser lo que quieras, pero a veces puede ser más fácil de usar para debug que un debugger normal. +Por ejemplo, si un problema comienza a ocurrir en el batch 150. Entonces puedes mostrar las trazas de los batches 149 y 150 y comparar dónde +los números empezaron a divergir. + +También puedes especificar el número de batch después del cual se debe detener el entrenamiento, con: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) +``` diff --git a/docs/source/es/debugging.mdx b/docs/source/es/debugging.mdx deleted file mode 100644 index a709e0407b8b..000000000000 --- a/docs/source/es/debugging.mdx +++ /dev/null @@ -1,331 +0,0 @@ - - -# Debugging - -## Debug de problemas de Network multi-GPU - -Cuando entrenas o infieres con `DistributedDataParallel` y varias GPUs, si encuentras problemas de intercomunicación entre procesos y/o nodos, puedes usar el siguiente script para diagnosticar problemas de red. - -```bash -wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py -``` - -Por ejemplo, para probar cómo interactúan 2 GPUs, haz lo siguiente: - -```bash -python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` -Si ambos procesos pueden hablar entre sí y asignar la memoria de la GPU, cada uno imprimirá un status OK. - -Para más GPUs o nodos, ajusta los argumentos en el script. - -Encontrarás muchos más detalles dentro del script de diagnóstico e incluso una receta de cómo ejecutarlo en un entorno SLURM. - -Un nivel adicional de debug es agregar la variable de entorno `NCCL_DEBUG=INFO` de la siguiente manera: - -```bash -NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` - -Esto mostrará mucha información de debug relacionada con NCCL, que luego puedes buscar online si encuentras que reporta algún problema. O si no estás seguro de cómo interpretar el output, puedes compartir el archivo de log en un Issue. - - -## Detección de Underflow y Overflow - - - -Esta función está disponible actualmente sólo para PyTorch. - - - - - -Para el entrenamiento multi-GPU, requiere DDP (`torch.distributed.launch`). - - - - - -Esta función puede utilizarse con cualquier modelo basado en `nn.Module`. - - - -Si empiezas a obtener `loss=NaN` o el modelo muestra algún otro comportamiento anormal debido a `inf` o `nan` en -activations o weights hay que descubrir dónde se produce el primer underflow o overflow y qué lo ha provocado. Por suerte -puedes lograrlo fácilmente activando un módulo especial que hará la detección automáticamente. - -Si estás usando [`Trainer`], solo necesitas añadir: - -```bash ---debug underflow_overflow -``` - -a los argumentos normales de la línea de comandos, o pasar `debug="underflow_overflow"` al crear el objeto [`TrainingArguments`]. 
- -Si estás usando tu propio bucle de entrenamiento u otro Trainer puedes lograr lo mismo con: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model) -``` - -[`~debug_utils.DebugUnderflowOverflow`] inserta hooks en el modelo que inmediatamente después de cada forward -testeará las variables de input y output y también los weights del módulo correspondiente. Tan pronto como se detecte `inf` o -`nan` se detecta en al menos un elemento de las activations o weights, el programa afirmará e imprimirá un informe -como este (esto fue capturado con `google/mt5-small` bajo fp16 mixed precision): - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata - encoder.block.1.layer.1.DenseReluDense.dropout Dropout -0.00e+00 2.57e+02 input[0] -0.00e+00 2.85e+02 output -[...] - encoder.block.2.layer.0 T5LayerSelfAttention -6.78e-04 3.15e+03 input[0] -2.65e-04 3.42e+03 output[0] - None output[1] -2.25e-01 1.00e+04 output[2] - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.dropout Dropout -0.00e+00 8.76e+03 input[0] -0.00e+00 9.74e+03 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -El output del ejemplo se ha recortado en el centro por razones de brevedad. - -La segunda columna muestra el valor del elemento más grande en términos absolutos, por lo que si observas con detenimiento los últimos fotogramas, -los inputs y outputs estaban en el rango de `1e4`. Así que cuando este entrenamiento se hizo con fp16 mixed precision, -el último paso sufrió overflow (ya que bajo `fp16` el mayor número antes de `inf` es `64e3`). Para evitar overflows en -`fp16` las activations deben permanecer muy por debajo de `1e4`, porque `1e4 * 1e4 = 1e8` por lo que cualquier matrix multiplication con -grandes activations va a llevar a una condición de overflow numérico. - -Al principio del output puedes descubrir en qué número de batch se produjo el problema (aquí `Detected inf/nan during batch_number=0` significa que el problema se produjo en el primer batch). - -Cada frame del informe comienza declarando la entrada completamente calificada para el módulo correspondiente que este frame está reportando. -Si nos fijamos sólo en este frame: - -``` - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output -``` - -Aquí, `encoder.block.2.layer.1.layer_norm` indica que era una layer norm para la primera capa, del segundo -block del encoder. Y la call específica del `forward` es `T5LayerNorm`. - -Veamos los últimos frames de ese informe: - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata -[...] 
- encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -El último frame informa para la función `Dropout.forward` con la primera entrada para el único input y la segunda para el -único output. Puedes ver que fue llamada desde un atributo `dropout` dentro de la clase `DenseReluDense`. Podemos ver -que ocurrió durante la primera capa, del segundo block, durante el primer batch. Por último, el mayor absoluto -elementos de input fue `6.27e+04` y el mismo para el output fue `inf`. - -Puedes ver aquí, que `T5DenseGatedGeluDense.forward` resultó en output activations, cuyo valor máximo absoluto fue -alrededor de 62.7K, que está muy cerca del límite máximo de fp16 de 64K. En el siguiente frame tenemos `Dropout`, el cual renormaliza -los weights, después de poner a cero algunos de los elementos, lo que empuja el valor máximo absoluto a más de 64K, y obtenemos un -overflow (`inf`). - -Como puedes ver son los frames anteriores los que tenemos que mirar cuando los números empiezan a ser muy grandes para números fp16. - -Combinemos el informe con el código de `models/t5/modeling_t5.py`: - -```python -class T5DenseGatedGeluDense(nn.Module): - def __init__(self, config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.gelu_act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states -``` - -Ahora es fácil ver la call `dropout`, y también todas las calls anteriores. - -Dado que la detección se produce en un forward hook, estos informes se imprimen inmediatamente después de que cada `forward` -responda. - -Volviendo al informe completo, para actuar sobre él y arreglar el problema, tenemos que subir unos cuantos frames donde los números -empezaron a subir y probablemente cambiar al modo `fp32` aquí, para que los números no sufran overflow cuando se multipliquen -o al sumarlos. Por supuesto, puede haber otras soluciones. 
Por ejemplo, podríamos desactivar `amp` temporalmente si está -activado, después de mover el original `forward` dentro de un helper wrapper, así: - -```python -def _forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - - -import torch - - -def forward(self, hidden_states): - if torch.is_autocast_enabled(): - with torch.cuda.amp.autocast(enabled=False): - return self._forward(hidden_states) - else: - return self._forward(hidden_states) -``` - -Como el detector automático sólo informa de los inputs y outputs de los frames completos, una vez que sepas dónde buscar, puedes -analizar también las etapas intermedias de una función específica de `forward`. En este caso, puede utilizar la función -función de ayuda `detect_overflow` para inyectar el detector donde quieras, por ejemplo: - -```python -from debug_utils import detect_overflow - - -class T5LayerFF(nn.Module): - [...] - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - detect_overflow(forwarded_states, "after layer_norm") - forwarded_states = self.DenseReluDense(forwarded_states) - detect_overflow(forwarded_states, "after DenseReluDense") - return hidden_states + self.dropout(forwarded_states) -``` - -Puedes ver que hemos añadido 2 de estos y ahora se trackea si `inf` o `nan` para `forwarded_states` fue detectado -en algún punto intermedio. - -De hecho, el detector ya informa de esto porque cada una de las llamadas en el ejemplo anterior es un `nn.Module`, pero -digamos que si tuvieras algunos cálculos directos locales, así es como lo harías. - -Además, si estás instanciando el debugger en tu propio código, puedes ajustar el número de frames impresos de -su valor por defecto, por ejemplo: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) -``` - -### Rastreo de valores mínimos y máximos absolutos de batches específicos - -La misma clase de debugging se puede utilizar para el rastreo por batches con la función de detección de underflow/overflow desactivada. - -Digamos que quieres ver los valores mínimos y máximos absolutos de todos los ingredientes de cada call `forward` de un determinado -batch, y sólo hacerlo para los batches 1 y 3. Entonces instancias esta clase como: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) -``` - -Y ahora los batches 1 y 3 completos serán rastreados usando el mismo formato que el detector de underflow/overflow. - -Los batches son 0-index. - -Esto es muy útil si sabes que el programa empieza a comportarse mal después de un determinado número de batch, para que puedas avanzar rápidamente -hasta esa área. Aquí hay un ejemplo de output recortado para tal configuración: - -``` - *** Starting batch number=1 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.47e+04 input[0] -5.36e-05 7.92e+02 output -[...] 
- decoder.dropout Dropout -1.60e-07 2.27e+01 input[0] -0.00e+00 2.52e+01 output - decoder T5Stack - not a tensor output - lm_head Linear -1.01e-06 7.92e+02 weight -0.00e+00 1.11e+00 input[0] -6.06e-02 8.39e+01 output - T5ForConditionalGeneration - not a tensor output - - *** Starting batch number=3 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.78e+04 input[0] -5.36e-05 7.92e+02 output -[...] -``` - -Aquí obtendrás un gran número de frames mostrados - tantos como forward calls haya en tu modelo, por lo que puede o no ser lo que quieras, pero a veces puede ser más fácil de usar para debug que un debugger normal. -Por ejemplo, si un problema comienza a ocurrir en el batch 150. Entonces puedes mostrar las trazas de los batches 149 y 150 y comparar dónde -los números empezaron a divergir. - -También puedes especificar el número de batch después del cual se debe detener el entrenamiento, con: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) -``` diff --git a/docs/source/es/fast_tokenizers.md b/docs/source/es/fast_tokenizers.md new file mode 100644 index 000000000000..92b925f67f7e --- /dev/null +++ b/docs/source/es/fast_tokenizers.md @@ -0,0 +1,74 @@ + + +# Usa los tokenizadores de 🤗 Tokenizers + +[`PreTrainedTokenizerFast`] depende de la biblioteca [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). Los tokenizadores obtenidos desde la biblioteca 🤗 Tokenizers pueden ser +cargados de forma muy sencilla en los 🤗 Transformers. + +Antes de entrar en detalles, comencemos creando un tokenizador dummy en unas cuantas líneas: + +```python +>>> from tokenizers import Tokenizer +>>> from tokenizers.models import BPE +>>> from tokenizers.trainers import BpeTrainer +>>> from tokenizers.pre_tokenizers import Whitespace + +>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) +>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + +>>> tokenizer.pre_tokenizer = Whitespace() +>>> files = [...] +>>> tokenizer.train(files, trainer) +``` + +Ahora tenemos un tokenizador entrenado en los archivos que definimos. Lo podemos seguir utilizando en ese entorno de ejecución (runtime en inglés), o puedes guardarlo +en un archivo JSON para reutilizarlo en un futuro. + +## Cargando directamente desde el objeto tokenizador + +Veamos cómo utilizar este objeto tokenizador en la biblioteca 🤗 Transformers. La clase +[`PreTrainedTokenizerFast`] permite una instanciación fácil, al aceptar el objeto +*tokenizer* instanciado como argumento: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) +``` + +Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores +](main_classes/tokenizer) para más información. 
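+
+Como pequeño ejemplo de uso (asumiendo el `fast_tokenizer` creado arriba; los ids concretos dependerán del vocabulario con el que se haya entrenado el tokenizador):
+
+```python
+>>> # Se puede llamar como cualquier otro tokenizador de 🤗 Transformers
+>>> encoding = fast_tokenizer("Hola, mundo")
+>>> tokens = fast_tokenizer.convert_ids_to_tokens(encoding["input_ids"])
+```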
+ +## Cargando desde un archivo JSON + +Para cargar un tokenizador desde un archivo JSON, comencemos por guardar nuestro tokenizador: + +```python +>>> tokenizer.save("tokenizer.json") +``` + +La localización (path en inglés) donde este archivo es guardado puede ser incluida en el método de inicialización de [`PreTrainedTokenizerFast`] +utilizando el parámetro `tokenizer_file`: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") +``` + +Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores +](main_classes/tokenizer) para más información. diff --git a/docs/source/es/fast_tokenizers.mdx b/docs/source/es/fast_tokenizers.mdx deleted file mode 100644 index 63b43cc1c4c7..000000000000 --- a/docs/source/es/fast_tokenizers.mdx +++ /dev/null @@ -1,70 +0,0 @@ - - -# Usa los tokenizadores de 🤗 Tokenizers - -[`PreTrainedTokenizerFast`] depende de la biblioteca [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). Los tokenizadores obtenidos desde la biblioteca 🤗 Tokenizers pueden ser -cargados de forma muy sencilla en los 🤗 Transformers. - -Antes de entrar en detalles, comencemos creando un tokenizador dummy en unas cuantas líneas: - -```python ->>> from tokenizers import Tokenizer ->>> from tokenizers.models import BPE ->>> from tokenizers.trainers import BpeTrainer ->>> from tokenizers.pre_tokenizers import Whitespace - ->>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) ->>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) - ->>> tokenizer.pre_tokenizer = Whitespace() ->>> files = [...] ->>> tokenizer.train(files, trainer) -``` - -Ahora tenemos un tokenizador entrenado en los archivos que definimos. Lo podemos seguir utilizando en ese entorno de ejecución (runtime en inglés), o puedes guardarlo -en un archivo JSON para reutilizarlo en un futuro. - -## Cargando directamente desde el objeto tokenizador - -Veamos cómo utilizar este objeto tokenizador en la biblioteca 🤗 Transformers. La clase -[`PreTrainedTokenizerFast`] permite una instanciación fácil, al aceptar el objeto -*tokenizer* instanciado como argumento: - -```python ->>> from transformers import PreTrainedTokenizerFast - ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) -``` - -Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores -](main_classes/tokenizer) para más información. - -## Cargando desde un archivo JSON - -Para cargar un tokenizador desde un archivo JSON, comencemos por guardar nuestro tokenizador: - -```python ->>> tokenizer.save("tokenizer.json") -``` - -La localización (path en inglés) donde este archivo es guardado puede ser incluida en el método de inicialización de [`PreTrainedTokenizerFast`] -utilizando el parámetro `tokenizer_file`: - -```python ->>> from transformers import PreTrainedTokenizerFast - ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") -``` - -Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores -](main_classes/tokenizer) para más información. 
diff --git a/docs/source/es/index.md b/docs/source/es/index.md new file mode 100644 index 000000000000..caefdfb7ad7b --- /dev/null +++ b/docs/source/es/index.md @@ -0,0 +1,281 @@ + + +# 🤗 Transformers + +Machine Learning de última generación para PyTorch, TensorFlow y JAX. + +🤗 Transformers proporciona APIs para descargar y entrenar fácilmente modelos preentrenados de última generación. El uso de modelos preentrenados puede reducir tus costos de cómputo, tu huella de carbono y ahorrarte tiempo al entrenar un modelo desde cero. Los modelos se pueden utilizar en diferentes modalidades, tales como: + +* 📝 Texto: clasificación de texto, extracción de información, respuesta a preguntas, resumir, traducción y generación de texto en más de 100 idiomas. +* 🖼️ Imágenes: clasificación de imágenes, detección de objetos y segmentación. +* 🗣️ Audio: reconocimiento de voz y clasificación de audio. +* 🐙 Multimodal: respuesta a preguntas en tablas, reconocimiento óptico de caracteres, extracción de información de documentos escaneados, clasificación de videos y respuesta visual a preguntas. + +Nuestra biblioteca admite una integración perfecta entre tres de las bibliotecas de deep learning más populares: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) y [JAX](https://jax.readthedocs.io/en/latest/). Entrena tu modelo con tres líneas de código en un framework y cárgalo para inferencia con otro. +Cada arquitectura de 🤗 Transformers se define en un módulo de Python independiente para que se puedan personalizar fácilmente para investigación y experimentos. + +## Si estás buscando soporte personalizado del equipo de Hugging Face + + +HuggingFace Expert Acceleration Program + + +## Contenidos + +La documentación está organizada en cuatro partes: + +- **EMPEZAR** contiene un recorrido rápido e instrucciones de instalación para comenzar a usar 🤗 Transformers. +- **TUTORIALES** es un excelente lugar para comenzar. Esta sección te ayudará a obtener las habilidades básicas que necesitas para comenzar a usar 🤗 Transformers. +- **GUÍAS PRÁCTICAS** te mostrará cómo lograr un objetivo específico, cómo hacer fine-tuning a un modelo preentrenado para el modelado de lenguaje o cómo crear un cabezal para un modelo personalizado. +- **GUÍAS CONCEPTUALES** proporciona más discusión y explicación de los conceptos e ideas subyacentes detrás de los modelos, las tareas y la filosofía de diseño de 🤗 Transformers. + +La biblioteca actualmente contiene implementaciones de JAX, PyTorch y TensorFlow, pesos de modelos preentrenados, scripts de uso y utilidades de conversión para los siguientes modelos. + +### Modelos compatibles + + + +1. **[ALBERT](model_doc/albert)** (de Google Research y el Instituto Tecnológico de Toyota en Chicago) publicado con el paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), por Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (de Google Research) publicado con el paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) por Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. 
**[BART](model_doc/bart)** (de Facebook) publicado con el paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) por Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov y Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (de École polytechnique) publicado con el paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) por Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (de VinAI Research) publicado con el paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) por Nguyen Luong Tran, Duong Minh Le y Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (de Microsoft) publicado con el paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) por Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (de Google) publicado con el paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) por Jacob Devlin, Ming-Wei Chang, Kenton Lee y Kristina Toutanova. +1. **[BERTweet](model_doc/bertweet)** (de VinAI Research) publicado con el paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) por Dat Quoc Nguyen, Thanh Vu y Anh Tuan Nguyen. +1. **[BERT For Sequence Generation](model_doc/bert-generation)** (de Google) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[Blenderbot](model_doc/blenderbot)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](model_doc/blenderbot-small)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BORT](model_doc/bort)** (de Alexa) publicado con el paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) por Adrian de Wynter y Daniel J. Perry. +1. 
**[ByT5](model_doc/byt5)** (de Google Research) publicado con el paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) por Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (de Inria/Facebook/Sorbonne) publicado con el paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) por Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah y Benoît Sagot. +1. **[CANINE](model_doc/canine)** (de Google Research) publicado con el paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) por Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[ConvNeXT](model_doc/convnext)** (de Facebook AI) publicado con el paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) por Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (de Facebook AI) publicado con el paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) por Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CLIP](model_doc/clip)** (de OpenAI) publicado con el paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) por Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[ConvBERT](model_doc/convbert)** (de YituTech) publicado con el paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) por Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[CPM](model_doc/cpm)** (de Universidad de Tsinghua) publicado con el paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) por Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (de Salesforce) publicado con el paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) por Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong y Richard Socher. +1. **[Data2Vec](model_doc/data2vec)** (de Facebook) publicado con el paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) por Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. 
**[Decision Transformer](model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publicado con el paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) por Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[DiT](model_doc/dit)** (de Microsoft Research) publicado con el paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) por Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DeiT](model_doc/deit)** (de Facebook) publicado con el paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) por Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETR](model_doc/detr)** (de Facebook) publicado con el paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) por Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (de Microsoft Research) publicado con el paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) por Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (de HuggingFace), publicado junto con el paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) por Victor Sanh, Lysandre Debut y Thomas Wolf. Se ha aplicado el mismo método para comprimir GPT2 en [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa en [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), BERT multilingüe en [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) y una versión alemana de DistilBERT. +1. **[DPR](model_doc/dpr)** (de Facebook) publicado con el paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) por Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, y Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (de Intel Labs) publicado con el paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) por René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientNet](model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (de Google Research) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ELECTRA](model_doc/electra)** (de Google Research/Universidad de Stanford) publicado con el paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) por Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. 
**[FlauBERT](model_doc/flaubert)** (de CNRS) publicado con el paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) por Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FNet](model_doc/fnet)** (de Google Research) publicado con el paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) por James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (de CMU/Google Brain) publicado con el paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) por Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (de KAIST) publicado con el paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) por Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (de OpenAI) publicado con el paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) por Alec Radford, Karthik Narasimhan, Tim Salimans y Ilya Sutskever. +1. **[GPT-2](model_doc/gpt2)** (de OpenAI) publicado con el paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) por Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** y Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (de EleutherAI) publicado con el repositorio [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) por Ben Wang y Aran Komatsuzaki. +1. **[GPT Neo](model_doc/gpt_neo)** (de EleutherAI) publicado en el paper [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) por Sid Black, Stella Biderman, Leo Gao, Phil Wang y Connor Leahy. +1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released with [GPTSAN](https://github.com/tanreinama/GPTSAN) by Toshiyuki Sakamoto (tanreinama). +1. **[Hubert](model_doc/hubert)** (de Facebook) publicado con el paper [HuBERT: Self-Supervised Speech Representation Learning por Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) por Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (de Berkeley) publicado con el paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) por Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (de OpenAI) publicado con el paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) por Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (de Microsoft Research Asia) publicado con el paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) por Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (de Microsoft Research Asia) publicado con el paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) por Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. 
+1. **[LayoutXLM](model_doc/layoutxlm)** (de Microsoft Research Asia) publicado con el paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) por Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[Longformer](model_doc/longformer)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LUKE](model_doc/luke)** (de Studio Ousia) publicado con el paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) por Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[mLUKE](model_doc/mluke)** (de Studio Ousia) publicado con el paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) por Ryokan Ri, Ikuya Yamada, y Yoshimasa Tsuruoka. +1. **[LXMERT](model_doc/lxmert)** (de UNC Chapel Hill) publicado con el paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) por Hao Tan y Mohit Bansal. +1. **[M2M100](model_doc/m2m_100)** (de Facebook) publicado con el paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) por Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Modelos de traducción automática entrenados usando [OPUS](http://opus.nlpl.eu/) data por Jörg Tiedemann. El [Marian Framework](https://marian-nmt.github.io/) está siendo desarrollado por el equipo de traductores de Microsoft. +1. **[Mask2Former](model_doc/mask2former)** (de FAIR y UIUC) publicado con el paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) por Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (de Meta y UIUC) publicado con el paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) por Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[MBart](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) por Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[MBart-50](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) por Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. 
**[Megatron-BERT](model_doc/megatron-bert)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. +1. **[MPNet](model_doc/mpnet)** (de Microsoft Research) publicado con el paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) por Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (de Google AI) publicado con el paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) por Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[Nyströmformer](model_doc/nystromformer)** (de la Universidad de Wisconsin - Madison) publicado con el paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) por Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (de la SHI Labs) publicado con el paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) por Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[Pegasus](model_doc/pegasus)** (de Google) publicado con el paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) por Jingqing Zhang, Yao Zhao, Mohammad Saleh y Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (de Deepmind) publicado con el paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) por Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (de VinAI Research) publicado con el paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) por Dat Quoc Nguyen y Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (de UCLA NLP) publicado con el paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) por Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (de Sea AI Labs) publicado con el paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) por Yu, Weihao y Luo, Mi y Zhou, Pan y Si, Chenyang y Zhou, Yichen y Wang, Xinchao y Feng, Jiashi y Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. +1. 
**[QDQBert](model_doc/qdqbert)** (de NVIDIA) publicado con el paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) por Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev y Paulius Micikevicius. +1. **[REALM](model_doc/realm.html)** (de Google Research) publicado con el paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) por Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat y Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (de Google Research) publicado con el paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) por Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RemBERT](model_doc/rembert)** (de Google Research) publicado con el paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) por Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[RegNet](model_doc/regnet)** (de META Platforms) publicado con el paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) por Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[ResNet](model_doc/resnet)** (de Microsoft Research) publicado con el paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) por Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (de Facebook), publicado junto con el paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) por Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (de ZhuiyiTechnology), publicado junto con el paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) por Jianlin Su y Yu Lu y Shengfeng Pan y Bo Wen y Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (de NVIDIA) publicado con el paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) por Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (de Facebook), publicado junto con el paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) por Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (de Facebook), publicado junto con el paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) por Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. 
**[Splinter](model_doc/splinter)** (de la Universidad de Tel Aviv), publicado junto con el paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) por Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBert](model_doc/squeezebert)** (de Berkeley) publicado con el paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) por Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, y Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (de Microsoft) publicado con el paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) por Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[T5](model_doc/t5)** (de Google AI) publicado con el paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (de Google AI) publicado en el repositorio [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (de Google AI) publicado con el paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) por Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno y Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (de Microsoft Research) publicado con el paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) por Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Transformer-XL](model_doc/transfo-xl)** (de Google/CMU) publicado con el paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) por Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](model_doc/trocr)** (de Microsoft), publicado junto con el paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) por Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UniSpeech](model_doc/unispeech)** (de Microsoft Research) publicado con el paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) por Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (de Microsoft Research) publicado con el paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) por Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[VAN](model_doc/van)** (de la Universidad de Tsinghua y la Universidad de Nankai) publicado con el paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) por Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. 
**[ViLT](model_doc/vilt)** (de NAVER AI Lab/Kakao Enterprise/Kakao Brain) publicado con el paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) por Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (de Google AI) publicado con el paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) por Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTMAE](model_doc/vit_mae)** (de Meta AI) publicado con el paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) por Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[VisualBERT](model_doc/visual_bert)** (de UCLA NLP) publicado con el paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) por Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[WavLM](model_doc/wavlm)** (de Microsoft Research) publicado con el paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) por Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Wav2Vec2](model_doc/wav2vec2)** (de Facebook AI) publicado con el paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) por Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (de Facebook AI) publicado con el paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) por Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[XGLM](model_doc/xglm)** (de Facebook AI) publicado con el paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) por Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](model_doc/xlm)** (de Facebook) publicado junto con el paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) por Guillaume Lample y Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (de Facebook AI), publicado junto con el paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) por Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer y Veselin Stoyanov. +1. 
**[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (de Facebook AI), publicado junto con el paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) por Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLNet](model_doc/xlnet)** (de Google/CMU) publicado con el paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) por Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (de Facebook AI) publicado con el paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) por Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[XLS-R](model_doc/xls_r)** (de Facebook AI) publicado con el paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) por Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[YOSO](model_doc/yoso)** (de la Universidad de Wisconsin-Madison) publicado con el paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) por Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Frameworks compatibles + +La siguiente tabla representa el soporte actual en la biblioteca para cada uno de esos modelos: indica si tienen un tokenizador de Python (llamado "slow"), si cuentan con un tokenizador "fast" respaldado por la biblioteca 🤗 Tokenizers, y si tienen soporte en Jax (a través de +Flax), PyTorch y/o TensorFlow.
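A modo de ilustración, el siguiente fragmento es un esbozo mínimo (asume que PyTorch y TensorFlow están instalados y usa el checkpoint `bert-base-uncased` solo como ejemplo) de cómo pedir explícitamente el tokenizador "fast" o "slow" de un modelo y cargar sus pesos en el framework deseado; la tabla de soporte aparece justo después:

```python
from transformers import AutoTokenizer, AutoModel, TFAutoModel

# use_fast=True devuelve el tokenizador "fast" (respaldado por 🤗 Tokenizers) cuando existe;
# use_fast=False fuerza el tokenizador "slow" implementado en Python.
tokenizer_fast = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
tokenizer_slow = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

# AutoModel carga los pesos en PyTorch y TFAutoModel en TensorFlow,
# siempre que la arquitectura tenga soporte para ese framework (ver la tabla).
modelo_pt = AutoModel.from_pretrained("bert-base-uncased")
modelo_tf = TFAutoModel.from_pretrained("bert-base-uncased")

print(type(tokenizer_fast).__name__, type(tokenizer_slow).__name__)
```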
+ + + +| Modelo | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ❌ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| 
XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/es/index.mdx b/docs/source/es/index.mdx deleted file mode 100644 index 5091a52c8231..000000000000 --- a/docs/source/es/index.mdx +++ /dev/null @@ -1,271 +0,0 @@ - - -# 🤗 Transformers - -Machine Learning de última generación para PyTorch, TensorFlow y JAX. - -🤗 Transformers proporciona APIs para descargar y entrenar fácilmente modelos preentrenados de última generación. El uso de modelos preentrenados puede reducir tus costos de cómputo, tu huella de carbono y ahorrarte tiempo al entrenar un modelo desde cero. Los modelos se pueden utilizar en diferentes modalidades, tales como: - -* 📝 Texto: clasificación de texto, extracción de información, respuesta a preguntas, resumir, traducción y generación de texto en más de 100 idiomas. -* 🖼️ Imágenes: clasificación de imágenes, detección de objetos y segmentación. -* 🗣️ Audio: reconocimiento de voz y clasificación de audio. -* 🐙 Multimodal: respuesta a preguntas en tablas, reconocimiento óptico de caracteres, extracción de información de documentos escaneados, clasificación de videos y respuesta visual a preguntas. - -Nuestra biblioteca admite una integración perfecta entre tres de las bibliotecas de deep learning más populares: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) y [JAX](https://jax.readthedocs.io/en/latest/). Entrena tu modelo con tres líneas de código en un framework y cárgalo para inferencia con otro. -Cada arquitectura de 🤗 Transformers se define en un módulo de Python independiente para que se puedan personalizar fácilmente para investigación y experimentos. - -## Si estás buscando soporte personalizado del equipo de Hugging Face - - -HuggingFace Expert Acceleration Program - - -## Contenidos - -La documentación está organizada en cuatro partes: - -- **EMPEZAR** contiene un recorrido rápido e instrucciones de instalación para comenzar a usar 🤗 Transformers. -- **TUTORIALES** es un excelente lugar para comenzar. Esta sección te ayudará a obtener las habilidades básicas que necesitas para comenzar a usar 🤗 Transformers. -- **GUÍAS PRÁCTICAS** te mostrará cómo lograr un objetivo específico, cómo hacer fine-tuning a un modelo preentrenado para el modelado de lenguaje o cómo crear un cabezal para un modelo personalizado. -- **GUÍAS CONCEPTUALES** proporciona más discusión y explicación de los conceptos e ideas subyacentes detrás de los modelos, las tareas y la filosofía de diseño de 🤗 Transformers. - -La biblioteca actualmente contiene implementaciones de JAX, PyTorch y TensorFlow, pesos de modelos preentrenados, scripts de uso y utilidades de conversión para los siguientes modelos. - -### Modelos compatibles - - - -1. **[ALBERT](model_doc/albert)** (de Google Research y el Instituto Tecnológico de Toyota en Chicago) publicado con el paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), por Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](model_doc/bart)** (de Facebook) publicado con el paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) por Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov y Luke Zettlemoyer. -1. 
**[BARThez](model_doc/barthez)** (de École polytechnique) publicado con el paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) por Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BARTpho](model_doc/bartpho)** (de VinAI Research) publicado con el paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) por Nguyen Luong Tran, Duong Minh Le y Dat Quoc Nguyen. -1. **[BEiT](model_doc/beit)** (de Microsoft) publicado con el paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) por Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](model_doc/bert)** (de Google) publicado con el paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) por Jacob Devlin, Ming-Wei Chang, Kenton Lee y Kristina Toutanova. -1. **[BERTweet](model_doc/bertweet)** (de VinAI Research) publicado con el paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) por Dat Quoc Nguyen, Thanh Vu y Anh Tuan Nguyen. -1. **[BERT For Sequence Generation](model_doc/bert-generation)** (de Google) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BigBird-RoBERTa](model_doc/big_bird)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](model_doc/blenderbot)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BlenderbotSmall](model_doc/blenderbot-small)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BORT](model_doc/bort)** (de Alexa) publicado con el paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) por Adrian de Wynter y Daniel J. Perry. -1. **[ByT5](model_doc/byt5)** (de Google Research) publicado con el paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) por Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](model_doc/camembert)** (de Inria/Facebook/Sorbonne) publicado con el paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) por Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah y Benoît Sagot. -1. 
**[CANINE](model_doc/canine)** (de Google Research) publicado con el paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) por Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[ConvNeXT](model_doc/convnext)** (de Facebook AI) publicado con el paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) por Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. -1. **[CLIP](model_doc/clip)** (de OpenAI) publicado con el paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) por Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[ConvBERT](model_doc/convbert)** (de YituTech) publicado con el paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) por Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. **[CPM](model_doc/cpm)** (de Universidad de Tsinghua) publicado con el paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) por Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](model_doc/ctrl)** (de Salesforce) publicado con el paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) por Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong y Richard Socher. -1. **[Data2Vec](model_doc/data2vec)** (de Facebook) publicado con el paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) por Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[DeBERTa](model_doc/deberta)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](model_doc/deberta-v2)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[Decision Transformer](model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publicado con el paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) por Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. -1. **[DiT](model_doc/dit)** (de Microsoft Research) publicado con el paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) por Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. -1. **[DeiT](model_doc/deit)** (de Facebook) publicado con el paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) por Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. 
**[DETR](model_doc/detr)** (de Facebook) publicado con el paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) por Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. **[DialoGPT](model_doc/dialogpt)** (de Microsoft Research) publicado con el paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) por Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (de HuggingFace), publicado junto con el paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) por Victor Sanh, Lysandre Debut y Thomas Wolf. Se ha aplicado el mismo método para comprimir GPT2 en [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa en [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), BERT multilingüe en [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) y una versión alemana de DistilBERT. -1. **[DPR](model_doc/dpr)** (de Facebook) publicado con el paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) por Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, y Wen-tau Yih. -1. **[DPT](master/model_doc/dpt)** (de Intel Labs) publicado con el paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) por René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[EncoderDecoder](model_doc/encoder-decoder)** (de Google Research) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[ELECTRA](model_doc/electra)** (de Google Research/Universidad de Stanford) publicado con el paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) por Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[FlauBERT](model_doc/flaubert)** (de CNRS) publicado con el paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) por Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FNet](model_doc/fnet)** (de Google Research) publicado con el paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) por James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](model_doc/funnel)** (de CMU/Google Brain) publicado con el paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) por Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GLPN](model_doc/glpn)** (de KAIST) publicado con el paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) por Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. 
**[GPT](model_doc/openai-gpt)** (de OpenAI) publicado con el paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) por Alec Radford, Karthik Narasimhan, Tim Salimans y Ilya Sutskever. -1. **[GPT-2](model_doc/gpt2)** (de OpenAI) publicado con el paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) por Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** y Ilya Sutskever**. -1. **[GPT-J](model_doc/gptj)** (de EleutherAI) publicado con el repositorio [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) por Ben Wang y Aran Komatsuzaki. -1. **[GPT Neo](model_doc/gpt_neo)** (de EleutherAI) publicado en el paper [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) por Sid Black, Stella Biderman, Leo Gao, Phil Wang y Connor Leahy. -1. **[Hubert](model_doc/hubert)** (de Facebook) publicado con el paper [HuBERT: Self-Supervised Speech Representation Learning por Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) por Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](model_doc/ibert)** (de Berkeley) publicado con el paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) por Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. **[ImageGPT](model_doc/imagegpt)** (de OpenAI) publicado con el paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) por Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. -1. **[LayoutLM](model_doc/layoutlm)** (de Microsoft Research Asia) publicado con el paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) por Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](model_doc/layoutlmv2)** (de Microsoft Research Asia) publicado con el paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) por Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. **[LayoutXLM](model_doc/layoutxlm)** (de Microsoft Research Asia) publicado con el paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) por Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](model_doc/led)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[Longformer](model_doc/longformer)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LUKE](model_doc/luke)** (de Studio Ousia) publicado con el paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) por Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[mLUKE](model_doc/mluke)** (de Studio Ousia) publicado con el paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) por Ryokan Ri, Ikuya Yamada, y Yoshimasa Tsuruoka. -1. 
**[LXMERT](model_doc/lxmert)** (de UNC Chapel Hill) publicado con el paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) por Hao Tan y Mohit Bansal. -1. **[M2M100](model_doc/m2m_100)** (de Facebook) publicado con el paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) por Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](model_doc/marian)** Modelos de traducción automática entrenados usando [OPUS](http://opus.nlpl.eu/) data por Jörg Tiedemann. El [Marian Framework](https://marian-nmt.github.io/) está siendo desarrollado por el equipo de traductores de Microsoft. -1. **[MaskFormer](model_doc/maskformer)** (de Meta y UIUC) publicado con el paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) por Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. -1. **[MBart](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) por Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. **[MBart-50](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) por Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](model_doc/megatron-bert)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. -1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro. -1. **[MPNet](model_doc/mpnet)** (de Microsoft Research) publicado con el paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) por Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](model_doc/mt5)** (de Google AI) publicado con el paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) por Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[Nyströmformer](model_doc/nystromformer)** (de la Universidad de Wisconsin - Madison) publicado con el paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) por Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[Pegasus](model_doc/pegasus)** (de Google) publicado con el paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) por Jingqing Zhang, Yao Zhao, Mohammad Saleh y Peter J. Liu. -1. 
**[Perceiver IO](model_doc/perceiver)** (de Deepmind) publicado con el paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) por Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. -1. **[PhoBERT](model_doc/phobert)** (de VinAI Research) publicado con el paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) por Dat Quoc Nguyen y Anh Tuan Nguyen. -1. **[PLBart](model_doc/plbart)** (de UCLA NLP) publicado con el paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) por Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -1. **[PoolFormer](model_doc/poolformer)** (de Sea AI Labs) publicado con el paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) por Yu, Weihao y Luo, Mi y Zhou, Pan y Si, Chenyang y Zhou, Yichen y Wang, Xinchao y Feng, Jiashi y Yan, Shuicheng. -1. **[ProphetNet](model_doc/prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. -1. **[QDQBert](model_doc/qdqbert)** (de NVIDIA) publicado con el paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) por Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev y Paulius Micikevicius. -1. **[REALM](model_doc/realm.html)** (de Google Research) publicado con el paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) por Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat y Ming-Wei Chang. -1. **[Reformer](model_doc/reformer)** (de Google Research) publicado con el paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) por Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RemBERT](model_doc/rembert)** (de Google Research) publicado con el paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) por Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[RegNet](model_doc/regnet)** (de META Platforms) publicado con el paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) por Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. -1. **[ResNet](model_doc/resnet)** (de Microsoft Research) publicado con el paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) por Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. -1. **[RoBERTa](model_doc/roberta)** (de Facebook), publicado junto con el paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) por Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](model_doc/roformer)** (de ZhuiyiTechnology), publicado junto con el paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) por Jianlin Su y Yu Lu y Shengfeng Pan y Bo Wen y Yunfeng Liu. -1. 
**[SegFormer](model_doc/segformer)** (de NVIDIA) publicado con el paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) por Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. **[SEW](model_doc/sew)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SEW-D](model_doc/sew_d)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (de Facebook), publicado junto con el paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) por Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (de Facebook), publicado junto con el paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) por Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](model_doc/splinter)** (de Universidad de Tel Aviv), publicado junto con el paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) pory Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBert](model_doc/squeezebert)** (de Berkeley) publicado con el paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) por Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, y Kurt W. Keutzer. -1. **[Swin Transformer](model_doc/swin)** (de Microsoft) publicado con el paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) por Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. -1. **[T5](model_doc/t5)** (de Google AI) publicado con el paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. -1. **[T5v1.1](model_doc/t5v1.1)** (de Google AI) publicado en el repositorio [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu. -1. **[TAPAS](model_doc/tapas)** (de Google AI) publicado con el paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) por Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno y Julian Martin Eisenschlos. -1. **[TAPEX](model_doc/tapex)** (de Microsoft Research) publicado con el paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) por Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. 
**[Transformer-XL](model_doc/transfo-xl)** (de Google/CMU) publicado con el paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) por Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[TrOCR](model_doc/trocr)** (de Microsoft), publicado junto con el paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) por Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -1. **[UniSpeech](model_doc/unispeech)** (de Microsoft Research) publicado con el paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) por Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. -1. **[UniSpeechSat](model_doc/unispeech-sat)** (de Microsoft Research) publicado con el paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) por Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. **[VAN](model_doc/van)** (de la Universidad de Tsinghua y la Universidad de Nankai) publicado con el paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) por Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[ViLT](model_doc/vilt)** (de NAVER AI Lab/Kakao Enterprise/Kakao Brain) publicado con el paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) por Wonjae Kim, Bokyung Son, Ildoo Kim. -1. **[Vision Transformer (ViT)](model_doc/vit)** (de Google AI) publicado con el paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) por Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViTMAE](model_doc/vit_mae)** (de Meta AI) publicado con el paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) por Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. -1. **[VisualBERT](model_doc/visual_bert)** (de UCLA NLP) publicado con el paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) por Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[WavLM](model_doc/wavlm)** (de Microsoft Research) publicado con el paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) por Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[Wav2Vec2](model_doc/wav2vec2)** (de Facebook AI) publicado con el paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) por Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. 
**[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (de Facebook AI) publicado con el paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) por Qiantong Xu, Alexei Baevski, Michael Auli. -1. **[XGLM](model_doc/xglm)** (de Facebook AI) publicado con el paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) por Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. -1. **[XLM](model_doc/xlm)** (de Facebook) publicado junto con el paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) por Guillaume Lample y Alexis Conneau. -1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou. -1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (de Facebook AI), publicado junto con el paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) por Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer y Veselin Stoyanov. -1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (de Facebook AI), publicado junto con el paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) por Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. -1. **[XLNet](model_doc/xlnet)** (de Google/CMU) publicado con el paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) por Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (de Facebook AI) publicado con el paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) por Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -1. **[XLS-R](model_doc/xls_r)** (de Facebook AI) publicado con el paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) por Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[YOSO](model_doc/yoso)** (de la Universidad de Wisconsin-Madison) publicado con el paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) por Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. - - -### Frameworks compatibles - -La siguiente tabla representa el soporte actual en la biblioteca para cada uno de esos modelos, ya sea que tengan un tokenizador de Python (llamado "slow"). Un tokenizador "fast" respaldado por la biblioteca 🤗 Tokenizers, ya sea que tengan soporte en Jax (a través de -Flax), PyTorch y/o TensorFlow. 
- - - -| Modelo | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | -| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | -| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | -| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ResNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin | ❌ | ❌ | ✅ | ❌ | ❌ | -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | -| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ | -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | -| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | -| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| 
XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | -| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - - diff --git a/docs/source/es/installation.md b/docs/source/es/installation.md new file mode 100644 index 000000000000..0eb2dcb03a44 --- /dev/null +++ b/docs/source/es/installation.md @@ -0,0 +1,242 @@ + + +# Instalación + +En esta guía puedes encontrar información para instalar 🤗 Transformers para cualquier biblioteca de Machine Learning con la que estés trabajando. Además, encontrarás información sobre cómo establecer el caché y cómo configurar 🤗 Transformers para correrlo de manera offline (opcional). + +🤗 Transformers ha sido probada en Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, y Flax. Para instalar la biblioteca de deep learning con la que desees trabajar, sigue las instrucciones correspondientes listadas a continuación: + +* [PyTorch](https://pytorch.org/get-started/locally/) +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) +* [Flax](https://flax.readthedocs.io/en/latest/) + +## Instalación con pip + +Es necesario instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si necesitas más información sobre entornos virtuales de Python, consulta esta [guía](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ +). Un entorno virtual facilita el manejo de proyectos y evita problemas de compatibilidad entre dependencias. + +Comienza por crear un entorno virtual en el directorio de tu proyecto: + +```bash +python -m venv .env +``` + +Activa el entorno virtual: + +```bash +source .env/bin/activate +``` + +Ahora puedes instalar 🤗 Transformers con el siguiente comando: + +```bash +pip install transformers +``` + +Solo para CPU, puedes instalar 🤗 Transformers y una biblioteca de deep learning con un comando de una sola línea. + +Por ejemplo, instala 🤗 Transformers y Pytorch: + +```bash +pip install transformers[torch] +``` + +🤗 Transformers y TensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 Transformers y Flax: + +```bash +pip install transformers[flax] +``` + +Por último, revisa si 🤗 Transformers ha sido instalada exitosamente con el siguiente comando que descarga un modelo pre-entrenado: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` +Después imprime la etiqueta y el puntaje: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## Instalación desde la fuente + +Instala 🤗 Transformers desde la fuente con el siguiente comando: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +El comando de arriba instala la versión `master` más actual en vez de la última versión estable. La versión `master` es útil para obtener los últimos avances de 🤗 Transformers. Por ejemplo, se puede dar el caso de que un error fue corregido después de la última versión estable pero aún no se ha liberado un nuevo lanzamiento. Sin embargo, existe la posibilidad de que la versión `master` no sea estable. El equipo trata de mantener la versión `master` operacional y la mayoría de los errores son resueltos en unas cuantas horas o un día. Si encuentras algún problema, por favor abre un [Issue](https://github.com/huggingface/transformers/issues) para que pueda ser corregido más rápido. 
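+
+Para confirmar qué versión quedó instalada puedes consultarla desde Python (ejemplo ilustrativo que no forma parte de la guía original); en una instalación desde la fuente el número de versión suele terminar en `dev0`:
+
+```py
+>>> import transformers
+
+>>> print(transformers.__version__)  # p. ej. '4.27.0.dev0' en una instalación desde la fuente; el valor exacto variará
+```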
+ +Verifica si 🤗 Transformers está instalada apropiadamente con el siguiente comando: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## Instalación editable + +Necesitarás una instalación editable si deseas: +* Usar la versión `master` del código fuente. +* Contribuir a 🤗 Transformers y necesitas probar cambios en el código. + +Clona el repositorio e instala 🤗 Transformers con los siguientes comandos: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +Éstos comandos van a ligar el directorio desde donde clonamos el repositorio al path de las bibliotecas de Python. Python ahora buscará dentro de la carpeta que clonaste además de los paths normales de la biblioteca. Por ejemplo, si los paquetes de Python se encuentran instalados en `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python también buscará en el directorio desde donde clonamos el repositorio `~/transformers/`. + + + +Debes mantener el directorio `transformers` si deseas seguir usando la biblioteca. + + + +Puedes actualizar tu copia local a la última versión de 🤗 Transformers con el siguiente comando: + +```bash +cd ~/transformers/ +git pull +``` + +El entorno de Python que creaste para la instalación de 🤗 Transformers encontrará la versión `master` en la siguiente ejecución. + +## Instalación con conda + +Puedes instalar 🤗 Transformers desde el canal de conda `huggingface` con el siguiente comando: + +```bash +conda install -c huggingface transformers +``` + +## Configuración de Caché + +Los modelos preentrenados se descargan y almacenan en caché localmente en: `~/.cache/huggingface/transformers/`. Este es el directorio predeterminado proporcionado por la variable de entorno de shell `TRANSFORMERS_CACHE`. En Windows, el directorio predeterminado es dado por `C:\Users\username\.cache\huggingface\transformers`. Puedes cambiar las variables de entorno de shell que se muestran a continuación, en orden de prioridad, para especificar un directorio de caché diferente: + +1. Variable de entorno del shell (por defecto): `TRANSFORMERS_CACHE`. +2. Variable de entorno del shell:`HF_HOME` + `transformers/`. +3. Variable de entorno del shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. + + + +🤗 Transformers usará las variables de entorno de shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` si viene de una iteración anterior de la biblioteca y ha configurado esas variables de entorno, a menos que especifiques la variable de entorno de shell `TRANSFORMERS_CACHE`. + + + + +## Modo Offline + +🤗 Transformers puede ejecutarse en un entorno con firewall o fuera de línea (offline) usando solo archivos locales. Configura la variable de entorno `TRANSFORMERS_OFFLINE=1` para habilitar este comportamiento. + + + +Puedes añadir [🤗 Datasets](https://huggingface.co/docs/datasets/) al flujo de entrenamiento offline declarando la variable de entorno `HF_DATASETS_OFFLINE=1`. + + + +Por ejemplo, normalmente ejecutarías un programa en una red normal con firewall para instancias externas con el siguiente comando: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... 
+``` + +Ejecuta este mismo programa en una instancia offline con el siguiente comando: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +El script ahora debería ejecutarse sin bloquearse ni esperar a que se agote el tiempo de espera porque sabe que solo debe buscar archivos locales. + +### Obtener modelos y tokenizers para uso offline + +Otra opción para usar 🤗 Transformers offline es descargando previamente los archivos y después apuntar al path local donde se encuentren. Hay tres maneras de hacer esto: + +* Descarga un archivo mediante la interfaz de usuario del [Model Hub](https://huggingface.co/models) haciendo click en el ícono ↓. + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + + +* Utiliza el flujo de [`PreTrainedModel.from_pretrained`] y [`PreTrainedModel.save_pretrained`]: + 1. Descarga previamente los archivos con [`PreTrainedModel.from_pretrained`]: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + + 2. Guarda los archivos en un directorio específico con [`PreTrainedModel.save_pretrained`]: + + ```py + >>> tokenizer.save_pretrained("./your/path/bigscience_t0") + >>> model.save_pretrained("./your/path/bigscience_t0") + ``` + + 3. Cuando te encuentres offline, recarga los archivos con [`PreTrainedModel.from_pretrained`] desde el directorio especificado: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") + ``` + +* Descarga de manera programática los archivos con la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): + + 1. Instala la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) en tu entorno virtual: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. Utiliza la función [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) para descargar un archivo a un path específico. Por ejemplo, el siguiente comando descarga el archivo `config.json` del modelo [T0](https://huggingface.co/bigscience/T0_3B) al path deseado: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") + ``` + +Una vez que el archivo se descargue y se almacene en caché localmente, especifica tu ruta local para cargarlo y usarlo: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") +``` + + + +Para más detalles sobre cómo descargar archivos almacenados en el Hub consulta la sección [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream). + + diff --git a/docs/source/es/installation.mdx b/docs/source/es/installation.mdx deleted file mode 100644 index 01b9d81409d4..000000000000 --- a/docs/source/es/installation.mdx +++ /dev/null @@ -1,238 +0,0 @@ - - -# Instalación - -En esta guía puedes encontrar información para instalar 🤗 Transformers para cualquier biblioteca de Machine Learning con la que estés trabajando. 
Además, encontrarás información sobre cómo establecer el caché y cómo configurar 🤗 Transformers para correrlo de manera offline (opcional). - -🤗 Transformers ha sido probada en Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, y Flax. Para instalar la biblioteca de deep learning con la que desees trabajar, sigue las instrucciones correspondientes listadas a continuación: - -* [PyTorch](https://pytorch.org/get-started/locally/) -* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) -* [Flax](https://flax.readthedocs.io/en/latest/) - -## Instalación con pip - -Es necesario instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si necesitas más información sobre entornos virtuales de Python, consulta esta [guía](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ -). Un entorno virtual facilita el manejo de proyectos y evita problemas de compatibilidad entre dependencias. - -Comienza por crear un entorno virtual en el directorio de tu proyecto: - -```bash -python -m venv .env -``` - -Activa el entorno virtual: - -```bash -source .env/bin/activate -``` - -Ahora puedes instalar 🤗 Transformers con el siguiente comando: - -```bash -pip install transformers -``` - -Solo para CPU, puedes instalar 🤗 Transformers y una biblioteca de deep learning con un comando de una sola línea. - -Por ejemplo, instala 🤗 Transformers y Pytorch: - -```bash -pip install transformers[torch] -``` - -🤗 Transformers y TensorFlow 2.0: - -```bash -pip install transformers[tf-cpu] -``` - -🤗 Transformers y Flax: - -```bash -pip install transformers[flax] -``` - -Por último, revisa si 🤗 Transformers ha sido instalada exitosamente con el siguiente comando que descarga un modelo pre-entrenado: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" -``` -Después imprime la etiqueta y el puntaje: - -```bash -[{'label': 'POSITIVE', 'score': 0.9998704791069031}] -``` - -## Instalación desde la fuente - -Instala 🤗 Transformers desde la fuente con el siguiente comando: - -```bash -pip install git+https://github.com/huggingface/transformers -``` - -El comando de arriba instala la versión `master` más actual en vez de la última versión estable. La versión `master` es útil para obtener los últimos avances de 🤗 Transformers. Por ejemplo, se puede dar el caso de que un error fue corregido después de la última versión estable pero aún no se ha liberado un nuevo lanzamiento. Sin embargo, existe la posibilidad de que la versión `master` no sea estable. El equipo trata de mantener la versión `master` operacional y la mayoría de los errores son resueltos en unas cuantas horas o un día. Si encuentras algún problema, por favor abre un [Issue](https://github.com/huggingface/transformers/issues) para que pueda ser corregido más rápido. - -Verifica si 🤗 Transformers está instalada apropiadamente con el siguiente comando: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" -``` - -## Instalación editable - -Necesitarás una instalación editable si deseas: -* Usar la versión `master` del código fuente. -* Contribuir a 🤗 Transformers y necesitas probar cambios en el código. - -Clona el repositorio e instala 🤗 Transformers con los siguientes comandos: - -```bash -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install -e . 
-``` - -Éstos comandos van a ligar el directorio desde donde clonamos el repositorio al path de las bibliotecas de Python. Python ahora buscará dentro de la carpeta que clonaste además de los paths normales de la biblioteca. Por ejemplo, si los paquetes de Python se encuentran instalados en `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python también buscará en el directorio desde donde clonamos el repositorio `~/transformers/`. - - - -Debes mantener el directorio `transformers` si deseas seguir usando la biblioteca. - - - -Puedes actualizar tu copia local a la última versión de 🤗 Transformers con el siguiente comando: - -```bash -cd ~/transformers/ -git pull -``` - -El entorno de Python que creaste para la instalación de 🤗 Transformers encontrará la versión `master` en la siguiente ejecución. - -## Instalación con conda - -Puedes instalar 🤗 Transformers desde el canal de conda `huggingface` con el siguiente comando: - -```bash -conda install -c huggingface transformers -``` - -## Configuración de Caché - -Los modelos preentrenados se descargan y almacenan en caché localmente en: `~/.cache/huggingface/transformers/`. Este es el directorio predeterminado proporcionado por la variable de entorno de shell `TRANSFORMERS_CACHE`. En Windows, el directorio predeterminado es dado por `C:\Users\username\.cache\huggingface\transformers`. Puedes cambiar las variables de entorno de shell que se muestran a continuación, en orden de prioridad, para especificar un directorio de caché diferente: - -1. Variable de entorno del shell (por defecto): `TRANSFORMERS_CACHE`. -2. Variable de entorno del shell:`HF_HOME` + `transformers/`. -3. Variable de entorno del shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. - - - -🤗 Transformers usará las variables de entorno de shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` si viene de una iteración anterior de la biblioteca y ha configurado esas variables de entorno, a menos que especifiques la variable de entorno de shell `TRANSFORMERS_CACHE`. - - - - -## Modo Offline - -🤗 Transformers puede ejecutarse en un entorno con firewall o fuera de línea (offline) usando solo archivos locales. Configura la variable de entorno `TRANSFORMERS_OFFLINE=1` para habilitar este comportamiento. - - - -Puedes añadir [🤗 Datasets](https://huggingface.co/docs/datasets/) al flujo de entrenamiento offline declarando la variable de entorno `HF_DATASETS_OFFLINE=1`. - - - -Por ejemplo, normalmente ejecutarías un programa en una red normal con firewall para instancias externas con el siguiente comando: - -```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Ejecuta este mismo programa en una instancia offline con el siguiente comando: - -```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -El script ahora debería ejecutarse sin bloquearse ni esperar a que se agote el tiempo de espera porque sabe que solo debe buscar archivos locales. - -### Obtener modelos y tokenizers para uso offline - -Otra opción para usar 🤗 Transformers offline es descargando previamente los archivos y después apuntar al path local donde se encuentren. Hay tres maneras de hacer esto: - -* Descarga un archivo mediante la interfaz de usuario del [Model Hub](https://huggingface.co/models) haciendo click en el ícono ↓. 
- - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) - - -* Utiliza el flujo de [`PreTrainedModel.from_pretrained`] y [`PreTrainedModel.save_pretrained`]: - 1. Descarga previamente los archivos con [`PreTrainedModel.from_pretrained`]: - - ```py - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") - ``` - - - 2. Guarda los archivos en un directorio específico con [`PreTrainedModel.save_pretrained`]: - - ```py - >>> tokenizer.save_pretrained("./your/path/bigscience_t0") - >>> model.save_pretrained("./your/path/bigscience_t0") - ``` - - 3. Cuando te encuentres offline, recarga los archivos con [`PreTrainedModel.from_pretrained`] desde el directorio especificado: - - ```py - >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") - >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") - ``` - -* Descarga de manera programática los archivos con la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): - - 1. Instala la biblioteca [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) en tu entorno virtual: - - ```bash - python -m pip install huggingface_hub - ``` - - 2. Utiliza la función [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) para descargar un archivo a un path específico. Por ejemplo, el siguiente comando descarga el archivo `config.json` del modelo [T0](https://huggingface.co/bigscience/T0_3B) al path deseado: - - ```py - >>> from huggingface_hub import hf_hub_download - - >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") - ``` - -Una vez que el archivo se descargue y se almacene en caché localmente, especifica tu ruta local para cargarlo y usarlo: - -```py ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") -``` - - - -Para más detalles sobre cómo descargar archivos almacenados en el Hub consulta la sección [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream). - - diff --git a/docs/source/es/model_sharing.md b/docs/source/es/model_sharing.md new file mode 100644 index 000000000000..46e1ee07a9a5 --- /dev/null +++ b/docs/source/es/model_sharing.md @@ -0,0 +1,223 @@ + + +# Compartir un modelo + +Los últimos dos tutoriales mostraron cómo puedes realizar fine-tunning a un modelo con PyTorch, Keras y 🤗 Accelerate para configuraciones distribuidas. ¡El siguiente paso es compartir tu modelo con la comunidad! En Hugging Face creemos en compartir abiertamente a todos el conocimiento y los recursos para democratizar la inteligencia artificial. En este sentido, te animamos a considerar compartir tu modelo con la comunidad, de esta forma ayudas a otros ahorrando tiempo y recursos. + +En este tutorial aprenderás dos métodos para compartir un modelo trained o fine-tuned en el [Model Hub](https://huggingface.co/models): + +- Mediante Código, enviando (push) tus archivos al Hub. +- Con la interfaz Web, con Drag-and-drop de tus archivos al Hub. + + + + + +Para compartir un modelo con la comunidad necesitas una cuenta en [huggingface.co](https://huggingface.co/join). También puedes unirte a una organización existente o crear una nueva. 
+ + + +## Características de los repositorios + +Cada repositorio en el Model Hub se comporta como cualquier otro repositorio en GitHub. Nuestros repositorios ofrecen versioning, commit history, y la habilidad para visualizar diferencias. + +El versioning desarrollado dentro del Model Hub es basado en git y [git-lfs](https://git-lfs.github.com/). En otras palabras, puedes tratar un modelo como un repositorio, brindando un mejor control de acceso y escalabilidad. Version control permite *revisions*, un método para apuntar a una versión específica de un modelo utilizando un commit hash, tag o branch. + +Como resultado, puedes cargar una versión específica del modelo con el parámetro `revision`: + +```py +>>> model = AutoModel.from_pretrained( +... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... ) +``` + +Los archivos son editados fácilmente dentro de un repositorio. Incluso puedes observar el commit history y las diferencias: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Configuración inicial + +Antes de compartir un modelo al Hub necesitarás tus credenciales de Hugging Face. Si tienes acceso a una terminal ejecuta el siguiente comando en el entorno virtual donde 🤗 Transformers esté instalado. Esto guardará tu token de acceso dentro de tu carpeta cache de Hugging Face (~/.cache/ by default): + +```bash +huggingface-cli login +``` + +Si usas un notebook como Jupyter o Colaboratory, asegúrate de tener instalada la biblioteca [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). Esta biblioteca te permitirá interactuar por código con el Hub. + +```bash +pip install huggingface_hub +``` + +Luego usa `notebook_login` para iniciar sesión al Hub, y sigue el link [aquí](https://huggingface.co/settings/token) para generar un token con el que iniciaremos sesión: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Convertir un modelo para todos los Frameworks + +Para asegurarnos que tu modelo pueda ser usado por alguien que esté trabajando con un framework diferente, te recomendamos convertir y subir tu modelo con checkpoints de pytorch y tensorflow. Aunque los usuarios aún son capaces de cargar su modelo desde un framework diferente, si se omite este paso será más lento debido a que 🤗 Transformers necesitará convertir el checkpoint sobre-la-marcha. + +Convertir un checkpoint para otro framework es fácil. Asegúrate tener Pytorch y TensorFlow instalado (Véase [aquí](installation) para instrucciones de instalación), y luego encuentra el modelo específico para tu tarea en el otro Framework. + +Por ejemplo, supongamos que has entrenado DistilBert para clasificación de secuencias en PyTorch y quieres convertirlo a su equivalente en TensorFlow. 
Cargas el equivalente en TensorFlow de tu modelo para tu tarea y especificas `from_pt=True` así 🤗 Transformers convertirá el Pytorch checkpoint a un TensorFlow Checkpoint: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +``` + +Luego guardas tu nuevo modelo TensorFlow con su nuevo checkpoint: + +```py +>>> tf_model.save_pretrained("path/to/awesome-name-you-picked") +``` + +De manera similar, especificas `from_tf=True` para convertir un checkpoint de TensorFlow a Pytorch: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +>>> pt_model.save_pretrained("path/to/awesome-name-you-picked") +``` + +Si algún modelo está disponible en Flax, también puedes convertir un checkpoint de Pytorch a Flax: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/to/awesome-name-you-picked", from_pt=True +... ) +``` + +## Compartir un modelo con `Trainer` + + + +Compartir un modelo al Hub es tan simple como añadir un parámetro extra o un callback. Si recuerdas del tutorial de [fine-tuning tutorial](training), la clase [`TrainingArguments`] es donde especificas los Hiperparámetros y opciones de entrenamiento adicionales. Una de estas opciones incluye la habilidad de compartir un modelo directamente al Hub. Para ello configuras `push_to_hub=True` dentro de [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +``` + +A continuación, como usualmente, pasa tus argumentos de entrenamiento a [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Luego que realizas fine-tune a tu modelo, llamas [`~transformers.Trainer.push_to_hub`] en [`Trainer`] para enviar el modelo al Hub!🤗 Transformers incluso añadirá automáticamente los Hiperparámetros de entrenamiento, resultados de entrenamiento y versiones del Framework a tu model card! + +```py +>>> trainer.push_to_hub() +``` + +## Compartir un modelo con `PushToHubCallback` + +Los usuarios de TensorFlow pueden activar la misma funcionalidad con [`PushToHubCallback`]. En la funcion [`PushToHubCallback`], agrega: + +- Un directorio de salida para tu modelo. +- Un tokenizador. +- El `hub_model_id`, el cual es tu usuario Hub y el nombre del modelo. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +... ) +``` + +Agregamos el callback a [`fit`](https://keras.io/api/models/model_training_apis/), y 🤗 Transformers enviará el modelo entrenado al Hub: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + +## Usando la función `push_to_hub` + +Puedes llamar la función `push_to_hub` directamente en tu modelo para subirlo al Hub. + +Especifica el nombre del modelo en `push_to_hub`: + +```py +>>> pt_model.push_to_hub("my-awesome-model") +``` + +Esto creará un repositorio bajo tu usuario con el nombre del modelo `my-awesome-model`. 
Ahora los usuarios pueden cargar tu modelo con la función `from_pretrained`:
+
+```py
+>>> from transformers import AutoModel
+
+>>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
+```
+
+Si perteneces a una organización y quieres compartir tu modelo bajo el nombre de la organización, añade el parámetro `organization`:
+
+```py
+>>> pt_model.push_to_hub("my-awesome-model", organization="my-awesome-org")
+```
+
+La función `push_to_hub` también puede ser usada para añadir archivos al repositorio del modelo. Por ejemplo, añade un tokenizador al repositorio:
+
+```py
+>>> tokenizer.push_to_hub("my-awesome-model")
+```
+
+O quizás te gustaría añadir la versión de TensorFlow de tu modelo fine-tuned en Pytorch:
+
+```py
+>>> tf_model.push_to_hub("my-awesome-model")
+```
+
+Ahora, cuando navegues a tu perfil en Hugging Face, deberías observar el repositorio de tu modelo creado recientemente. Si das click en el tab **Files**, observarás todos los archivos que has subido al repositorio.
+
+Para más detalles sobre cómo crear y subir archivos al repositorio, consulta la [documentación del Hub](https://huggingface.co/docs/hub/how-to-upstream).
+
+## Compartir con la interfaz web
+
+Los usuarios que prefieran un enfoque no-code tienen la opción de cargar su modelo a través de la interfaz gráfica del Hub. Visita la página [huggingface.co/new](https://huggingface.co/new) para crear un nuevo repositorio:
+
+![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png)
+
+Desde aquí, añade información acerca del modelo:
+
+- Selecciona el **owner** (la persona propietaria) del repositorio. Puedes ser tú o cualquier organización a la que pertenezcas.
+- Escoge un nombre para tu modelo. También será el nombre del repositorio.
+- Elige si tu modelo es público o privado.
+- Especifica la licencia que usará tu modelo.
+
+Ahora puedes hacer click en el tab **Files** y luego en el botón **Add file** para subir un nuevo archivo a tu repositorio. Luego arrastra y suelta un archivo a subir y le añades un mensaje al commit.
+
+![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png)
+
+## Añadiendo una tarjeta de modelo
+
+Para asegurarnos de que los usuarios entiendan las capacidades de tu modelo, sus limitaciones, posibles sesgos y consideraciones éticas, por favor añade una tarjeta (como una tarjeta de presentación) al repositorio del modelo. La tarjeta de modelo se define en el archivo `README.md`. Puedes agregarla de una de las siguientes maneras:
+
+* Elaborando y subiendo manualmente el archivo `README.md`.
+* Dando click en el botón **Edit model card** dentro del repositorio.
+
+Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md`, como la huella de carbono del modelo o ejemplos de widgets.
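+
+Como referencia adicional, este es un boceto mínimo e hipotético (no forma parte de la guía original) de cómo crear y subir una tarjeta de modelo por código con la biblioteca `huggingface_hub`; se asume que ya iniciaste sesión con `huggingface-cli login` y que el repositorio `your_username/my-awesome-model` ya existe en el Hub:
+
+```py
+>>> from huggingface_hub import upload_file
+
+>>> # Contenido mínimo de ejemplo: metadatos YAML seguidos de una breve descripción
+>>> tarjeta = """---
+... language: es
+... license: apache-2.0
+... ---
+...
+... # my-awesome-model
+...
+... Modelo de ejemplo con fines ilustrativos.
+... """
+
+>>> # Sube el README.md al repositorio del modelo (el nombre del repositorio es hipotético)
+>>> upload_file(
+...     path_or_fileobj=tarjeta.encode(),
+...     path_in_repo="README.md",
+...     repo_id="your_username/my-awesome-model",
+... )
+```
+
+Los metadatos YAML (por ejemplo `language` y `license`) son los mismos campos que puedes editar desde el botón **Edit model card** de la interfaz web.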
diff --git a/docs/source/es/model_sharing.mdx b/docs/source/es/model_sharing.mdx deleted file mode 100644 index 38a52072b41b..000000000000 --- a/docs/source/es/model_sharing.mdx +++ /dev/null @@ -1,219 +0,0 @@ - - -# Compartir un modelo - -Los últimos dos tutoriales mostraron cómo puedes realizar fine-tunning a un modelo con PyTorch, Keras y 🤗 Accelerate para configuraciones distribuidas. ¡El siguiente paso es compartir tu modelo con la comunidad! En Hugging Face creemos en compartir abiertamente a todos el conocimiento y los recursos para democratizar la inteligencia artificial. En este sentido, te animamos a considerar compartir tu modelo con la comunidad, de esta forma ayudas a otros ahorrando tiempo y recursos. - -En este tutorial aprenderás dos métodos para compartir un modelo trained o fine-tuned en el [Model Hub](https://huggingface.co/models): - -- Mediante Código, enviando (push) tus archivos al Hub. -- Con la interfaz Web, con Drag-and-drop de tus archivos al Hub. - - - - - -Para compartir un modelo con la comunidad necesitas una cuenta en [huggingface.co](https://huggingface.co/join). También puedes unirte a una organización existente o crear una nueva. - - - -## Características de los repositorios - -Cada repositorio en el Model Hub se comporta como cualquier otro repositorio en GitHub. Nuestros repositorios ofrecen versioning, commit history, y la habilidad para visualizar diferencias. - -El versioning desarrollado dentro del Model Hub es basado en git y [git-lfs](https://git-lfs.github.com/). En otras palabras, puedes tratar un modelo como un repositorio, brindando un mejor control de acceso y escalabilidad. Version control permite *revisions*, un método para apuntar a una versión específica de un modelo utilizando un commit hash, tag o branch. - -Como resultado, puedes cargar una versión específica del modelo con el parámetro `revision`: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash -... ) -``` - -Los archivos son editados fácilmente dentro de un repositorio. Incluso puedes observar el commit history y las diferencias: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Configuración inicial - -Antes de compartir un modelo al Hub necesitarás tus credenciales de Hugging Face. Si tienes acceso a una terminal ejecuta el siguiente comando en el entorno virtual donde 🤗 Transformers esté instalado. Esto guardará tu token de acceso dentro de tu carpeta cache de Hugging Face (~/.cache/ by default): - -```bash -huggingface-cli login -``` - -Si usas un notebook como Jupyter o Colaboratory, asegúrate de tener instalada la biblioteca [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). Esta biblioteca te permitirá interactuar por código con el Hub. - -```bash -pip install huggingface_hub -``` - -Luego usa `notebook_login` para iniciar sesión al Hub, y sigue el link [aquí](https://huggingface.co/settings/token) para generar un token con el que iniciaremos sesión: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Convertir un modelo para todos los Frameworks - -Para asegurarnos que tu modelo pueda ser usado por alguien que esté trabajando con un framework diferente, te recomendamos convertir y subir tu modelo con checkpoints de pytorch y tensorflow. 
Aunque los usuarios aún son capaces de cargar su modelo desde un framework diferente, si se omite este paso será más lento debido a que 🤗 Transformers necesitará convertir el checkpoint sobre-la-marcha. - -Convertir un checkpoint para otro framework es fácil. Asegúrate tener Pytorch y TensorFlow instalado (Véase [aquí](installation) para instrucciones de instalación), y luego encuentra el modelo específico para tu tarea en el otro Framework. - -Por ejemplo, supongamos que has entrenado DistilBert para clasificación de secuencias en PyTorch y quieres convertirlo a su equivalente en TensorFlow. Cargas el equivalente en TensorFlow de tu modelo para tu tarea y especificas `from_pt=True` así 🤗 Transformers convertirá el Pytorch checkpoint a un TensorFlow Checkpoint: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) -``` - -Luego guardas tu nuevo modelo TensorFlow con su nuevo checkpoint: - -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - -De manera similar, especificas `from_tf=True` para convertir un checkpoint de TensorFlow a Pytorch: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - -Si algún modelo está disponible en Flax, también puedes convertir un checkpoint de Pytorch a Flax: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - -## Compartir un modelo con `Trainer` - - - -Compartir un modelo al Hub es tan simple como añadir un parámetro extra o un callback. Si recuerdas del tutorial de [fine-tuning tutorial](training), la clase [`TrainingArguments`] es donde especificas los Hiperparámetros y opciones de entrenamiento adicionales. Una de estas opciones incluye la habilidad de compartir un modelo directamente al Hub. Para ello configuras `push_to_hub=True` dentro de [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) -``` - -A continuación, como usualmente, pasa tus argumentos de entrenamiento a [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Luego que realizas fine-tune a tu modelo, llamas [`~transformers.Trainer.push_to_hub`] en [`Trainer`] para enviar el modelo al Hub!🤗 Transformers incluso añadirá automáticamente los Hiperparámetros de entrenamiento, resultados de entrenamiento y versiones del Framework a tu model card! - -```py ->>> trainer.push_to_hub() -``` - -## Compartir un modelo con `PushToHubCallback` - -Los usuarios de TensorFlow pueden activar la misma funcionalidad con [`PushToHubCallback`]. En la funcion [`PushToHubCallback`], agrega: - -- Un directorio de salida para tu modelo. -- Un tokenizador. -- El `hub_model_id`, el cual es tu usuario Hub y el nombre del modelo. - -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... 
) -``` - -Agregamos el callback a [`fit`](https://keras.io/api/models/model_training_apis/), y 🤗 Transformers enviará el modelo entrenado al Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - -## Usando la función `push_to_hub` - -Puedes llamar la función `push_to_hub` directamente en tu modelo para subirlo al Hub. - -Especifica el nombre del modelo en `push_to_hub`: - -```py ->>> pt_model.push_to_hub("my-awesome-model") -``` - -Esto creará un repositorio bajo tu usuario con el nombre del modelo `my-awesome-model`. Ahora los usuarios pueden cargar tu modelo con la función `from_pretrained`: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") -``` - -Si perteneces a una organización y quieres compartir tu modelo bajo el nombre de la organización, añade el parámetro `organization`: - -```py ->>> pt_model.push_to_hub("my-awesome-model", organization="my-awesome-org") -``` - -La función `push_to_hub` también puede ser usada para añadir archivos al repositorio del modelo. Por ejemplo, añade un tokenizador al repositorio: - -```py ->>> tokenizer.push_to_hub("my-awesome-model") -``` - -O quizás te gustaría añadir la versión de TensorFlow de tu modelo fine-tuned en Pytorch: - -```py ->>> tf_model.push_to_hub("my-awesome-model") -``` - -Ahora, cuando navegues a tu perfil en Hugging Face, deberías observar el repositorio de tu modelo creado recientemente. Si das click en el tab **Files** observarás todos los archivos que has subido al repositorio. - -Para más detalles sobre cómo crear y subir archivos al repositorio, consulta la [documentación del Hub](https://huggingface.co/docs/hub/how-to-upstream). - -## Compartir con la interfaz web - -Los usuarios que prefieran un enfoque no-code tienen la opción de cargar su modelo a través de la interfaz gráfica del Hub. Visita la página [huggingface.co/new](https://huggingface.co/new) para crear un nuevo repositorio: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -Desde aquí, añade información acerca del modelo: - -- Selecciona el **owner** (la persona propietaria) del repositorio. Puedes ser tú o cualquier organización a la que pertenezcas. -- Escoge un nombre para tu modelo. También será el nombre del repositorio. -- Elige si tu modelo es público o privado. -- Especifica la licencia que usará tu modelo. - -Ahora puedes hacer click en el tab **Files** y luego en el botón **Add file** para subir un nuevo archivo a tu repositorio. Luego arrastra y suelta un archivo a subir y le añades un mensaje al commit. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Añadiendo una tarjeta de modelo - -Para asegurarnos que los usuarios entiendan las capacidades de tu modelo, sus limitaciones, posibles sesgos y consideraciones éticas, por favor añade una tarjeta (como una tarjeta de presentación) al repositorio del modelo. La tarjeta de modelo es definida en el archivo `README.md`. Puedes agregar una de la siguiente manera: - -* Elaborando y subiendo manualmente el archivo`README.md`. -* Dando click en el botón **Edit model card** dentro del repositorio. - -Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. 
Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md` como la huella de carbono del modelo o ejemplos de widgets. Consulta la documentación [aquí] (https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/es/multilingual.md b/docs/source/es/multilingual.md new file mode 100644 index 000000000000..fa60cac68c26 --- /dev/null +++ b/docs/source/es/multilingual.md @@ -0,0 +1,179 @@ + + +# Modelos multilingües para inferencia + +[[open-in-colab]] + +Existen varios modelos multilingües en 🤗 Transformers y su uso para inferencia difiere de los modelos monolingües. Sin embargo, no *todos* los usos de los modelos multilingües son diferentes. Algunos modelos, como [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), pueden utilizarse igual que un modelo monolingüe. Esta guía te enseñará cómo utilizar modelos multilingües cuyo uso difiere en la inferencia. + +## XLM + +XLM tiene diez checkpoints diferentes de los cuales solo uno es monolingüe. Los nueve checkpoints restantes del modelo pueden dividirse en dos categorías: los checkpoints que utilizan language embeddings y los que no. + +### XLM con language embeddings + +Los siguientes modelos XLM usan language embeddings para especificar el lenguaje utilizado en la inferencia: + +- `xlm-mlm-ende-1024` (Masked language modeling, English-German) +- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) +- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) +- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) +- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) +- `xlm-clm-enfr-1024` (Causal language modeling, English-French) +- `xlm-clm-ende-1024` (Causal language modeling, English-German) + +Los language embeddings son representados como un tensor de la mismas dimensiones que los `input_ids` pasados al modelo. Los valores de estos tensores dependen del idioma utilizado y se identifican mediante los atributos `lang2id` y `id2lang` del tokenizador. + +En este ejemplo, carga el checkpoint `xlm-clm-enfr-1024` (Causal language modeling, English-French): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +El atributo `lang2id` del tokenizador muestra los idiomas de este modelo y sus ids: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +A continuación, crea un input de ejemplo: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +Establece el id del idioma, por ejemplo `"en"`, y utilízalo para definir el language embedding. El language embedding es un tensor lleno de `0` ya que es el id del idioma para inglés. Este tensor debe ser del mismo tamaño que `input_ids`. 
+ +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +Ahora puedes pasar los `input_ids` y el language embedding al modelo: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +El script [run_generation.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-generation/run_generation.py) puede generar texto con language embeddings utilizando los checkpoints `xlm-clm`. + +### XLM sin language embeddings + +Los siguientes modelos XLM no requieren language embeddings durante la inferencia: + +- `xlm-mlm-17-1280` (modelado de lenguaje enmascarado, 17 idiomas) +- `xlm-mlm-100-1280` (modelado de lenguaje enmascarado, 100 idiomas) + +Estos modelos se utilizan para representaciones genéricas de frases a diferencia de los anteriores checkpoints XLM. + +## BERT + +Los siguientes modelos de BERT pueden utilizarse para tareas multilingües: + +- `bert-base-multilingual-uncased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 102 idiomas) +- `bert-base-multilingual-cased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 104 idiomas) + +Estos modelos no requieren language embeddings durante la inferencia. Deben identificar la lengua a partir del +contexto e inferir en consecuencia. + +## XLM-RoBERTa + +Los siguientes modelos de XLM-RoBERTa pueden utilizarse para tareas multilingües: + +- `xlm-roberta-base` (modelado de lenguaje enmascarado, 100 idiomas) +- `xlm-roberta-large` (Modelado de lenguaje enmascarado, 100 idiomas) + +XLM-RoBERTa se entrenó con 2,5 TB de datos CommonCrawl recién creados y depurados en 100 idiomas. Proporciona fuertes ventajas sobre los modelos multilingües publicados anteriormente como mBERT o XLM en tareas posteriores como la clasificación, el etiquetado de secuencias y la respuesta a preguntas. + +## M2M100 + +Los siguientes modelos de M2M100 pueden utilizarse para traducción multilingüe: + +- `facebook/m2m100_418M` (traducción) +- `facebook/m2m100_1.2B` (traducción) + +En este ejemplo, carga el checkpoint `facebook/m2m100_418M` para traducir del chino al inglés. Puedes establecer el idioma de origen en el tokenizador: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +Tokeniza el texto: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 fuerza el id del idioma de destino como el primer token generado para traducir al idioma de destino.. Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' 
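+>>> # Nota (ejemplo ilustrativo, no incluido en la guía original): para traducir a otro idioma
+>>> # compatible con M2M100 bastaría con cambiar el id de idioma forzado, por ejemplo al francés:
+>>> # generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("fr"))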
+``` + +## MBart + +Los siguientes modelos de MBart pueden utilizarse para traducción multilingüe: + +- `facebook/mbart-large-50-one-to-many-mmt` (traducción automática multilingüe de uno a muchos, 50 idiomas) +- `facebook/mbart-large-50-many-to-many-mmt` (traducción automática multilingüe de muchos a muchos, 50 idiomas) +- `facebook/mbart-large-50-many-to-one-mmt` (traducción automática multilingüe muchos a uno, 50 idiomas) +- `facebook/mbart-large-50` (traducción multilingüe, 50 idiomas) +- `facebook/mbart-large-cc25` + +En este ejemplo, carga el checkpoint `facebook/mbart-large-50-many-to-many-mmt` para traducir del finlandés al inglés. Puedes establecer el idioma de origen en el tokenizador: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +Tokeniza el texto: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +MBart fuerza el id del idioma de destino como el primer token generado para traducirlo. Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +Si estás usando el checkpoint `facebook/mbart-large-50-many-to-one-mmt` no necesitas forzar el id del idioma de destino como el primer token generado, de lo contrario el uso es el mismo. diff --git a/docs/source/es/multilingual.mdx b/docs/source/es/multilingual.mdx deleted file mode 100644 index 4849416a44db..000000000000 --- a/docs/source/es/multilingual.mdx +++ /dev/null @@ -1,175 +0,0 @@ - - -# Modelos multilingües para inferencia - -[[open-in-colab]] - -Existen varios modelos multilingües en 🤗 Transformers y su uso para inferencia difiere de los modelos monolingües. Sin embargo, no *todos* los usos de los modelos multilingües son diferentes. Algunos modelos, como [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), pueden utilizarse igual que un modelo monolingüe. Esta guía te enseñará cómo utilizar modelos multilingües cuyo uso difiere en la inferencia. - -## XLM - -XLM tiene diez checkpoints diferentes de los cuales solo uno es monolingüe. Los nueve checkpoints restantes del modelo pueden dividirse en dos categorías: los checkpoints que utilizan language embeddings y los que no. 
- -### XLM con language embeddings - -Los siguientes modelos XLM usan language embeddings para especificar el lenguaje utilizado en la inferencia: - -- `xlm-mlm-ende-1024` (Masked language modeling, English-German) -- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) -- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) -- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) -- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) -- `xlm-clm-enfr-1024` (Causal language modeling, English-French) -- `xlm-clm-ende-1024` (Causal language modeling, English-German) - -Los language embeddings son representados como un tensor de la mismas dimensiones que los `input_ids` pasados al modelo. Los valores de estos tensores dependen del idioma utilizado y se identifican mediante los atributos `lang2id` y `id2lang` del tokenizador. - -En este ejemplo, carga el checkpoint `xlm-clm-enfr-1024` (Causal language modeling, English-French): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") -``` - -El atributo `lang2id` del tokenizador muestra los idiomas de este modelo y sus ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -A continuación, crea un input de ejemplo: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Establece el id del idioma, por ejemplo `"en"`, y utilízalo para definir el language embedding. El language embedding es un tensor lleno de `0` ya que es el id del idioma para inglés. Este tensor debe ser del mismo tamaño que `input_ids`. - -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Ahora puedes pasar los `input_ids` y el language embedding al modelo: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -El script [run_generation.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-generation/run_generation.py) puede generar texto con language embeddings utilizando los checkpoints `xlm-clm`. - -### XLM sin language embeddings - -Los siguientes modelos XLM no requieren language embeddings durante la inferencia: - -- `xlm-mlm-17-1280` (modelado de lenguaje enmascarado, 17 idiomas) -- `xlm-mlm-100-1280` (modelado de lenguaje enmascarado, 100 idiomas) - -Estos modelos se utilizan para representaciones genéricas de frases a diferencia de los anteriores checkpoints XLM. - -## BERT - -Los siguientes modelos de BERT pueden utilizarse para tareas multilingües: - -- `bert-base-multilingual-uncased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 102 idiomas) -- `bert-base-multilingual-cased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 104 idiomas) - -Estos modelos no requieren language embeddings durante la inferencia. Deben identificar la lengua a partir del -contexto e inferir en consecuencia. 
- -## XLM-RoBERTa - -Los siguientes modelos de XLM-RoBERTa pueden utilizarse para tareas multilingües: - -- `xlm-roberta-base` (modelado de lenguaje enmascarado, 100 idiomas) -- `xlm-roberta-large` (Modelado de lenguaje enmascarado, 100 idiomas) - -XLM-RoBERTa se entrenó con 2,5 TB de datos CommonCrawl recién creados y depurados en 100 idiomas. Proporciona fuertes ventajas sobre los modelos multilingües publicados anteriormente como mBERT o XLM en tareas posteriores como la clasificación, el etiquetado de secuencias y la respuesta a preguntas. - -## M2M100 - -Los siguientes modelos de M2M100 pueden utilizarse para traducción multilingüe: - -- `facebook/m2m100_418M` (traducción) -- `facebook/m2m100_1.2B` (traducción) - -En este ejemplo, carga el checkpoint `facebook/m2m100_418M` para traducir del chino al inglés. Puedes establecer el idioma de origen en el tokenizador: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Tokeniza el texto: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 fuerza el id del idioma de destino como el primer token generado para traducir al idioma de destino.. Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' -``` - -## MBart - -Los siguientes modelos de MBart pueden utilizarse para traducción multilingüe: - -- `facebook/mbart-large-50-one-to-many-mmt` (traducción automática multilingüe de uno a muchos, 50 idiomas) -- `facebook/mbart-large-50-many-to-many-mmt` (traducción automática multilingüe de muchos a muchos, 50 idiomas) -- `facebook/mbart-large-50-many-to-one-mmt` (traducción automática multilingüe muchos a uno, 50 idiomas) -- `facebook/mbart-large-50` (traducción multilingüe, 50 idiomas) -- `facebook/mbart-large-cc25` - -En este ejemplo, carga el checkpoint `facebook/mbart-large-50-many-to-many-mmt` para traducir del finlandés al inglés. Puedes establecer el idioma de origen en el tokenizador: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." - ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Tokeniza el texto: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart fuerza el id del idioma de destino como el primer token generado para traducirlo. 
Establece el `forced_bos_token_id` a `en` en el método `generate` para traducir al inglés:
-
-```py
->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX"))
->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry."
-```
-
-Si estás usando el checkpoint `facebook/mbart-large-50-many-to-one-mmt` no necesitas forzar el id del idioma de destino como el primer token generado, de lo contrario el uso es el mismo.
diff --git a/docs/source/es/philosophy.md b/docs/source/es/philosophy.md
new file mode 100644
index 000000000000..4054ac0ae507
--- /dev/null
+++ b/docs/source/es/philosophy.md
@@ -0,0 +1,79 @@
+
+
+# Filosofía
+
+🤗 Transformers es una biblioteca construida para:
+
+- Los investigadores y educadores de NLP que busquen usar/estudiar/extender modelos transformers a gran escala
+- Profesionales que quieren optimizar esos modelos y/o ponerlos en producción
+- Ingenieros que solo quieren descargar un modelo preentrenado y usarlo para resolver una tarea NLP dada.
+
+La biblioteca fue diseñada con dos fuertes objetivos en mente:
+
+- Que sea tan fácil y rápida de utilizar como sea posible:
+
+  - Hemos limitado enormemente el número de abstracciones que el usuario tiene que aprender. De hecho, no hay casi abstracciones,
+    solo tres clases estándar necesarias para usar cada modelo: [configuration](main_classes/configuration),
+    [models](main_classes/model) y [tokenizer](main_classes/tokenizer).
+  - Todas estas clases pueden ser inicializadas de forma simple y unificada a partir de instancias pre-entrenadas mediante un método común
+    `from_pretrained()`, que se encarga de descargar (si es necesario), almacenar en caché y cargar la instancia de la clase correspondiente y los datos asociados
+    (los hiperparámetros de las configuraciones, el vocabulario de los tokenizadores y los pesos de los modelos) a partir de un checkpoint pre-entrenado proporcionado en
+    [Hugging Face Hub](https://huggingface.co/models) o de tu propio checkpoint guardado.
+  - Por encima de esas tres clases estándar, la biblioteca proporciona dos APIs: [`pipeline`] para usar rápidamente un modelo (junto a su configuración y tokenizer asociados)
+    sobre una tarea dada, y [`Trainer`]/`Keras.fit` para entrenar u optimizar de forma rápida un modelo dado.
+  - Como consecuencia, esta biblioteca NO es una caja de herramientas modular de bloques individuales para redes neuronales. Si quieres extender/construir sobre la biblioteca,
+    usa simplemente los módulos regulares de Python/PyTorch/TensorFlow/Keras y emplea las clases estándar de la biblioteca como punto de partida para reutilizar funcionalidades
+    tales como abrir/guardar modelo.
+
+- Que proporcione modelos modernos con un rendimiento lo más parecido posible al de los modelos originales:
+
+  - Proporcionamos al menos un ejemplo para cada arquitectura que reproduce un resultado proporcionado por los autores de dicha arquitectura.
+  - El código normalmente es parecido al código base original, lo cual significa que algún código PyTorch puede no ser tan
+    *pytorchic* como podría ser por haber sido convertido desde código TensorFlow, y viceversa.
+
+Unos cuantos objetivos adicionales:
+
+- Exponer las características internas de los modelos de la forma más coherente posible:
+
+  - Damos acceso, mediante una sola API, a todos los estados ocultos y pesos de atención.
+  - Las APIs del tokenizer y del modelo base están estandarizadas para cambiar fácilmente entre modelos.
+ +- Incorporar una selección subjetiva de herramientas de gran potencial para la optimización/investigación de estos modelos: + + - Una forma sencilla/coherente de añadir nuevos tokens al vocabulario e incrustraciones (embeddings, en inglés) para optimización. + - Formas sencillas de camuflar y reducir "transformer heads". + +- Cambiar fácilmente entre PyTorch y TensorFlow 2.0, permitiendo el entrenamiento usando un marco y la inferencia usando otro. + +## Conceptos principales + +La biblioteca está construida alrededor de tres tipos de clases para cada modelo: + +- **Model classes** como [`BertModel`], que consisten en más de 30 modelos PyTorch ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)) o modelos Keras ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) que funcionan con pesos pre-entrenados proporcionados en la + biblioteca. +- **Configuration classes** como [`BertConfig`], que almacena todos los parámetros necesarios para construir un modelo. + No siempre tienes que generarla tu. En particular, si estas usando un modelo pre-entrenado sin ninguna modificación, + la creación del modelo se encargará automáticamente de generar la configuración (que es parte del modelo). +- **Tokenizer classes** como [`BertTokenizer`], que almacena el vocabulario para cada modelo y proporciona métodos para + codificar/decodificar strings en una lista de índices de "token embeddings" para ser empleados en un modelo. + +Todas estas clases pueden ser generadas a partir de ejemplos pre-entrenados, y guardados localmente usando dos métodos: + +- `from_pretrained()` permite generar un modelo/configuración/tokenizer a partir de una versión pre-entrenada proporcionada ya sea por + la propia biblioteca (los modelos compatibles se pueden encontrar en [Model Hub](https://huggingface.co/models)) o + guardados localmente (o en un servidor) por el usuario. +- `save_pretrained()` permite guardar un modelo/configuración/tokenizer localmente, de forma que puede ser empleado de nuevo usando + `from_pretrained()`. diff --git a/docs/source/es/philosophy.mdx b/docs/source/es/philosophy.mdx deleted file mode 100644 index 65e9a2c67a42..000000000000 --- a/docs/source/es/philosophy.mdx +++ /dev/null @@ -1,75 +0,0 @@ - - -# Filosofía - -🤗 Transformers es una biblioteca construida para: - -- Los investigadores y educadores de NLP que busquen usar/estudiar/extender modelos transformers a gran escala -- Profesionales que quieren optimizar esos modelos y/o ponerlos en producción -- Ingenieros que solo quieren descargar un modelo preentrenado y usarlo para resolver una tarea NLP dada. - -La biblioteca fue diseñada con dos fuertes objetivos en mente: - -- Que sea tan fácil y rápida de utilizar como sea posible: - - - Hemos limitado enormemente el número de abstracciones que el usuario tiene que aprender. De hecho, no hay casi abstracciones, - solo tres clases estándar necesarias para usar cada modelo: [configuration](main_classes/configuration), - [models](main_classes/model) y [tokenizer](main_classes/tokenizer). 
- - Todas estas clases pueden ser inicializadas de forma simple y unificada a partir de ejemplos pre-entrenados mediante el uso de un método - `from_pretrained()` común de solicitud que se encargará de descargar (si es necesario), almacenar y cargar la solicitud de clase relacionada y datos asociados - (configurations' hyper-parameters, tokenizers' vocabulary, and models' weights) a partir de un control pre-entrenado proporcionado en - [Hugging Face Hub](https://huggingface.co/models) o de tu propio control guardado. - - Por encima de esas tres clases estándar, la biblioteca proporciona dos APIs: [`pipeline`] para usar rápidamente un modelo (junto a su configuracion y tokenizer asociados) - sobre una tarea dada, y [`Trainer`]/`Keras.fit` para entrenar u optimizar de forma rápida un modelo dado. - - Como consecuencia, esta biblioteca NO es una caja de herramientas modular de bloques individuales para redes neuronales. Si quieres extender/construir sobre la biblioteca, - usa simplemente los módulos regulares de Python/PyTorch/TensorFlow/Keras y emplea las clases estándar de la biblioteca como punto de partida para reutilizar funcionalidades - tales como abrir/guardar modelo. - -- Proporciona modelos modernos con rendimientos lo más parecido posible a los modelos originales: - - - Proporcionamos al menos un ejemplo para cada arquitectura que reproduce un resultado proporcionado por los autores de dicha arquitectura. - - El código normalmente es parecido al código base original, lo cual significa que algún código Pytorch puede no ser tan - *pytorchic* como podría ser por haber sido convertido a código TensorFlow, y viceversa. - -Unos cuantos objetivos adicionales: - -- Exponer las características internas de los modelos de la forma más coherente posible: - - - Damos acceso, mediante una sola API, a todos los estados ocultos y pesos de atención. - - Tokenizer y el modelo de API base están estandarizados para cambiar fácilmente entre modelos. - -- Incorporar una selección subjetiva de herramientas de gran potencial para la optimización/investigación de estos modelos: - - - Una forma sencilla/coherente de añadir nuevos tokens al vocabulario e incrustraciones (embeddings, en inglés) para optimización. - - Formas sencillas de camuflar y reducir "transformer heads". - -- Cambiar fácilmente entre PyTorch y TensorFlow 2.0, permitiendo el entrenamiento usando un marco y la inferencia usando otro. - -## Conceptos principales - -La biblioteca está construida alrededor de tres tipos de clases para cada modelo: - -- **Model classes** como [`BertModel`], que consisten en más de 30 modelos PyTorch ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)) o modelos Keras ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) que funcionan con pesos pre-entrenados proporcionados en la - biblioteca. -- **Configuration classes** como [`BertConfig`], que almacena todos los parámetros necesarios para construir un modelo. - No siempre tienes que generarla tu. En particular, si estas usando un modelo pre-entrenado sin ninguna modificación, - la creación del modelo se encargará automáticamente de generar la configuración (que es parte del modelo). -- **Tokenizer classes** como [`BertTokenizer`], que almacena el vocabulario para cada modelo y proporciona métodos para - codificar/decodificar strings en una lista de índices de "token embeddings" para ser empleados en un modelo. 
- -Todas estas clases pueden ser generadas a partir de ejemplos pre-entrenados, y guardados localmente usando dos métodos: - -- `from_pretrained()` permite generar un modelo/configuración/tokenizer a partir de una versión pre-entrenada proporcionada ya sea por - la propia biblioteca (los modelos compatibles se pueden encontrar en [Model Hub](https://huggingface.co/models)) o - guardados localmente (o en un servidor) por el usuario. -- `save_pretrained()` permite guardar un modelo/configuración/tokenizer localmente, de forma que puede ser empleado de nuevo usando - `from_pretrained()`. diff --git a/docs/source/es/pipeline_tutorial.md b/docs/source/es/pipeline_tutorial.md new file mode 100644 index 000000000000..0f77c3c3db83 --- /dev/null +++ b/docs/source/es/pipeline_tutorial.md @@ -0,0 +1,143 @@ + + +# Pipelines para inferencia + +Un [`pipeline`] simplifica el uso de cualquier modelo del [Model Hub](https://huggingface.co/models) para la inferencia en una variedad de tareas como la generación de texto, la segmentación de imágenes y la clasificación de audio. Incluso si no tienes experiencia con una modalidad específica o no comprendes el código que alimenta los modelos, ¡aún puedes usarlos con el [`pipeline`]! Este tutorial te enseñará a: + +* Utilizar un [`pipeline`] para inferencia. +* Utilizar un tokenizador o modelo específico. +* Utilizar un [`pipeline`] para tareas de audio y visión. + + + +Echa un vistazo a la documentación de [`pipeline`] para obtener una lista completa de tareas admitidas. + + + +## Uso del pipeline + +Si bien cada tarea tiene un [`pipeline`] asociado, es más sencillo usar la abstracción general [`pipeline`] que contiene todos los pipelines de tareas específicas. El [`pipeline`] carga automáticamente un modelo predeterminado y un tokenizador con capacidad de inferencia para tu tarea. + +1. Comienza creando un [`pipeline`] y específica una tarea de inferencia: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation") +``` + +2. Pasa tu texto de entrada al [`pipeline`]: + +```py +>>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] +``` + +Si tienes más de una entrada, pásala como una lista: + +```py +>>> generator( +... [ +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", +... ] +... ) +``` + +Cualquier parámetro adicional para tu tarea también se puede incluir en el [`pipeline`]. La tarea `text-generation` tiene un método [`~generation.GenerationMixin.generate`] con varios parámetros para controlar la salida. Por ejemplo, si deseas generar más de una salida, defínelo en el parámetro `num_return_sequences`: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... num_return_sequences=2, +... ) +``` + +### Selecciona un modelo y un tokenizador + +El [`pipeline`] acepta cualquier modelo del [Model Hub](https://huggingface.co/models). Hay etiquetas en el Model Hub que te permiten filtrar por el modelo que te gustaría utilizar para tu tarea. 
Una vez que hayas elegido un modelo apropiado, cárgalo con la clase `AutoModelFor` y [`AutoTokenizer`] correspondientes. Por ejemplo, carga la clase [`AutoModelForCausalLM`] para una tarea de modelado de lenguaje causal: + +```py +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Crea un [`pipeline`] para tu tarea y específica el modelo y el tokenizador que cargaste: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) +``` + +Pasa tu texto de entrada a [`pipeline`] para generar algo de texto: + +```py +>>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] +``` + +## Pipeline de audio + +La flexibilidad de [`pipeline`] significa que también se puede extender a tareas de audio. + +Por ejemplo, clasifiquemos la emoción de un breve fragmento del famoso discurso de John F. Kennedy ["We choose to go to the Moon"](https://en.wikipedia.org/wiki/We_choose_to_go_to_the_Moon). Encuentra un modelo de [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) para reconocimiento de emociones en el Model Hub y cárgalo en el [`pipeline`]: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +Pasa el archivo de audio al [`pipeline`]: + +```py +>>> audio_classifier("jfk_moon_speech.wav") +[{'label': 'calm', 'score': 0.13856211304664612}, + {'label': 'disgust', 'score': 0.13148026168346405}, + {'label': 'happy', 'score': 0.12635163962841034}, + {'label': 'angry', 'score': 0.12439591437578201}, + {'label': 'fearful', 'score': 0.12404385954141617}] +``` + +## Pipeline de visión + +Finalmente, utilizar un [`pipeline`] para tareas de visión es prácticamente igual. + +Específica tu tarea de visión y pasa tu imagen al clasificador. La imagen puede ser un enlace o una ruta local a la imagen. Por ejemplo, ¿qué especie de gato se muestra a continuación? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... 
) +[{'label': 'lynx, catamount', 'score': 0.4403027892112732}, + {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', + 'score': 0.03433405980467796}, + {'label': 'snow leopard, ounce, Panthera uncia', + 'score': 0.032148055732250214}, + {'label': 'Egyptian cat', 'score': 0.02353910356760025}, + {'label': 'tiger cat', 'score': 0.023034192621707916}] +``` diff --git a/docs/source/es/pipeline_tutorial.mdx b/docs/source/es/pipeline_tutorial.mdx deleted file mode 100644 index af202758eb13..000000000000 --- a/docs/source/es/pipeline_tutorial.mdx +++ /dev/null @@ -1,139 +0,0 @@ - - -# Pipelines para inferencia - -Un [`pipeline`] simplifica el uso de cualquier modelo del [Model Hub](https://huggingface.co/models) para la inferencia en una variedad de tareas como la generación de texto, la segmentación de imágenes y la clasificación de audio. Incluso si no tienes experiencia con una modalidad específica o no comprendes el código que alimenta los modelos, ¡aún puedes usarlos con el [`pipeline`]! Este tutorial te enseñará a: - -* Utilizar un [`pipeline`] para inferencia. -* Utilizar un tokenizador o modelo específico. -* Utilizar un [`pipeline`] para tareas de audio y visión. - - - -Echa un vistazo a la documentación de [`pipeline`] para obtener una lista completa de tareas admitidas. - - - -## Uso del pipeline - -Si bien cada tarea tiene un [`pipeline`] asociado, es más sencillo usar la abstracción general [`pipeline`] que contiene todos los pipelines de tareas específicas. El [`pipeline`] carga automáticamente un modelo predeterminado y un tokenizador con capacidad de inferencia para tu tarea. - -1. Comienza creando un [`pipeline`] y específica una tarea de inferencia: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation") -``` - -2. Pasa tu texto de entrada al [`pipeline`]: - -```py ->>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] -``` - -Si tienes más de una entrada, pásala como una lista: - -```py ->>> generator( -... [ -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", -... ] -... ) -``` - -Cualquier parámetro adicional para tu tarea también se puede incluir en el [`pipeline`]. La tarea `text-generation` tiene un método [`~generation.GenerationMixin.generate`] con varios parámetros para controlar la salida. Por ejemplo, si deseas generar más de una salida, defínelo en el parámetro `num_return_sequences`: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... num_return_sequences=2, -... ) -``` - -### Selecciona un modelo y un tokenizador - -El [`pipeline`] acepta cualquier modelo del [Model Hub](https://huggingface.co/models). Hay etiquetas en el Model Hub que te permiten filtrar por el modelo que te gustaría utilizar para tu tarea. Una vez que hayas elegido un modelo apropiado, cárgalo con la clase `AutoModelFor` y [`AutoTokenizer`] correspondientes. 
Por ejemplo, carga la clase [`AutoModelForCausalLM`] para una tarea de modelado de lenguaje causal: - -```py ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Crea un [`pipeline`] para tu tarea y específica el modelo y el tokenizador que cargaste: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) -``` - -Pasa tu texto de entrada a [`pipeline`] para generar algo de texto: - -```py ->>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone") -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] -``` - -## Pipeline de audio - -La flexibilidad de [`pipeline`] significa que también se puede extender a tareas de audio. - -Por ejemplo, clasifiquemos la emoción de un breve fragmento del famoso discurso de John F. Kennedy ["We choose to go to the Moon"](https://en.wikipedia.org/wiki/We_choose_to_go_to_the_Moon). Encuentra un modelo de [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) para reconocimiento de emociones en el Model Hub y cárgalo en el [`pipeline`]: - -```py ->>> from transformers import pipeline - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -Pasa el archivo de audio al [`pipeline`]: - -```py ->>> audio_classifier("jfk_moon_speech.wav") -[{'label': 'calm', 'score': 0.13856211304664612}, - {'label': 'disgust', 'score': 0.13148026168346405}, - {'label': 'happy', 'score': 0.12635163962841034}, - {'label': 'angry', 'score': 0.12439591437578201}, - {'label': 'fearful', 'score': 0.12404385954141617}] -``` - -## Pipeline de visión - -Finalmente, utilizar un [`pipeline`] para tareas de visión es prácticamente igual. - -Específica tu tarea de visión y pasa tu imagen al clasificador. La imagen puede ser un enlace o una ruta local a la imagen. Por ejemplo, ¿qué especie de gato se muestra a continuación? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) -[{'label': 'lynx, catamount', 'score': 0.4403027892112732}, - {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', - 'score': 0.03433405980467796}, - {'label': 'snow leopard, ounce, Panthera uncia', - 'score': 0.032148055732250214}, - {'label': 'Egyptian cat', 'score': 0.02353910356760025}, - {'label': 'tiger cat', 'score': 0.023034192621707916}] -``` diff --git a/docs/source/es/pr_checks.md b/docs/source/es/pr_checks.md new file mode 100644 index 000000000000..ba67e85306d3 --- /dev/null +++ b/docs/source/es/pr_checks.md @@ -0,0 +1,132 @@ + + +# Verificaciones en un Pull Request + +Cuando abres un _pull request_ en 🤗 Transformers, se ejecutarán una serie de verificaciones para asegurarte de que el _patch_ que estás agregando no rompa nada existente. 
Estas verificaciones son de cuatro tipos:
+- pruebas regulares
+- creación de la documentación
+- estilo del código y documentación
+- consistencia del repositorio
+
+En este documento, intentaremos explicar cuáles son esas diferentes verificaciones y el motivo detrás de ellas, así como también cómo depurarlas localmente si una falla en tu PR.
+
+Recuerda que todas las verificaciones requieren que tengas una instalación de desarrollo:
+
+```bash
+pip install transformers[dev]
+```
+
+o una instalación editable:
+
+```bash
+pip install -e .[dev]
+```
+
+del repositorio de Transformers.
+
+## Pruebas
+
+Todos los procesos que comienzan con `ci/circleci: run_tests_` ejecutan partes del conjunto de pruebas de Transformers. Cada uno de esos procesos se enfoca en una parte de la biblioteca en un entorno determinado: por ejemplo, `ci/circleci: run_tests_pipelines_tf` ejecuta la prueba de _pipelines_ en un entorno donde solo está instalado TensorFlow.
+
+Ten en cuenta que, para evitar ejecutar pruebas cuando no hay un cambio real en los módulos que estás probando, solo se ejecuta una parte del conjunto de pruebas: se ejecuta una tarea auxiliar para determinar las diferencias en la biblioteca antes y después del PR (lo que GitHub te muestra en la pestaña "Files changes") y selecciona las pruebas afectadas por esa diferencia. Este auxiliar se puede ejecutar localmente usando:
+
+```bash
+python utils/tests_fetcher.py
+```
+
+desde el directorio raíz del repositorio de Transformers. Se ejecutará lo siguiente:
+
+1. Verificación, para cada archivo en el _diff_, de si los cambios están en el código o solo en comentarios o _docstrings_. Solo se conservan los archivos con cambios reales de código.
+2. Creación de un mapa interno que proporciona, para cada archivo del código fuente de la biblioteca, todos los archivos a los que impacta recursivamente. Se dice que el módulo A impacta al módulo B si el módulo B importa el módulo A. Para el impacto recursivo, necesitamos una cadena de módulos que va del módulo A al módulo B en la que cada módulo importa el anterior.
+3. Aplicación de este mapa a los archivos recopilados en el paso 1, lo que nos da la lista de archivos de la biblioteca afectados por el PR.
+4. Asignación de cada uno de esos archivos a sus archivos de prueba correspondientes, para obtener la lista de pruebas a ejecutar.
+
+Al ejecutar el _script_ localmente, debes obtener impresos los resultados de los pasos 1, 3 y 4 y así saber qué pruebas se ejecutarán. El _script_ también creará un archivo llamado `test_list.txt` que contiene la lista de pruebas para ejecutar, y puedes ejecutarlas localmente con el siguiente comando:
+
+```bash
+python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt)
+```
+
+En caso de que se te escape algo, el conjunto completo de pruebas también se ejecuta a diario.
+
+## Creación de la documentación
+
+El proceso `build_pr_documentation` compila y genera una vista previa de la documentación para asegurarse de que todo se vea bien una vez que se fusione tu PR. Un bot agregará un enlace para obtener una vista previa de la documentación en tu PR. Cualquier cambio que realices en el PR se actualiza automáticamente en la vista previa. Si la documentación no se genera, haz clic en **Detalles** junto al proceso fallido para ver dónde salió mal. A menudo, el error es tan simple como que falta un archivo en `toctree`.
+
+Si estás interesado en compilar u obtener una vista previa de la documentación localmente, echa un vistazo al [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) en la carpeta `docs`.
+
+## Estilo de código y documentación
+
+El formato de código se aplica a todos los archivos fuente, los ejemplos y las pruebas utilizando `black` y `ruff`. También tenemos una herramienta personalizada que se ocupa del formato de los _docstrings_ y archivos `rst` (`utils/style_doc.py`), así como del orden de las importaciones _lazy_ realizadas en los archivos `__init__.py` de Transformers (`utils/custom_init_isort.py`). Todo esto se puede probar ejecutando
+
+```bash
+make style
+```
+
+El CI verifica que se hayan aplicado dentro de la verificación `ci/circleci: check_code_quality`. También se ejecuta `ruff`, que hará una verificación básica a tu código y te hará saber si encuentra una variable no definida, o una que no se usa. Para ejecutar esa verificación localmente, usa
+
+```bash
+make quality
+```
+
+Esto puede llevar mucho tiempo, así que para ejecutar lo mismo solo en los archivos que modificaste en la rama actual, ejecuta
+
+```bash
+make fixup
+```
+
+Este último comando también ejecutará todas las verificaciones adicionales para la consistencia del repositorio. Echemos un vistazo a estas pruebas.
+
+## Consistencia del repositorio
+
+Esta verificación reagrupa todas las pruebas para asegurarse de que tu PR deja el repositorio en buen estado, y se realiza mediante `ci/circleci: check_repository_consistency`. Puedes ejecutar localmente esta verificación ejecutando lo siguiente:
+
+```bash
+make repo-consistency
+```
+
+Esta instrucción verifica que:
+
+- Todos los objetos agregados al _init_ están documentados (realizado por `utils/check_repo.py`)
+- Todos los archivos `__init__.py` tienen el mismo contenido en sus dos secciones (realizado por `utils/check_inits.py`)
+- Todo el código identificado como una copia de otro módulo es consistente con el original (realizado por `utils/check_copies.py`)
+- Todas las clases de configuración tienen al menos un _checkpoint_ válido mencionado en sus _docstrings_ (realizado por `utils/check_config_docstrings.py`)
+- Las traducciones de los README y el índice del documento tienen la misma lista de modelos que el README principal (realizado por `utils/check_copies.py`)
+- Las tablas generadas automáticamente en la documentación están actualizadas (realizado por `utils/check_table.py`)
+- La biblioteca tiene todos los objetos disponibles incluso si no están instaladas todas las dependencias opcionales (realizado por `utils/check_dummies.py`)
+
+Si esta verificación falla, los primeros dos elementos requieren una reparación manual y los últimos cuatro pueden repararse automáticamente ejecutando el comando
+
+```bash
+make fix-copies
+```
+
+Las verificaciones adicionales se refieren a los PRs que agregan nuevos modelos, principalmente que:
+
+- Todos los modelos agregados están en un Auto-mapping (realizado por `utils/check_repo.py`)
+
+- Todos los modelos se verifican correctamente (realizado por `utils/check_repo.py`)
+
+
diff --git a/docs/source/es/pr_checks.mdx b/docs/source/es/pr_checks.mdx
deleted file mode 100644
index b4ae0f1c7a12..000000000000
--- a/docs/source/es/pr_checks.mdx
+++ /dev/null
@@ -1,128 +0,0 @@
-
-
-# Verificaciones en un Pull Request
-
-Cuando abres un _pull request_ en 🤗 Transformers, se ejecutarán una serie de verificaciones para asegurarte de que el _patch_ que estás agregando no rompa nada existente.
Estas verificaciones son de cuatro tipos: -- pruebas regulares -- creación de la documentación -- estilo del código y documentación -- consistencia del repositorio - -En este documento, intentaremos explicar cuáles son esas diferentes verificaciones y el motivo detrás de ellas, así como también cómo depurarlas localmente si una falla en tu PR. - -Recuerda que todas las verificaciones requieren que tengas una instalación de desarrollo: - -```bash -pip install transformers[dev] -``` - -o una instalación editable: - -```bash -pip install -e .[dev] -``` - -del repositorio de Transformers. - -## Pruebas - -Todos los procesos que comienzan con `ci/circleci: run_tests_` ejecutan partes del conjunto de pruebas de Transformers. Cada uno de esos procesos se enfoca en una parte de la biblioteca en un entorno determinado: por ejemplo, `ci/circleci: run_tests_pipelines_tf` ejecuta la prueba de _pipelines_ en un entorno donde solo está instalado TensorFlow. - -Ten en cuenta que para evitar ejecutar pruebas cuando no hay un cambio real en los módulos que estás probando, solo se ejecuta una parte del conjunto de pruebas: se ejecuta una tarea auxiliar para determinar las diferencias en la biblioteca antes y después del PR (lo que GitHub te muestra en la pestaña "Files changes") y selecciona las pruebas afectadas por esa diferencia. Este auxiliar se puede ejecutar localmente usando: - -```bash -python utils/tests_fetcher.py -``` - -desde el directorio raiz del repositorio de Transformers. Se ejecutará lo siguiente: - -1. Verificación para cada archivo en el _diff_ si los cambios están en el código, solo en comentarios o _docstrings_. Solo los archivos con cambios reales de código se conservan. -2. Creación de un mapa interno que proporciona para cada archivo del código fuente de la biblioteca todos los archivos a los que impacta recursivamente. Se dice que el módulo A impacta al módulo B si el módulo B importa el módulo A. Para el impacto recursivo, necesitamos una cadena de módulos que va del módulo A al módulo B en la que cada módulo importa el anterior. -3. Aplicación de este mapa en los archivos recopilados en el paso 1, lo que nos da una lista de archivos modelo afectados por el PR. -4. Asignación de cada uno de esos archivos a sus archivos de prueba correspondientes y para obtener una la lista de pruebas a ejecutar. - -Al ejecutar el _script_ localmente, debes obtener los resultados de los pasos 1, 3 y 4 impresos y así saber qué pruebas se ejecutarán. El _script_ también creará un archivo llamado `test_list.txt` que contiene la lista de pruebas para ejecutar, y puede ejecutarlas localmente con el siguiente comando: - -```bash -python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) -``` - -En caso de que se te escape algo, el conjunto completo de pruebas también se ejecuta a diario. - -## Creación de la documentación - -El proceso `build_pr_documentation` compila y genera una vista previa de la documentación para asegurarse de que todo se vea bien una vez que se fusione tu PR. Un bot agregará un enlace para obtener una vista previa de la documentación en tu PR. Cualquier cambio que realices en el PR se actualiza automáticamente en la vista previa. Si la documentación no se genera, haz clic en **Detalles** junto al proceso fallido para ver dónde salió mal. A menudo, el error es tan simple como que falta un archivo en `toctree`. 
- -Si estás interesado en compilar u obtener una vista previa de la documentación localmente, echa un vistazo al [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) en la carpeta `docs`. - -## Estilo de código y documentación. - -El formato de código se aplica a todos los archivos fuente, los ejemplos y las pruebas utilizando `black` e `isort`. También tenemos una herramienta personalizada que se ocupa del formato de los _docstrings_ y archivos `rst` (`utils/style_doc.py`), así como del orden de las importaciones _lazy_ realizadas en los archivos `__init__.py` de Transformers (`utils /custom_init_isort.py`). Todo esto se puede probar ejecutando - -```bash -make style -``` - -CI verifica que se hayan aplicado dentro de la verificación `ci/circleci: check_code_quality`. También se ejecuta `flake8`, que hará una verificación básica a tu código y te hará saber si encuentra una variable no definida, o una que no se usa. Para ejecutar esa verificación localmente, usa - -```bash -make quality -``` - -Esto puede llevar mucho tiempo, así que para ejecutar lo mismo solo en los archivos que modificaste en la rama actual, ejecuta - -```bash -make fixup -``` - -Este último comando también ejecutará todas las verificaciones adicionales para la consistencia del repositorio. Echemos un vistazo a estas pruebas. - -## Consistencia del repositorio - -Esta verificación reagrupa todas las pruebas para asegurarse de que tu PR deja el repositorio en buen estado, y se realiza mediante `ci/circleci: check_repository_consistency`. Puedes ejecutar localmente esta verificación ejecutando lo siguiente: - -```bash -make repo-consistency -``` - -Esta instrucción verifica que: - -- Todos los objetos agregados al _init_ están documentados (realizados por `utils/check_repo.py`) -- Todos los archivos `__init__.py` tienen el mismo contenido en sus dos secciones (realizado por `utils/check_inits.py`) -- Todo el código identificado como una copia de otro módulo es consistente con el original (realizado por `utils/check_copies.py`) -- Todas las clases de configuración tienen al menos _checkpoint_ válido mencionado en sus _docstrings_ (realizado por `utils/check_config_docstrings.py`) -- Las traducciones de los README y el índice del documento tienen la misma lista de modelos que el README principal (realizado por `utils/check_copies.py`) -- Las tablas generadas automaticamente en la documentación están actualizadas (realizadas por `utils/check_table.py`) -- La biblioteca tiene todos los objetos disponibles incluso si no están instaladas todas las dependencias opcionales (realizadas por `utils/check_dummies.py`) - -Si esta verificación falla, los primeros dos elementos requieren una reparación manual, los últimos cuatro pueden repararse automáticamente ejecutando el comando - -```bash -make fix-copies -``` - -Las verificaciones adicionales se refieren a los PRs que agregan nuevos modelos, principalmente que: - -- Todos los modelos agregados están en un Auto-mapping (realizado por `utils/check_repo.py`) - -- Todos los modelos se verifican correctamente (realizados por `utils/check_repo.py`) - - diff --git a/docs/source/es/preprocessing.md b/docs/source/es/preprocessing.md new file mode 100644 index 000000000000..f4eec4862be8 --- /dev/null +++ b/docs/source/es/preprocessing.md @@ -0,0 +1,560 @@ + + +# Preprocesamiento + +[[open-in-colab]] + +Antes de que puedas utilizar los datos en un modelo, debes procesarlos en un formato aceptable para el modelo. 
Un modelo no entiende el texto en bruto, las imágenes o el audio. Estas entradas necesitan ser convertidas en números y ensambladas en tensores. En este tutorial, podrás: + +* Preprocesar los datos textuales con un tokenizador. +* Preprocesar datos de imagen o audio con un extractor de características. +* Preprocesar datos para una tarea multimodal con un procesador. + +## NLP + + + +La principal herramienta para procesar datos textuales es un [tokenizador](main_classes/tokenizer). Un tokenizador comienza dividiendo el texto en *tokens* según un conjunto de reglas. Los tokens se convierten en números, que se utilizan para construir tensores como entrada a un modelo. El tokenizador también añade cualquier entrada adicional que requiera el modelo. + + + +Si tienes previsto utilizar un modelo pre-entrenado, es importante que utilices el tokenizador pre-entrenado asociado. Esto te asegura que el texto se divide de la misma manera que el corpus de pre-entrenamiento y utiliza el mismo índice de tokens correspondiente (usualmente referido como el *vocab*) durante el pre-entrenamiento. + + + +Comienza rápidamente cargando un tokenizador pre-entrenado con la clase [`AutoTokenizer`]. Esto descarga el *vocab* utilizado cuando un modelo es pre-entrenado. + +### Tokenizar + +Carga un tokenizador pre-entrenado con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +A continuación, pasa tu frase al tokenizador: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +El tokenizador devuelve un diccionario con tres ítems importantes: + +* [input_ids](glossary#input-ids) son los índices correspondientes a cada token de la frase. +* [attention_mask](glossary#attention-mask) indica si un token debe ser atendido o no. +* [token_type_ids](glossary#token-type-ids) identifica a qué secuencia pertenece un token cuando hay más de una secuencia. + +Tu puedes decodificar el `input_ids` para devolver la entrada original: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +Como puedes ver, el tokenizador ha añadido dos tokens especiales - `CLS` y `SEP` (clasificador y separador) - a la frase. No todos los modelos necesitan +tokens especiales, pero si lo llegas a necesitar, el tokenizador los añadirá automáticamente. + +Si hay varias frases que quieres preprocesar, pasa las frases como una lista al tokenizador: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
]
+>>> encoded_inputs = tokenizer(batch_sentences)
+>>> print(encoded_inputs)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102],
+ [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+ [101, 1327, 1164, 5450, 23434, 136, 102]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1]]}
+```
+
+### Pad
+
+Esto nos lleva a un tema importante. Cuando se procesa un batch de frases, no siempre tienen la misma longitud. Esto es un problema porque los tensores que se introducen en el modelo deben tener una forma uniforme. El pad es una estrategia para asegurar que los tensores sean rectangulares: se añade un "padding token" especial a las oraciones con menos tokens.
+
+Establece el parámetro `padding` en `True` para aplicar el pad a las secuencias más cortas del batch, de modo que coincidan con la secuencia más larga:
+
+```py
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+ [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+ [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
+```
+
+Observa que el tokenizador ha aplicado el pad a la primera y la tercera frase con un "0" porque son más cortas.
+
+### Truncamiento
+
+En el otro extremo del espectro, a veces una secuencia puede ser demasiado larga para un modelo. En este caso, tendrás que truncar la secuencia a una longitud más corta.
+
+Establece el parámetro `truncation` a `True` para truncar una secuencia a la longitud máxima aceptada por el modelo:
+
+```py
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+ [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+ [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
+```
+
+### Construye tensores
+
+Finalmente, querrás que el tokenizador devuelva los tensores reales que se introducen en el modelo.
+
+Establece el parámetro `return_tensors` como `pt` para PyTorch, o `tf` para TensorFlow:
+
+```py
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+>>> print(encoded_input)
+{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102],
+ [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]),
+ 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
+===PT-TF-SPLIT===
+>>> batch_sentences = [
+... "But what about second breakfast?",
+... "Don't think he knows about second breakfast, Pip.",
+... "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+>>> print(encoded_input)
+{'input_ids': ,
+ 'token_type_ids': ,
+ 'attention_mask': }
+```
+
+## Audio
+
+Las entradas de audio se preprocesan de forma diferente a las entradas textuales, pero el objetivo final es el mismo: crear secuencias numéricas que el modelo pueda entender. Un [extractor de características](main_classes/feature_extractor) (o feature extractor en inglés) está diseñado para extraer características de datos provenientes de imágenes o audio sin procesar y convertirlos en tensores. Antes de empezar, instala 🤗 Datasets para cargar un dataset de audio con el que experimentar:
+
+```bash
+pip install datasets
+```
+
+Carga la tarea de detección de palabras clave del benchmark [SUPERB](https://huggingface.co/datasets/superb) (consulta el [tutorial de 🤗 Datasets](https://huggingface.co/docs/datasets/load_hub.html) para obtener más detalles sobre cómo cargar un dataset):
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("superb", "ks")
+```
+
+Accede al primer elemento de la columna `audio` para echar un vistazo a la entrada. Al llamar a la columna `audio` se cargará y volverá a muestrear automáticamente el archivo de audio:
+
+```py
+>>> dataset["train"][0]["audio"]
+{'array': array([ 0. , 0. , 0. , ..., -0.00592041,
+ -0.00405884, -0.00253296], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav',
+ 'sampling_rate': 16000}
+```
+
+Esto devuelve tres elementos:
+
+* `array` es la señal de voz cargada - y potencialmente remuestreada - como un array 1D.
+* `path` apunta a la ubicación del archivo de audio.
+* `sampling_rate` se refiere a cuántos puntos de datos de la señal de voz se miden por segundo.
+
+### Resample
+
+Para este tutorial, se utilizará el modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Como puedes ver en la model card, el modelo Wav2Vec2 está pre-entrenado en audio de voz muestreado a 16kHz. Es importante que la tasa de muestreo de tus datos de audio coincida con la tasa de muestreo del dataset utilizado para pre-entrenar el modelo. Si la tasa de muestreo de tus datos no es la misma, deberás volver a muestrear tus datos de audio.
+
+Por ejemplo, carga el dataset [LJ Speech](https://huggingface.co/datasets/lj_speech), que tiene una tasa de muestreo de 22050 Hz.
+Para utilizar el modelo Wav2Vec2 con este dataset, reduce la tasa de muestreo a 16kHz:
+
+```py
+>>> lj_speech = load_dataset("lj_speech", split="train")
+>>> lj_speech[0]["audio"]
+{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ...,
+ 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav',
+ 'sampling_rate': 22050}
+```
+
+1. Usa el método [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) de 🤗 Datasets para reducir la tasa de muestreo a 16kHz:
+
+```py
+>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))
+```
+
+2. Carga el archivo de audio:
+
+```py
+>>> lj_speech[0]["audio"]
+{'array': array([-0.00064146, -0.00074657, -0.00068768, ..., 0.00068341,
+ 0.00014045, 0. ], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav',
+ 'sampling_rate': 16000}
+```
+
+Como puedes ver, el `sampling_rate` se ha reducido a 16kHz. Ahora que sabes cómo funciona el resampling, volvamos a nuestro ejemplo anterior con el dataset SUPERB.
+
+### Extractor de características
+
+El siguiente paso es cargar un extractor de características para normalizar y aplicar el pad a la entrada. Cuando se aplica padding a los datos textuales, se añade un "0" para las secuencias más cortas. La misma idea se aplica a los datos de audio: el extractor de características de audio añadirá un "0" - interpretado como silencio - al "array".
+
+Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+Pasa el `array` de audio al extractor de características. También te recomendamos añadir el argumento `sampling_rate` en el extractor de características para poder depurar mejor los errores silenciosos que puedan producirse.
+
+```py
+>>> audio_input = [dataset["train"][0]["audio"]["array"]]
+>>> feature_extractor(audio_input, sampling_rate=16000)
+{'input_values': [array([ 0.00045439, 0.00045439, 0.00045439, ..., -0.1578519 , -0.10807519, -0.06727459], dtype=float32)]}
+```
+
+### Pad y truncamiento
+
+Al igual que con el tokenizador, puedes aplicar padding o truncamiento para manejar secuencias variables en un batch. Fíjate en la longitud de la secuencia de estas dos muestras de audio:
+
+```py
+>>> dataset["train"][0]["audio"]["array"].shape
+(1522930,)
+
+>>> dataset["train"][1]["audio"]["array"].shape
+(988891,)
+```
+
+Como puedes ver, la primera muestra tiene una secuencia más larga que la segunda. Crea una función que preprocese el dataset de modo que las muestras de audio tengan la misma longitud: especifica una longitud máxima de muestra y el extractor de características aplicará pad o truncará las secuencias para que coincidan con ella.
+
+```py
+>>> def preprocess_function(examples):
+... audio_arrays = [x["array"] for x in examples["audio"]]
+... inputs = feature_extractor(
+... audio_arrays,
+... sampling_rate=16000,
+... padding=True,
+... max_length=1000000,
+... truncation=True,
+... )
+...
return inputs +``` + +Aplica la función a los primeros ejemplos del dataset: + +```py +>>> processed_dataset = preprocess_function(dataset["train"][:5]) +``` + +Ahora echa un vistazo a las longitudes de las muestras procesadas: + +```py +>>> processed_dataset["input_values"][0].shape +(1000000,) + +>>> processed_dataset["input_values"][1].shape +(1000000,) +``` + +Las longitudes de las dos primeras muestras coinciden ahora con la longitud máxima especificada. + +## Visión + +También se utiliza un extractor de características para procesar imágenes para tareas de visión por computadora. Una vez más, el objetivo es convertir la imagen en bruto en un batch de tensores como entrada. + +Vamos a cargar el dataset [food101](https://huggingface.co/datasets/food101) para este tutorial. Usa el parámetro 🤗 Datasets `split` para cargar solo una pequeña muestra de la división de entrenamiento ya que el dataset es bastante grande: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +A continuación, observa la imagen con la función 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image): + +```py +>>> dataset[0]["image"] +``` + +![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) + +### Extractor de características + +Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") +``` + +### Aumento de Datos + +Para las tareas de visión por computadora es común añadir algún tipo de aumento de datos (o data augmentation) a las imágenes como parte del preprocesamiento. Puedes añadir el método de aumento de datos con cualquier librería que quieras, pero en este tutorial utilizarás el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. + +1. Normaliza la imagen y utiliza [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) para encadenar algunas transformaciones - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) y [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - juntas: + +```py +>>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor + +>>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) +>>> _transforms = Compose( +... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] +... ) +``` + +2. El modelo acepta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) como entrada. Este valor es generado por el extractor de características. Crea una función que genere `pixel_values` a partir de las transformaciones: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] +... return examples +``` + +3. A continuación, utiliza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) para aplicar las transformaciones sobre la marcha: + +```py +>>> dataset.set_transform(transforms) +``` + +4. 
Ahora, cuando accedes a la imagen, observarás que el extractor de características ha añadido a la entrada del modelo `pixel_values`: + +```py +>>> dataset[0]["image"] +{'image': , + 'label': 6, + 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], + [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], + [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], + ..., + [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], + [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], + [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], + + [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], + [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], + [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], + ..., + [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], + [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], + [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], + + [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], + [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], + [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], + ..., + [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], + [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], + [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} +``` + +Este es el aspecto de la imagen después de preprocesarla. Como era de esperar por las transformaciones aplicadas, la imagen ha sido recortada aleatoriamente y sus propiedades de color son diferentes. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) + +## Multimodal + +Para las tareas multimodales utilizarás una combinación de todo lo que has aprendido hasta ahora y aplicarás tus habilidades a una tarea de reconocimiento automático de voz (ASR). Esto significa que necesitarás un: + +* Extractor de características para preprocesar los datos de audio. +* Un tokenizador para procesar el texto. 
+ +Volvamos al dataset [LJ Speech](https://huggingface.co/datasets/lj_speech): + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +Suponiendo que te interesan principalmente las columnas `audio` y `texto`, elimina las demás columnas: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Ahora echa un vistazo a las columnas `audio` y `texto`: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Recuerda la sección anterior sobre el procesamiento de datos de audio, siempre debes [volver a muestrear](preprocessing#audio) la tasa de muestreo de tus datos de audio para que coincida con la tasa de muestreo del dataset utilizado para preentrenar un modelo: + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +### Processor + +Un processor combina un extractor de características y un tokenizador. Cargue un procesador con [`AutoProcessor.from_pretrained]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Crea una función para procesar los datos de audio en `input_values`, y tokeniza el texto en `labels`. Estas son las entradas del modelo: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Aplica la función `prepare_dataset` a una muestra: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +Observa que el método processor ha añadido `input_values` y `labels`. La tasa de muestreo también se ha reducido correctamente a 16kHz. + +Genial, ahora deberías ser capaz de preprocesar datos para cualquier modalidad e incluso combinar diferentes modalidades. En el siguiente tutorial, aprenderás aplicar fine tuning a un modelo en tus datos recién preprocesados. + +## Todo lo que siempre quisiste saber sobre el padding y el truncamiento + +Hemos visto los comandos que funcionarán para la mayoría de los casos (hacer pad a tu batch teniendo en cuenta la longitud de la frase máxima y +truncar a la longitud máxima que el modelo puede aceptar). Sin embargo, la API admite más estrategias si las necesitas. Los +tres argumentos que necesitas conocer para ello son `padding`, `truncation` y `max_length`. + +- `padding` controla el aplicarme padding al texto. Puede ser un booleano o una cadena que debe ser: + + - `True` o `'longest'` para aplicar el pad hasta la secuencia más larga del batch (no apliques el padding si sólo le proporcionas + una sola secuencia). + - `'max_length'` para aplicar el pad hasta la longitud especificada por el argumento `max_length` o la longitud máxima aceptada + por el modelo si no le proporcionas `longitud_máxima` (`longitud_máxima=None`). Si sólo le proporcionas una única secuencia + se le aplicará el padding. + `False` o `'do_not_pad'` para no aplicar pad a las secuencias. 
+
+- `truncation` controla el truncamiento. Puede ser un booleano o una cadena que debe ser:
+
+  - `True` o `'longest_first'` truncan hasta la longitud máxima especificada por el argumento `max_length` o
+    la longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto
+    truncará token por token, eliminando un token de la secuencia más larga del par hasta alcanzar la longitud
+    adecuada.
+  - `'only_second'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la
+    longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto sólo truncará
+    la segunda frase de un par si le proporcionas un par de secuencias (o un batch de pares de secuencias).
+  - `'only_first'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la longitud máxima
+    aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto sólo truncará
+    la primera frase de un par si se proporciona un par de secuencias (o un batch de pares de secuencias).
+  - `False` o `'do_not_truncate'` para no truncar las secuencias. Como hemos visto antes, este es el comportamiento
+    por defecto.
+
+- `max_length` controla la longitud del padding/truncamiento. Puede ser un número entero o `None`, en cuyo caso
+será por defecto la longitud máxima que el modelo puede aceptar. Si el modelo no tiene una longitud máxima de entrada específica, el
+padding/truncamiento a `max_length` se desactiva.
+
+A continuación, te mostramos un breve ejemplo y una tabla que resume la forma recomendada de configurar el padding y el truncamiento. Si utilizas un par de secuencias de entrada en
+algunos de los siguientes ejemplos, puedes sustituir `truncation=True` por una `STRATEGY` seleccionada entre
+`['only_first', 'only_second', 'longest_first']`, es decir, `truncation='only_second'` o `truncation='longest_first'`, para controlar cómo se truncan ambas secuencias del par, como se ha detallado anteriormente.
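+
+Por ejemplo, este pequeño fragmento (un boceto meramente ilustrativo que asume el tokenizador de `bert-base-cased` usado antes en esta guía; las frases son arbitrarias) muestra cómo `truncation='only_second'` recorta únicamente la segunda secuencia de un par:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> pregunta = "What about second breakfast?"
+>>> contexto = "Don't think he knows about second breakfast, Pip. What about elevensies?"
+
+>>> # con truncation="only_second" sólo se recorta el contexto (la segunda secuencia) hasta max_length
+>>> recortado = tokenizer(pregunta, contexto, truncation="only_second", max_length=16)
+>>> # el resultado no debería superar los 16 tokens y la pregunta (primera secuencia) queda intacta
+>>> sin_recortar = tokenizer(pregunta, contexto)  # comparación sin truncamiento
+```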
+ +| Truncation | Padding | Instrucciones | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| no truncation | no padding | `tokenizer(batch_sentences)` | +| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True)` or | +| | | `tokenizer(batch_sentences, padding='longest')` | +| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length')` | +| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | +| truncation long max del input model | no padding | `tokenizer(batch_sentences, truncation=True)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | +| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | +| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | +| | padding a una long especifica | Not possible | +| truncation a una long especifica | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | +| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | +| | padding long max de input model | Not possible | +| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | +| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | + + + + + + + + diff --git a/docs/source/es/preprocessing.mdx b/docs/source/es/preprocessing.mdx deleted file mode 100644 index 869f90c41773..000000000000 --- a/docs/source/es/preprocessing.mdx +++ /dev/null @@ -1,556 +0,0 @@ - - -# Preprocesamiento - -[[open-in-colab]] - -Antes de que puedas utilizar los datos en un modelo, debes procesarlos en un formato aceptable para el modelo. Un modelo no entiende el texto en bruto, las imágenes o el audio. Estas entradas necesitan ser convertidas en números y ensambladas en tensores. En este tutorial, podrás: - -* Preprocesar los datos textuales con un tokenizador. -* Preprocesar datos de imagen o audio con un extractor de características. -* Preprocesar datos para una tarea multimodal con un procesador. - -## NLP - - - -La principal herramienta para procesar datos textuales es un [tokenizador](main_classes/tokenizer). Un tokenizador comienza dividiendo el texto en *tokens* según un conjunto de reglas. Los tokens se convierten en números, que se utilizan para construir tensores como entrada a un modelo. El tokenizador también añade cualquier entrada adicional que requiera el modelo. - - - -Si tienes previsto utilizar un modelo pre-entrenado, es importante que utilices el tokenizador pre-entrenado asociado. Esto te asegura que el texto se divide de la misma manera que el corpus de pre-entrenamiento y utiliza el mismo índice de tokens correspondiente (usualmente referido como el *vocab*) durante el pre-entrenamiento. - - - -Comienza rápidamente cargando un tokenizador pre-entrenado con la clase [`AutoTokenizer`]. Esto descarga el *vocab* utilizado cuando un modelo es pre-entrenado. 
- -### Tokenizar - -Carga un tokenizador pre-entrenado con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -A continuación, pasa tu frase al tokenizador: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -El tokenizador devuelve un diccionario con tres ítems importantes: - -* [input_ids](glossary#input-ids) son los índices correspondientes a cada token de la frase. -* [attention_mask](glossary#attention-mask) indica si un token debe ser atendido o no. -* [token_type_ids](glossary#token-type-ids) identifica a qué secuencia pertenece un token cuando hay más de una secuencia. - -Tu puedes decodificar el `input_ids` para devolver la entrada original: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' -``` - -Como puedes ver, el tokenizador ha añadido dos tokens especiales - `CLS` y `SEP` (clasificador y separador) - a la frase. No todos los modelos necesitan -tokens especiales, pero si lo llegas a necesitar, el tokenizador los añadirá automáticamente. - -Si hay varias frases que quieres preprocesar, pasa las frases como una lista al tokenizador: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Esto nos lleva a un tema importante. Cuando se procesa un batch de frases, no siempre tienen la misma longitud. Esto es un problema porque los tensores que se introducen en el modelo deben tener una forma uniforme. El pad es una estrategia para asegurar que los tensores sean rectangulares añadiendo un "padding token" especial a las oraciones con menos tokens. - -Establece el parámetro `padding` en `True` aplicando el pad a las secuencias más cortas del batch para que coincidan con la secuencia más larga: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -Observa que el tokenizador ha aplicado el pad a la primera y la tercera frase con un "0" porque son más cortas. - -### Truncamiento - -En el otro extremo del espectro, a veces una secuencia puede ser demasiado larga para un modelo. En este caso, tendrás que truncar la secuencia a una longitud más corta. - -Establece el parámetro `truncation` a `True` para truncar una secuencia a la longitud máxima aceptada por el modelo: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -### Construye tensores - -Finalmente, si quieres que el tokenizador devuelva los tensores reales que se introducen en el modelo. - -Establece el parámetro `return_tensors` como `pt` para PyTorch, o `tf` para TensorFlow: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], - [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 0]])} -===PT-TF-SPLIT=== ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - -## Audio - -Las entradas de audio se preprocesan de forma diferente a las entradas textuales, pero el objetivo final es el mismo: crear secuencias numéricas que el modelo pueda entender. 
Un [extractor de características](main_classes/feature_extractor) (o feature extractor en inglés) está diseñado para extraer características de datos provenientes de imágenes o audio sin procesar y convertirlos en tensores. Antes de empezar, instala 🤗 Datasets para cargar un dataset de audio para experimentar: - -```bash -pip install datasets -``` - -Carga la tarea de detección de palabras clave del benchmark [SUPERB](https://huggingface.co/datasets/superb) (consulta el [tutorial 🤗 Dataset](https://huggingface.co/docs/datasets/load_hub.html) para que obtengas más detalles sobre cómo cargar un dataset): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("superb", "ks") -``` - -Accede al primer elemento de la columna `audio` para echar un vistazo a la entrada. Al llamar a la columna `audio` se cargará y volverá a muestrear automáticamente el archivo de audio: - -```py ->>> dataset["train"][0]["audio"] -{'array': array([ 0. , 0. , 0. , ..., -0.00592041, - -0.00405884, -0.00253296], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/05734a36d88019a09725c20cc024e1c4e7982e37d7d55c0c1ca1742ea1cdd47f/_background_noise_/doing_the_dishes.wav', - 'sampling_rate': 16000} -``` - -Esto devuelve tres elementos: - -* `array` es la señal de voz cargada - y potencialmente remuestreada - como un array 1D. -* `path` apunta a la ubicación del archivo de audio. -* `sampling_rate` se refiere a cuántos puntos de datos de la señal de voz se miden por segundo. - -### Resample - -Para este tutorial, se utilizará el modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Como puedes ver en la model card, el modelo Wav2Vec2 está pre-entrenado en audio de voz muestreado a 16kHz. Es importante que la tasa de muestreo de tus datos de audio coincida con la tasa de muestreo del dataset utilizado para pre-entrenar el modelo. Si la tasa de muestreo de tus datos no es la misma, deberás volver a muestrear tus datos de audio. - -Por ejemplo, carga el dataset [LJ Speech](https://huggingface.co/datasets/lj_speech) que tiene una tasa de muestreo de 22050kHz. Para utilizar el modelo Wav2Vec2 con este dataset, reduce la tasa de muestreo a 16kHz: - -```py ->>> lj_speech = load_dataset("lj_speech", split="train") ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} -``` - -1. Usa el método 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) para reducir la tasa de muestreo a 16kHz: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Carga el archivo de audio: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-0.00064146, -0.00074657, -0.00068768, ..., 0.00068341, - 0.00014045, 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 16000} -``` - -Como puedes ver, el `sampling_rate` se ha reducido a 16kHz. Ahora que sabes cómo funciona el resampling, volvamos a nuestro ejemplo anterior con el dataset SUPERB. 
- -### Extractor de características - -El siguiente paso es cargar un extractor de características para normalizar y aplicar el pad a la entrada. Cuando se aplica padding a los datos textuales, se añade un "0" para las secuencias más cortas. La misma idea se aplica a los datos de audio y el extractor de características de audio añadirá un "0" - interpretado como silencio - al "array". - -Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Pasa el `array` de audio al extractor de características. También te recomendamos añadir el argumento `sampling_rate` en el extractor de características para poder depurar mejor los errores silenciosos que puedan producirse. - -```py ->>> audio_input = [dataset["train"][0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 0.00045439, 0.00045439, 0.00045439, ..., -0.1578519 , -0.10807519, -0.06727459], dtype=float32)]} -``` - -### Pad y truncamiento - -Al igual que el tokenizador, puedes aplicar padding o truncamiento para manejar secuencias variables en un batch. Fíjate en la longitud de la secuencia de estas dos muestras de audio: - -```py ->>> dataset["train"][0]["audio"]["array"].shape -(1522930,) - ->>> dataset["train"][1]["audio"]["array"].shape -(988891,) -``` - -Como puedes ver, el `sampling_rate` se ha reducido a 16kHz. - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=1000000, -... truncation=True, -... ) -... return inputs -``` - -Aplica la función a los primeros ejemplos del dataset: - -```py ->>> processed_dataset = preprocess_function(dataset["train"][:5]) -``` - -Ahora echa un vistazo a las longitudes de las muestras procesadas: - -```py ->>> processed_dataset["input_values"][0].shape -(1000000,) - ->>> processed_dataset["input_values"][1].shape -(1000000,) -``` - -Las longitudes de las dos primeras muestras coinciden ahora con la longitud máxima especificada. - -## Visión - -También se utiliza un extractor de características para procesar imágenes para tareas de visión por computadora. Una vez más, el objetivo es convertir la imagen en bruto en un batch de tensores como entrada. - -Vamos a cargar el dataset [food101](https://huggingface.co/datasets/food101) para este tutorial. 
Usa el parámetro 🤗 Datasets `split` para cargar solo una pequeña muestra de la división de entrenamiento ya que el dataset es bastante grande: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -A continuación, observa la imagen con la función 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image): - -```py ->>> dataset[0]["image"] -``` - -![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) - -### Extractor de características - -Carga el extractor de características con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") -``` - -### Aumento de Datos - -Para las tareas de visión por computadora es común añadir algún tipo de aumento de datos (o data augmentation) a las imágenes como parte del preprocesamiento. Puedes añadir el método de aumento de datos con cualquier librería que quieras, pero en este tutorial utilizarás el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. - -1. Normaliza la imagen y utiliza [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) para encadenar algunas transformaciones - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) y [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - juntas: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose( -... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] -... ) -``` - -2. El modelo acepta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) como entrada. Este valor es generado por el extractor de características. Crea una función que genere `pixel_values` a partir de las transformaciones: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. A continuación, utiliza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) para aplicar las transformaciones sobre la marcha: - -```py ->>> dataset.set_transform(transforms) -``` - -4. 
Ahora, cuando accedes a la imagen, observarás que el extractor de características ha añadido a la entrada del modelo `pixel_values`: - -```py ->>> dataset[0]["image"] -{'image': , - 'label': 6, - 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], - [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], - [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], - ..., - [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], - [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], - [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], - - [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], - [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], - [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], - ..., - [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], - [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], - [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], - - [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], - [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], - [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], - ..., - [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], - [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], - [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} -``` - -Este es el aspecto de la imagen después de preprocesarla. Como era de esperar por las transformaciones aplicadas, la imagen ha sido recortada aleatoriamente y sus propiedades de color son diferentes. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) - -## Multimodal - -Para las tareas multimodales utilizarás una combinación de todo lo que has aprendido hasta ahora y aplicarás tus habilidades a una tarea de reconocimiento automático de voz (ASR). Esto significa que necesitarás un: - -* Extractor de características para preprocesar los datos de audio. -* Un tokenizador para procesar el texto. 
- -Volvamos al dataset [LJ Speech](https://huggingface.co/datasets/lj_speech): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -Suponiendo que te interesan principalmente las columnas `audio` y `texto`, elimina las demás columnas: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Ahora echa un vistazo a las columnas `audio` y `texto`: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Recuerda la sección anterior sobre el procesamiento de datos de audio, siempre debes [volver a muestrear](preprocessing#audio) la tasa de muestreo de tus datos de audio para que coincida con la tasa de muestreo del dataset utilizado para preentrenar un modelo: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -### Processor - -Un processor combina un extractor de características y un tokenizador. Cargue un procesador con [`AutoProcessor.from_pretrained]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Crea una función para procesar los datos de audio en `input_values`, y tokeniza el texto en `labels`. Estas son las entradas del modelo: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Aplica la función `prepare_dataset` a una muestra: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -Observa que el método processor ha añadido `input_values` y `labels`. La tasa de muestreo también se ha reducido correctamente a 16kHz. - -Genial, ahora deberías ser capaz de preprocesar datos para cualquier modalidad e incluso combinar diferentes modalidades. En el siguiente tutorial, aprenderás aplicar fine tuning a un modelo en tus datos recién preprocesados. - -## Todo lo que siempre quisiste saber sobre el padding y el truncamiento - -Hemos visto los comandos que funcionarán para la mayoría de los casos (hacer pad a tu batch teniendo en cuenta la longitud de la frase máxima y -truncar a la longitud máxima que el modelo puede aceptar). Sin embargo, la API admite más estrategias si las necesitas. Los -tres argumentos que necesitas conocer para ello son `padding`, `truncation` y `max_length`. - -- `padding` controla el aplicarme padding al texto. Puede ser un booleano o una cadena que debe ser: - - - `True` o `'longest'` para aplicar el pad hasta la secuencia más larga del batch (no apliques el padding si sólo le proporcionas - una sola secuencia). - - `'max_length'` para aplicar el pad hasta la longitud especificada por el argumento `max_length` o la longitud máxima aceptada - por el modelo si no le proporcionas `longitud_máxima` (`longitud_máxima=None`). Si sólo le proporcionas una única secuencia - se le aplicará el padding. - `False` o `'do_not_pad'` para no aplicar pad a las secuencias. 
Como hemos visto antes, este es el comportamiento por - defecto. - -- `truncation` controla el truncamiento. Puede ser un booleano o una string que debe ser: - - - `True` o `'longest_first'` truncan hasta la longitud máxima especificada por el argumento `max_length` o - la longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto - truncará token por token, eliminando un token de la secuencia más larga del par hasta alcanzar la longitud - adecuada. - - `'only_second'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la - longitud máxima aceptada por el modelo si no le proporcionas `max_length` (`max_length=None`). Esto sólo truncará - la segunda frase de un par si le proporcionas un par de secuencias (o un batch de pares de secuencias). - - `'only_first'` trunca hasta la longitud máxima especificada por el argumento `max_length` o la longitud máxima - aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto sólo truncará - la primera frase de un par si se proporciona un par de secuencias (o un lote de pares de secuencias). - - `False` o `'do_not_truncate'` para no truncar las secuencias. Como hemos visto antes, este es el comportamiento - por defecto. - -- `max_length` para controlar la longitud del padding/truncamiento. Puede ser un número entero o `None`, en cuyo caso -será por defecto la longitud máxima que el modelo puede aceptar. Si el modelo no tiene una longitud máxima de entrada específica, el -padding/truncamiento a `longitud_máxima` se desactiva. - -A continuación te mostramos en una tabla que resume la forma recomendada de configurar el padding y el truncamiento. Si utilizas un par de secuencias de entrada en -algunos de los siguientes ejemplos, puedes sustituir `truncation=True` por una `STRATEGY` seleccionada en -`['only_first', 'only_second', 'longest_first']`, es decir, `truncation='only_second'` o `truncation= 'longest_first'` para controlar cómo se truncan ambas secuencias del par como se ha detallado anteriormente. 
- -| Truncation | Padding | Instrucciones | -|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| -| no truncation | no padding | `tokenizer(batch_sentences)` | -| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True)` or | -| | | `tokenizer(batch_sentences, padding='longest')` | -| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length')` | -| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', max_length=42)` | -| truncation long max del input model | no padding | `tokenizer(batch_sentences, truncation=True)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY)` | -| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY)` | -| | padding long max de input model | `tokenizer(batch_sentences, padding='max_length', truncation=True)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)` | -| | padding a una long especifica | Not possible | -| truncation a una long especifica | no padding | `tokenizer(batch_sentences, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)` | -| | padding secuencia max del batch | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)` | -| | padding long max de input model | Not possible | -| | padding a una long especifica | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or | -| | | `tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` | - - - - - - - - diff --git a/docs/source/es/quicktour.md b/docs/source/es/quicktour.md new file mode 100644 index 000000000000..ad2549ef450b --- /dev/null +++ b/docs/source/es/quicktour.md @@ -0,0 +1,401 @@ + + +# Tour rápido + +[[open-in-colab]] + +¡Entra en marcha con los 🤗 Transformers! Comienza usando [`pipeline`] para una inferencia veloz, carga un modelo preentrenado y un tokenizador con una [AutoClass](./model_doc/auto) para resolver tu tarea de texto, visión o audio. + + + +Todos los ejemplos de código presentados en la documentación tienen un botón arriba a la derecha para elegir si quieres ocultar o mostrar el código en Pytorch o TensorFlow. +Si no fuese así, se espera que el código funcione para ambos backends sin ningún cambio. + + + +## Pipeline + +[`pipeline`] es la forma más fácil de usar un modelo preentrenado para una tarea dada. + + + +El [`pipeline`] soporta muchas tareas comunes listas para usar: + +**Texto**: +* Análisis de Sentimiento (Sentiment Analysis, en inglés): clasifica la polaridad de un texto dado. +* Generación de Texto (Text Generation, en inglés): genera texto a partir de un input dado. +* Reconocimiento de Entidades (Name Entity Recognition o NER, en inglés): etiqueta cada palabra con la entidad que representa (persona, fecha, ubicación, etc.). +* Responder Preguntas (Question answering, en inglés): extrae la respuesta del contexto dado un contexto y una pregunta. +* Rellenar Máscara (Fill-mask, en inglés): rellena el espacio faltante dado un texto con palabras enmascaradas. +* Resumir (Summarization, en inglés): genera un resumen de una secuencia larga de texto o un documento. 
+* Traducción (Translation, en inglés): traduce un texto a otro idioma. +* Extracción de Características (Feature Extraction, en inglés): crea una representación tensorial del texto. + +**Imagen**: +* Clasificación de Imágenes (Image Classification, en inglés): clasifica una imagen. +* Segmentación de Imágenes (Image Segmentation, en inglés): clasifica cada pixel de una imagen. +* Detección de Objetos (Object Detection, en inglés): detecta objetos dentro de una imagen. + +**Audio**: +* Clasificación de Audios (Audio Classification, en inglés): asigna una etiqueta a un segmento de audio. +* Reconocimiento de Voz Automático (Automatic Speech Recognition o ASR, en inglés): transcribe datos de audio a un texto. + + + +Para más detalles acerca del [`pipeline`] y tareas asociadas, consulta la documentación [aquí](./main_classes/pipelines). + + + +### Uso del Pipeline + +En el siguiente ejemplo, usarás el [`pipeline`] para análisis de sentimiento. + +Instala las siguientes dependencias si aún no lo has hecho: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +Importa [`pipeline`] y especifica la tarea que deseas completar: + +```py +>>> from transformers import pipeline + +>>> clasificador = pipeline("sentiment-analysis", model="pysentimiento/robertuito-sentiment-analysis") +``` + +El pipeline descarga y almacena en caché el [modelo preentrenado](https://huggingface.co/pysentimiento/robertuito-sentiment-analysis) y tokeniza para análisis de sentimiento. Si no hubieramos elegido un modelo el pipeline habría elegido uno por defecto. Ahora puedes usar `clasificador` en tu texto objetivo: + +```py +>>> clasificador("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") +[{'label': 'POS', 'score': 0.9320}] +``` + +Para más de un enunciado, entrega una lista al [`pipeline`] que devolverá una lista de diccionarios: + +El [`pipeline`] también puede iterar sobre un dataset entero. Comienza instalando la biblioteca [🤗 Datasets](https://huggingface.co/docs/datasets/): + +```bash +pip install datasets +``` + +Crea un [`pipeline`] con la tarea que deseas resolver y el modelo que quieres usar. Coloca el parámetro `device` a `0` para poner los tensores en un dispositivo CUDA: + +```py +>>> import torch +>>> from transformers import pipeline + +>>> reconocedor_de_voz = pipeline( +... "automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish", device=0 +... ) +``` + +A continuación, carga el dataset (ve 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) para más detalles) sobre el que quisieras iterar. Por ejemplo, vamos a cargar el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="es-ES", split="train") # doctest: +IGNORE_RESULT +``` + +Debemos asegurarnos de que la frecuencia de muestreo del conjunto de datos coincide con la frecuencia de muestreo con la que se entrenó `jonatasgrosman/wav2vec2-large-xlsr-53-spanish`. + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=reconocedor_de_voz.feature_extractor.sampling_rate)) +``` + +Los archivos de audio se cargan y remuestrean automáticamente cuando llamamos a la columna `"audio"`. 
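+
+Si quieres comprobarlo (un paso opcional y meramente ilustrativo), inspecciona la frecuencia de muestreo de una muestra ya convertida:
+
+```py
+>>> frecuencia_de_muestreo = dataset[0]["audio"]["sampling_rate"]
+>>> # tras `cast_column`, debería coincidir con reconocedor_de_voz.feature_extractor.sampling_rate (16 kHz para Wav2Vec2)
+```
+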
+Extraigamos las matrices de onda cruda (raw waveform, en inglés) de las primeras 4 muestras y pasémosla como una lista al pipeline: + +```py +>>> resultado = reconocedor_de_voz(dataset[:4]["audio"]) +>>> print([d["text"] for d in resultado]) +['ahora buenas eh a ver tengo un problema con vuestra aplicación resulta que que quiero hacer una transferencia bancaria a una cuenta conocida pero me da error la aplicación a ver que a ver que puede ser', 'la aplicación no cargue saldo de mi nueva cuenta', 'hola tengo un problema con la aplicación no carga y y tampoco veo que carga el saldo de mi cuenta nueva dice que la aplicación está siendo reparada y ahora no puedo acceder a mi cuenta no necesito inmediatamente', 'hora buena la aplicación no se carga la vida no carga el saldo de mi cuenta nueva dice que la villadenta siendo reparada y oro no puedo hacer a mi cuenta'] +``` + +Para un dataset más grande, donde los inputs son de mayor tamaño (como en habla/audio o visión), querrás pasar un generador en lugar de una lista que carga todos los inputs en memoria. Ve la [documentación del pipeline](./main_classes/pipelines) para más información. + +### Usa otro modelo y otro tokenizador en el pipeline + +El [`pipeline`] puede acomodarse a cualquier modelo del [Model Hub](https://huggingface.co/models) haciendo más fácil adaptar el [`pipeline`] para otros casos de uso. Por ejemplo, si quisieras un modelo capaz de manejar texto en francés, usa los tags en el Model Hub para filtrar entre los modelos apropiados. El resultado mejor filtrado devuelve un [modelo BERT](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) multilingual fine-tuned para el análisis de sentimiento. Genial, ¡vamos a usar este modelo! + +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Usa [`AutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `AutoClass` debajo): + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + + +Usa [`TFAutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `TFAutoClass` debajo): + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + + +Después puedes especificar el modelo y el tokenizador en el [`pipeline`], y aplicar el `classifier` en tu texto objetivo: + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Si no pudieras encontrar el modelo para tu caso respectivo de uso necesitarás ajustar un modelo preentrenado a tus datos. Mira nuestro [tutorial de fine-tuning](./training) para aprender cómo. Finalmente, después de que has ajustado tu modelo preentrenado, ¡por favor considera compartirlo (ve el tutorial [aquí](./model_sharing)) con la comunidad en el Model Hub para democratizar el NLP! 🤗 + +## AutoClass + + + +Por debajo, las clases [`AutoModelForSequenceClassification`] y [`AutoTokenizer`] trabajan juntas para dar poder al [`pipeline`]. 
Una [AutoClass](./model_doc/auto) es un atajo que automáticamente recupera la arquitectura de un modelo preentrenado con su nombre o el path. Sólo necesitarás seleccionar el `AutoClass` apropiado para tu tarea y tu tokenizador asociado con [`AutoTokenizer`]. + +Regresemos a nuestro ejemplo y veamos cómo puedes usar el `AutoClass` para reproducir los resultados del [`pipeline`]. + +### AutoTokenizer + +Un tokenizador es responsable de procesar el texto a un formato que sea entendible para el modelo. Primero, el tokenizador separará el texto en palabras llamadas *tokens*. Hay múltiples reglas que gobiernan el proceso de tokenización incluyendo el cómo separar una palabra y en qué nivel (aprende más sobre tokenización [aquí](./tokenizer_summary)). Lo más importante es recordar que necesitarás instanciar el tokenizador con el mismo nombre del modelo para asegurar que estás usando las mismas reglas de tokenización con las que el modelo fue preentrenado. + +Carga un tokenizador con [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> nombre_del_modelo = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(nombre_del_modelo) +``` + +Después, el tokenizador convierte los tokens a números para construir un tensor que servirá como input para el modelo. Esto es conocido como el *vocabulario* del modelo. + +Pasa tu texto al tokenizador: + +```py +>>> encoding = tokenizer("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") +>>> print(encoding) +{'input_ids': [101, 10602, 14000, 13653, 43353, 10107, 10102, 47201, 10218, 10106, 18283, 10102, 100, 58263, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +El tokenizador devolverá un diccionario conteniendo: + +* [input_ids](./glossary#input-ids): representaciones numéricas de los tokens. +* [atttention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos. + +Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme: + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + +Lee el tutorial de [preprocessing](./preprocessing) para más detalles acerca de la tokenización. + +### AutoModel + + + +🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`AutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`AutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] deberías usar para cada tarea. 
+ + + +Ahora puedes pasar tu lote (batch) preprocesado de inputs directamente al modelo. Solo tienes que desempacar el diccionario añadiendo `**`: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`TFAutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`TFAutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] + deberías usar para cada tarea. + + +Ahora puedes pasar tu lote preprocesado de inputs directamente al modelo pasando las llaves del diccionario directamente a los tensores: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> print(tf.math.round(tf_predictions * 10**4) / 10**4) +tf.Tensor( +[[0.0021 0.0018 0.0116 0.2121 0.7725] + [0.2084 0.1826 0.1969 0.1755 0.2365]], shape=(2, 5), dtype=float32) +``` + + + + + +Todos los modelos de 🤗 Transformers (PyTorch o TensorFlow) producirán los tensores *antes* de la función de activación +final (como softmax) porque la función de activación final es comúnmente fusionada con la pérdida. + + + +Los modelos son [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) estándares así que podrás usarlos en tu training loop usual. Sin embargo, para facilitar las cosas, 🤗 Transformers provee una clase [`Trainer`] para PyTorch que añade funcionalidades para entrenamiento distribuido, precición mixta, y más. Para TensorFlow, puedes usar el método `fit` desde [Keras](https://keras.io/). Consulta el [tutorial de entrenamiento](./training) para más detalles. + + + +Los outputs del modelo de 🤗 Transformers son dataclasses especiales por lo que sus atributos pueden ser completados en un IDE. +Los outputs del modelo también se comportan como tuplas o diccionarios (e.g., puedes indexar con un entero, un slice o una cadena) en cuyo caso los atributos que son `None` son ignorados. 
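+
+Por ejemplo, con el `pt_outputs` calculado arriba (un fragmento puramente ilustrativo; el índice concreto depende de qué atributos no sean `None`):
+
+```py
+>>> # las tres formas de acceso deberían devolver el mismo tensor de logits
+>>> logits_por_atributo = pt_outputs.logits
+>>> logits_por_clave = pt_outputs["logits"]
+>>> logits_por_indice = pt_outputs[0]  # `loss` es `None` porque no pasamos labels, así que se omite al indexar
+```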
+ + + +### Guarda un modelo + + + +Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`PreTrainedModel.save_pretrained`]: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Cuando quieras usar el modelo otra vez cárgalo con [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + + + +Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`TFPreTrainedModel.save_pretrained`]: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Cuando quieras usar el modelo otra vez cárgalo con [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Una característica particularmente interesante de 🤗 Transformers es la habilidad de guardar el modelo y cargarlo como un modelo de PyTorch o TensorFlow. El parámetro `from_pt` o `from_tf` puede convertir el modelo de un framework al otro: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + diff --git a/docs/source/es/quicktour.mdx b/docs/source/es/quicktour.mdx deleted file mode 100644 index 408c3fa375a0..000000000000 --- a/docs/source/es/quicktour.mdx +++ /dev/null @@ -1,391 +0,0 @@ - - -# Tour rápido - -[[open-in-colab]] - -¡Entra en marcha con los 🤗 Transformers! Comienza usando [`pipeline`] para una inferencia veloz, carga un modelo preentrenado y un tokenizador con una [AutoClass](./model_doc/auto) para resolver tu tarea de texto, visión o audio. - - - -Todos los ejemplos de código presentados en la documentación tienen un botón arriba a la derecha para elegir si quieres ocultar o mostrar el código en Pytorch o TensorFlow. -Si no fuese así, se espera que el código funcione para ambos backends sin ningún cambio. - - - -## Pipeline - -[`pipeline`] es la forma más fácil de usar un modelo preentrenado para una tarea dada. - - - -El [`pipeline`] soporta muchas tareas comunes listas para usar: - -**Texto**: -* Análisis de Sentimiento (Sentiment Analysis, en inglés): clasifica la polaridad de un texto dado. -* Generación de Texto (Text Generation, en inglés): genera texto a partir de un input dado. -* Reconocimiento de Entidades (Name Entity Recognition o NER, en inglés): etiqueta cada palabra con la entidad que representa (persona, fecha, ubicación, etc.). -* Responder Preguntas (Question answering, en inglés): extrae la respuesta del contexto dado un contexto y una pregunta. -* Rellenar Máscara (Fill-mask, en inglés): rellena el espacio faltante dado un texto con palabras enmascaradas. -* Resumir (Summarization, en inglés): genera un resumen de una secuencia larga de texto o un documento. -* Traducción (Translation, en inglés): traduce un texto a otro idioma. 
-* Extracción de Características (Feature Extraction, en inglés): crea una representación tensorial del texto. - -**Imagen**: -* Clasificación de Imágenes (Image Classification, en inglés): clasifica una imagen. -* Segmentación de Imágenes (Image Segmentation, en inglés): clasifica cada pixel de una imagen. -* Detección de Objetos (Object Detection, en inglés): detecta objetos dentro de una imagen. - -**Audio**: -* Clasificación de Audios (Audio Classification, en inglés): asigna una etiqueta a un segmento de audio. -* Reconocimiento de Voz Automático (Automatic Speech Recognition o ASR, en inglés): transcribe datos de audio a un texto. - - - -Para más detalles acerca del [`pipeline`] y tareas asociadas, consulta la documentación [aquí](./main_classes/pipelines). - - - -### Uso del Pipeline - -En el siguiente ejemplo, usarás el [`pipeline`] para análisis de sentimiento. - -Instala las siguientes dependencias si aún no lo has hecho: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -Importa [`pipeline`] y especifica la tarea que deseas completar: - -```py ->>> from transformers import pipeline - ->>> clasificador = pipeline("sentiment-analysis", model="pysentimiento/robertuito-sentiment-analysis") -``` - -El pipeline descarga y almacena en caché el [modelo preentrenado](https://huggingface.co/pysentimiento/robertuito-sentiment-analysis) y tokeniza para análisis de sentimiento. Si no hubieramos elegido un modelo el pipeline habría elegido uno por defecto. Ahora puedes usar `clasificador` en tu texto objetivo: - -```py ->>> clasificador("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") -[{'label': 'POS', 'score': 0.9916}] -``` - -Para más de un enunciado, entrega una lista al [`pipeline`] que devolverá una lista de diccionarios: - -El [`pipeline`] también puede iterar sobre un dataset entero. Comienza instalando la biblioteca [🤗 Datasets](https://huggingface.co/docs/datasets/): - -```bash -pip install datasets -``` - -Crea un [`pipeline`] con la tarea que deseas resolver y el modelo que quieres usar. Coloca el parámetro `device` a `0` para poner los tensores en un dispositivo CUDA: - -```py ->>> import torch ->>> from transformers import pipeline - ->>> reconocedor_de_voz = pipeline( -... "automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish", device=0 -... ) -``` - -A continuación, carga el dataset (ve 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) para más detalles) sobre el que quisieras iterar. Por ejemplo, vamos a cargar el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="es-ES", split="train") # doctest: +IGNORE_RESULT -``` - -Debemos asegurarnos de que la frecuencia de muestreo del conjunto de datos coincide con la frecuencia de muestreo con la que se entrenó `jonatasgrosman/wav2vec2-large-xlsr-53-spanish`. - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=reconocedor_de_voz.feature_extractor.sampling_rate)) -``` - -Los archivos de audio se cargan y remuestrean automáticamente cuando llamamos a la columna `"audio"`. 
-Extraigamos las matrices de onda cruda (raw waveform, en inglés) de las primeras 4 muestras y pasémosla como una lista al pipeline: - -```py ->>> resultado = reconocedor_de_voz(dataset[:4]["audio"]) ->>> print([d["text"] for d in resultado]) -['ahora buenas eh a ver tengo un problema con vuestra aplicación resulta que que quiero hacer una transferencia bancaria a una cuenta conocida pero me da error la aplicación a ver que a ver que puede ser', 'la aplicación no cargue saldo de mi nueva cuenta', 'hola tengo un problema con la aplicación no carga y y tampoco veo que carga el saldo de mi cuenta nueva dice que la aplicación está siendo reparada y ahora no puedo acceder a mi cuenta no necesito inmediatamente', 'hora buena la aplicación no se carga la vileza no carga el saldo de mi cuenta nueva dice que la villadenta siendo reparada y oro no puedo hacer a mi cuenta'] -``` - -Para un dataset más grande, donde los inputs son de mayor tamaño (como en habla/audio o visión), querrás pasar un generador en lugar de una lista que carga todos los inputs en memoria. Ve la [documentación del pipeline](./main_classes/pipelines) para más información. - -### Usa otro modelo y otro tokenizador en el pipeline - -El [`pipeline`] puede acomodarse a cualquier modelo del [Model Hub](https://huggingface.co/models) haciendo más fácil adaptar el [`pipeline`] para otros casos de uso. Por ejemplo, si quisieras un modelo capaz de manejar texto en francés, usa los tags en el Model Hub para filtrar entre los modelos apropiados. El resultado mejor filtrado devuelve un [modelo BERT](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) multilingual fine-tuned para el análisis de sentimiento. Genial, ¡vamos a usar este modelo! - -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Usa [`AutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `AutoClass` debajo): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - - -Usa [`TFAutoModelForSequenceClassification`] y ['AutoTokenizer'] para cargar un modelo preentrenado y un tokenizador asociado (más en un `TFAutoClass` debajo): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - - -Después puedes especificar el modelo y el tokenizador en el [`pipeline`], y aplicar el `classifier` en tu texto objetivo: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -Si no pudieras encontrar el modelo para tu caso respectivo de uso necesitarás ajustar un modelo preentrenado a tus datos. Mira nuestro [tutorial de fine-tuning](./training) para aprender cómo. Finalmente, después de que has ajustado tu modelo preentrenado, ¡por favor considera compartirlo (ve el tutorial [aquí](./model_sharing)) con la comunidad en el Model Hub para democratizar el NLP! 🤗 - -## AutoClass - - - -Por debajo, las clases [`AutoModelForSequenceClassification`] y [`AutoTokenizer`] trabajan juntas para dar poder al [`pipeline`]. 
Una [AutoClass](./model_doc/auto) es un atajo que automáticamente recupera la arquitectura de un modelo preentrenado con su nombre o el path. Sólo necesitarás seleccionar el `AutoClass` apropiado para tu tarea y tu tokenizador asociado con [`AutoTokenizer`]. - -Regresemos a nuestro ejemplo y veamos cómo puedes usar el `AutoClass` para reproducir los resultados del [`pipeline`]. - -### AutoTokenizer - -Un tokenizador es responsable de procesar el texto a un formato que sea entendible para el modelo. Primero, el tokenizador separará el texto en palabras llamadas *tokens*. Hay múltiples reglas que gobiernan el proceso de tokenización incluyendo el cómo separar una palabra y en qué nivel (aprende más sobre tokenización [aquí](./tokenizer_summary)). Lo más importante es recordar que necesitarás instanciar el tokenizador con el mismo nombre del modelo para asegurar que estás usando las mismas reglas de tokenización con las que el modelo fue preentrenado. - -Carga un tokenizador con [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> nombre_del_modelo = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(nombre_del_modelo) -``` - -Después, el tokenizador convierte los tokens a números para construir un tensor que servirá como input para el modelo. Esto es conocido como el *vocabulario* del modelo. - -Pasa tu texto al tokenizador: - -```py ->>> encoding = tokenizer("Estamos muy felices de mostrarte la biblioteca de 🤗 Transformers.") ->>> print(encoding) -{'input_ids': [101, 10602, 14000, 13653, 43353, 10107, 10102, 47201, 10218, 10106, 18283, 10102, 100, 58263, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -El tokenizador devolverá un diccionario conteniendo: - -* [input_ids](./glossary#input-ids): representaciones numéricas de los tokens. -* [atttention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos. - -Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme: - - - -```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - -Lee el tutorial de [preprocessing](./preprocessing) para más detalles acerca de la tokenización. - -### AutoModel - - - -🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`AutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`AutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] deberías usar para cada tarea. 
- - - -Ahora puedes pasar tu lote (batch) preprocesado de inputs directamente al modelo. Solo tienes que desempacar el diccionario añadiendo `**`: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers provee una forma simple y unificada de cargar tus instancias preentrenadas. Esto significa que puedes cargar un [`TFAutoModel`] como cargarías un [`AutoTokenizer`]. La única diferencia es seleccionar el [`TFAutoModel`] correcto para la tarea. Ya que estás clasificando texto, o secuencias, carga [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - Ve el [task summary](./task_summary) para revisar qué clase del [`AutoModel`] - deberías usar para cada tarea. - - -Ahora puedes pasar tu lote preprocesado de inputs directamente al modelo pasando las llaves del diccionario directamente a los tensores: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -El modelo producirá las activaciones finales en el atributo `logits`. Aplica la función softmax a `logits` para obtener las probabilidades: - -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> print(tf.math.round(tf_predictions * 10**4) / 10**4) -tf.Tensor( -[[0.0021 0.0018 0.0116 0.2121 0.7725] - [0.2084 0.1826 0.1969 0.1755 0.2365]], shape=(2, 5), dtype=float32) -``` - - - - - -Todos los modelos de 🤗 Transformers (PyTorch o TensorFlow) producirán los tensores *antes* de la función de activación -final (como softmax) porque la función de activación final es comúnmente fusionada con la pérdida. - - - -Los modelos son [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) estándares así que podrás usarlos en tu training loop usual. Sin embargo, para facilitar las cosas, 🤗 Transformers provee una clase [`Trainer`] para PyTorch que añade funcionalidades para entrenamiento distribuido, precición mixta, y más. Para TensorFlow, puedes usar el método `fit` desde [Keras](https://keras.io/). Consulta el [tutorial de entrenamiento](./training) para más detalles. - - - -Los outputs del modelo de 🤗 Transformers son dataclasses especiales por lo que sus atributos pueden ser completados en un IDE. -Los outputs del modelo también se comportan como tuplas o diccionarios (e.g., puedes indexar con un entero, un slice o una cadena) en cuyo caso los atributos que son `None` son ignorados. 
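Como apunte ilustrativo (un fragmento mínimo que asume el objeto `pt_outputs` calculado arriba con el modelo de PyTorch), las tres formas de acceso descritas devuelven el mismo tensor:

```py
>>> logits = pt_outputs.logits       # attribute access
>>> logits = pt_outputs["logits"]    # string key, same tensor
>>> logits = pt_outputs[0]           # integer index; attributes that are `None` are skipped
```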
- - - -### Guarda un modelo - - - -Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`PreTrainedModel.save_pretrained`]: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -Cuando quieras usar el modelo otra vez cárgalo con [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - - - -Una vez que se haya hecho fine-tuning a tu modelo puedes guardarlo con tu tokenizador usando [`TFPreTrainedModel.save_pretrained`]: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -Cuando quieras usar el modelo otra vez cárgalo con [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -Una característica particularmente interesante de 🤗 Transformers es la habilidad de guardar el modelo y cargarlo como un modelo de PyTorch o TensorFlow. El parámetro `from_pt` o `from_tf` puede convertir el modelo de un framework al otro: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - diff --git a/docs/source/es/run_scripts.md b/docs/source/es/run_scripts.md new file mode 100644 index 000000000000..a66fd1e47e13 --- /dev/null +++ b/docs/source/es/run_scripts.md @@ -0,0 +1,351 @@ + + +# Entrenamiento con scripts + +Junto con los [notebooks](./noteboks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). + +También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca. + +No se espera que los scripts de ejemplo funcionen de inmediato en todos los problemas, y es posible que debas adaptar el script al problema que estás tratando de resolver. Para ayudarte con esto, la mayoría de los scripts exponen completamente cómo se preprocesan los datos, lo que te permite editarlos según sea necesario para tu caso de uso. + +Para cualquier característica que te gustaría implementar en un script de ejemplo, por favor discútelo en el [foro](https://discuss.huggingface.co/) o con un [issue](https://github.com/huggingface/transformers/issues) antes de enviar un Pull Request. 
Si bien agradecemos las correcciones de errores, es poco probable que fusionemos un Pull Request que agregue más funcionalidad a costa de la legibilidad. + +Esta guía te mostrará cómo ejecutar un ejemplo de un script de entrenamiento para resumir texto en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) y [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Se espera que todos los ejemplos funcionen con ambos frameworks a menos que se especifique lo contrario. + +## Configuración + +Para ejecutar con éxito la última versión de los scripts de ejemplo debes **instalar 🤗 Transformers desde su fuente** en un nuevo entorno virtual: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` + +Para versiones anteriores de los scripts de ejemplo, haz clic en alguno de los siguientes links: + +
+ Ejemplos de versiones anteriores de 🤗 Transformers + +
+ +Luego cambia tu clon actual de 🤗 Transformers a una versión específica, por ejemplo v3.5.1: + +```bash +git checkout tags/v3.5.1 +``` + +Una vez que hayas configurado la versión correcta de la biblioteca, ve a la carpeta de ejemplo de tu elección e instala los requisitos específicos del ejemplo: + +```bash +pip install -r requirements.txt +``` + +## Ejecutar un script + + + +El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Entrenamiento distribuido y de precisión mixta + +[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) admite un entrenamiento distribuido y de precisión mixta, lo que significa que también puedes usarlo en un script. Para habilitar ambas características: + +- Agrega el argumento `fp16` para habilitar la precisión mixta. +- Establece la cantidad de GPU que se usará con el argumento `nproc_per_node`. + +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Los scripts de TensorFlow utilizan [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) para el entrenamiento distribuido, y no es necesario agregar argumentos adicionales al script de entrenamiento. 
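A modo de esquema ilustrativo (no es el código literal de los scripts de ejemplo, solo el patrón general de `tf.distribute.MirroredStrategy`, usando el checkpoint `t5-small` de esta guía), el modelo se crea dentro de `strategy.scope()` para que sus variables se repliquen en todas las GPUs detectadas:

```py
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM

# MirroredStrategy picks up all visible GPUs by default
strategy = tf.distribute.MirroredStrategy()
print(f"Replicas in sync: {strategy.num_replicas_in_sync}")

# Variables created inside the scope are mirrored across the replicas
with strategy.scope():
    model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
    model.compile(optimizer="adam")  # the example scripts configure the optimizer with more care

# model.fit(...) then trains on all replicas without extra command-line arguments
```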
El script de TensorFlow utilizará múltiples GPUs de forma predeterminada si están disponibles. + +## Ejecutar un script en una TPU + + + +Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. PyTorch admite TPU con el compilador de aprendizaje profundo [XLA](https://www.tensorflow.org/xla) (consulta [aquí](https://github.com/pytorch/xla/blob/master/README.md) para obtener más detalles). Para usar una TPU, inicia el script `xla_spawn.py` y usa el argumento `num_cores` para establecer la cantidad de núcleos de TPU que deseas usar. + +```bash +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. TensorFlow utiliza [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) para entrenar en TPUs. Para usar una TPU, pasa el nombre del recurso de la TPU al argumento `tpu` + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Ejecutar un script con 🤗 Accelerate + +🤗 [Accelerate](https://huggingface.co/docs/accelerate) es una biblioteca exclusiva de PyTorch que ofrece un método unificado para entrenar un modelo en varios tipos de configuraciones (solo CPU, GPU múltiples, TPU) mientras mantiene una visibilidad completa en el ciclo de entrenamiento de PyTorch. Asegúrate de tener 🤗 Accelerate instalado si aún no lo tienes: + +> Nota: Como Accelerate se está desarrollando rápidamente, debes instalar la versión git de Accelerate para ejecutar los scripts +```bash +pip install git+https://github.com/huggingface/accelerate +``` + +En lugar del script `run_summarization.py`, debes usar el script `run_summarization_no_trainer.py`. Los scripts compatibles con 🤗 Accelerate tendrán un archivo `task_no_trainer.py` en la carpeta. Comienza ejecutando el siguiente comando para crear y guardar un archivo de configuración: + +```bash +accelerate config +``` + +Prueba tu configuración para asegurarte que está configurada correctamente: + +```bash +accelerate test +``` + +Todo listo para iniciar el entrenamiento: + +```bash +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +``` + +## Usar un conjunto de datos personalizado + +El script de la tarea resumir admite conjuntos de datos personalizados siempre que sean un archivo CSV o JSON Line. Cuando uses tu propio conjunto de datos, necesitas especificar varios argumentos adicionales: + +- `train_file` y `validation_file` especifican la ruta a tus archivos de entrenamiento y validación. +- `text_column` es el texto de entrada para resumir. +- `summary_column` es el texto de destino para la salida. 
+ +Un script para resumir que utiliza un conjunto de datos personalizado se vera así: + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +``` + +## Prueba un script + +A veces, es una buena idea ejecutar tu secuencia de comandos en una cantidad menor de ejemplos para asegurarte de que todo funciona como se espera antes de comprometerte con un conjunto de datos completo, lo que puede demorar horas en completarse. Utiliza los siguientes argumentos para truncar el conjunto de datos a un número máximo de muestras: + +- `max_train_samples` +- `max_eval_samples` +- `max_predict_samples` + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +No todos los scripts de ejemplo admiten el argumento `max_predict_samples`. Puede que desconozcas si la secuencia de comandos admite este argumento, agrega `-h` para verificar: + +```bash +examples/pytorch/summarization/run_summarization.py -h +``` + +## Reanudar el entrenamiento desde el punto de control + +Otra opción útil para habilitar es reanudar el entrenamiento desde un punto de control anterior. Esto asegurará que puedas continuar donde lo dejaste sin comenzar de nuevo si tu entrenamiento se interrumpe. Hay dos métodos para reanudar el entrenamiento desde un punto de control. + +El primer método utiliza el argumento `output_dir previous_output_dir` para reanudar el entrenamiento desde el último punto de control almacenado en `output_dir`. En este caso, debes eliminar `overwrite_output_dir`: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +``` + +El segundo método utiliza el argumento `resume_from_checkpoint path_to_specific_checkpoint` para reanudar el entrenamiento desde una carpeta de punto de control específica. + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +``` + +## Comparte tu modelo + +Todos los scripts pueden cargar tu modelo final en el [Model Hub](https://huggingface.co/models). 
Asegúrate de haber iniciado sesión en Hugging Face antes de comenzar: + +```bash +huggingface-cli login +``` + +Luego agrega el argumento `push_to_hub` al script. Este argumento creará un repositorio con tu nombre de usuario Hugging Face y el nombre de la carpeta especificado en `output_dir`. + +Para darle a tu repositorio un nombre específico, usa el argumento `push_to_hub_model_id` para añadirlo. El repositorio se incluirá automáticamente en tu namespace. + +El siguiente ejemplo muestra cómo cargar un modelo con un nombre de repositorio específico: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` diff --git a/docs/source/es/run_scripts.mdx b/docs/source/es/run_scripts.mdx deleted file mode 100644 index d0ab716f80ff..000000000000 --- a/docs/source/es/run_scripts.mdx +++ /dev/null @@ -1,347 +0,0 @@ - - -# Entrenamiento con scripts - -Junto con los [notebooks](./noteboks/README) de 🤗 Transformers, también hay scripts con ejemplos que muestran cómo entrenar un modelo para una tarea en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). - -También encontrarás scripts que hemos usado en nuestros [proyectos de investigación](https://github.com/huggingface/transformers/tree/main/examples/research_projects) y [ejemplos pasados](https://github.com/huggingface/transformers/tree/main/examples/legacy) que en su mayoría son aportados por la comunidad. Estos scripts no se mantienen activamente y requieren una versión específica de 🤗 Transformers que probablemente sea incompatible con la última versión de la biblioteca. - -No se espera que los scripts de ejemplo funcionen de inmediato en todos los problemas, y es posible que debas adaptar el script al problema que estás tratando de resolver. Para ayudarte con esto, la mayoría de los scripts exponen completamente cómo se preprocesan los datos, lo que te permite editarlos según sea necesario para tu caso de uso. - -Para cualquier característica que te gustaría implementar en un script de ejemplo, por favor discútelo en el [foro](https://discuss.huggingface.co/) o con un [issue](https://github.com/huggingface/transformers/issues) antes de enviar un Pull Request. Si bien agradecemos las correcciones de errores, es poco probable que fusionemos un Pull Request que agregue más funcionalidad a costa de la legibilidad. - -Esta guía te mostrará cómo ejecutar un ejemplo de un script de entrenamiento para resumir texto en [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) y [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Se espera que todos los ejemplos funcionen con ambos frameworks a menos que se especifique lo contrario. 
- -## Configuración - -Para ejecutar con éxito la última versión de los scripts de ejemplo debes **instalar 🤗 Transformers desde su fuente** en un nuevo entorno virtual: - -```bash -git clone https://github.com/huggingface/transformers -cd transformers -pip install . -``` - -Para versiones anteriores de los scripts de ejemplo, haz clic en alguno de los siguientes links: - -
- Ejemplos de versiones anteriores de 🤗 Transformers - -
- -Luego cambia tu clon actual de 🤗 Transformers a una versión específica, por ejemplo v3.5.1: - -```bash -git checkout tags/v3.5.1 -``` - -Una vez que hayas configurado la versión correcta de la biblioteca, ve a la carpeta de ejemplo de tu elección e instala los requisitos específicos del ejemplo: - -```bash -pip install -r requirements.txt -``` - -## Ejecutar un script - - - -El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Entrenamiento distribuido y de precisión mixta - -[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) admite un entrenamiento distribuido y de precisión mixta, lo que significa que también puedes usarlo en un script. Para habilitar ambas características: - -- Agrega el argumento `fp16` para habilitar la precisión mixta. -- Establece la cantidad de GPU que se usará con el argumento `nproc_per_node`. - -```bash -python -m torch.distributed.launch \ - --nproc_per_node 8 pytorch/summarization/run_summarization.py \ - --fp16 \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Los scripts de TensorFlow utilizan [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) para el entrenamiento distribuido, y no es necesario agregar argumentos adicionales al script de entrenamiento. 
El script de TensorFlow utilizará múltiples GPUs de forma predeterminada si están disponibles. - -## Ejecutar un script en una TPU - - - -Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. PyTorch admite TPU con el compilador de aprendizaje profundo [XLA](https://www.tensorflow.org/xla) (consulta [aquí](https://github.com/pytorch/xla/blob/master/README.md) para obtener más detalles). Para usar una TPU, inicia el script `xla_spawn.py` y usa el argumento `num_cores` para establecer la cantidad de núcleos de TPU que deseas usar. - -```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicamente para acelerar el rendimiento. TensorFlow utiliza [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) para entrenar en TPUs. Para usar una TPU, pasa el nombre del recurso de la TPU al argumento `tpu` - -```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Ejecutar un script con 🤗 Accelerate - -🤗 [Accelerate](https://huggingface.co/docs/accelerate) es una biblioteca exclusiva de PyTorch que ofrece un método unificado para entrenar un modelo en varios tipos de configuraciones (solo CPU, GPU múltiples, TPU) mientras mantiene una visibilidad completa en el ciclo de entrenamiento de PyTorch. Asegúrate de tener 🤗 Accelerate instalado si aún no lo tienes: - -> Nota: Como Accelerate se está desarrollando rápidamente, debes instalar la versión git de Accelerate para ejecutar los scripts -```bash -pip install git+https://github.com/huggingface/accelerate -``` - -En lugar del script `run_summarization.py`, debes usar el script `run_summarization_no_trainer.py`. Los scripts compatibles con 🤗 Accelerate tendrán un archivo `task_no_trainer.py` en la carpeta. Comienza ejecutando el siguiente comando para crear y guardar un archivo de configuración: - -```bash -accelerate config -``` - -Prueba tu configuración para asegurarte que está configurada correctamente: - -```bash -accelerate test -``` - -Todo listo para iniciar el entrenamiento: - -```bash -accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization -``` - -## Usar un conjunto de datos personalizado - -El script de la tarea resumir admite conjuntos de datos personalizados siempre que sean un archivo CSV o JSON Line. Cuando uses tu propio conjunto de datos, necesitas especificar varios argumentos adicionales: - -- `train_file` y `validation_file` especifican la ruta a tus archivos de entrenamiento y validación. -- `text_column` es el texto de entrada para resumir. -- `summary_column` es el texto de destino para la salida. 
- -Un script para resumir que utiliza un conjunto de datos personalizado se vera así: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --text_column text_column_name \ - --summary_column summary_column_name \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --predict_with_generate -``` - -## Prueba un script - -A veces, es una buena idea ejecutar tu secuencia de comandos en una cantidad menor de ejemplos para asegurarte de que todo funciona como se espera antes de comprometerte con un conjunto de datos completo, lo que puede demorar horas en completarse. Utiliza los siguientes argumentos para truncar el conjunto de datos a un número máximo de muestras: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -No todos los scripts de ejemplo admiten el argumento `max_predict_samples`. Puede que desconozcas si la secuencia de comandos admite este argumento, agrega `-h` para verificar: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Reanudar el entrenamiento desde el punto de control - -Otra opción útil para habilitar es reanudar el entrenamiento desde un punto de control anterior. Esto asegurará que puedas continuar donde lo dejaste sin comenzar de nuevo si tu entrenamiento se interrumpe. Hay dos métodos para reanudar el entrenamiento desde un punto de control. - -El primer método utiliza el argumento `output_dir previous_output_dir` para reanudar el entrenamiento desde el último punto de control almacenado en `output_dir`. En este caso, debes eliminar `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -El segundo método utiliza el argumento `resume_from_checkpoint path_to_specific_checkpoint` para reanudar el entrenamiento desde una carpeta de punto de control específica. - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Comparte tu modelo - -Todos los scripts pueden cargar tu modelo final en el [Model Hub](https://huggingface.co/models). 
Asegúrate de haber iniciado sesión en Hugging Face antes de comenzar: - -```bash -huggingface-cli login -``` - -Luego agrega el argumento `push_to_hub` al script. Este argumento creará un repositorio con tu nombre de usuario Hugging Face y el nombre de la carpeta especificado en `output_dir`. - -Para darle a tu repositorio un nombre específico, usa el argumento `push_to_hub_model_id` para añadirlo. El repositorio se incluirá automáticamente en tu namespace. - -El siguiente ejemplo muestra cómo cargar un modelo con un nombre de repositorio específico: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` diff --git a/docs/source/es/sagemaker.md b/docs/source/es/sagemaker.md new file mode 100644 index 000000000000..a874aefe76f6 --- /dev/null +++ b/docs/source/es/sagemaker.md @@ -0,0 +1,29 @@ + + +# Ejecutar el entrenamiento en Amazon SageMaker + +La documentación ha sido trasladada a [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). Esta página será eliminada en `transformers` 5.0. + +### Tabla de contenido + +- [Entrenar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) +- [Desplegar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) +- [Preguntas Frecuentes](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/es/sagemaker.mdx b/docs/source/es/sagemaker.mdx deleted file mode 100644 index 491d93e10d4d..000000000000 --- a/docs/source/es/sagemaker.mdx +++ /dev/null @@ -1,25 +0,0 @@ - - -# Ejecutar el entrenamiento en Amazon SageMaker - -La documentación ha sido trasladada a [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). Esta página será eliminada en `transformers` 5.0. - -### Tabla de contenido - -- [Entrenar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) -- [Desplegar modelos de Hugging Face en Amazon SageMaker con SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) -- [Preguntas Frecuentes](https://huggingface.co/docs/sagemaker/faq) diff --git a/docs/source/es/serialization.md b/docs/source/es/serialization.md new file mode 100644 index 000000000000..9c24ba72f3d4 --- /dev/null +++ b/docs/source/es/serialization.md @@ -0,0 +1,674 @@ + + +# Exportar modelos 🤗 Transformers + +Si necesitas implementar modelos 🤗 Transformers en entornos de producción, te +recomendamos exportarlos a un formato serializado que se pueda cargar y ejecutar +en tiempos de ejecución y hardware especializados. En esta guía, te mostraremos cómo +exportar modelos 🤗 Transformers en dos formatos ampliamente utilizados: ONNX y TorchScript. + +Una vez exportado, un modelo puede optimizarse para la inferencia a través de técnicas +como la cuantización y _pruning_. Si estás interesado en optimizar tus modelos para +que funcionen con la máxima eficiencia, consulta la +[biblioteca de 🤗 Optimum](https://github.com/huggingface/optimum). 
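Solo como esbozo de lo que significa "cuantización" en este contexto (aquí se usan directamente las utilidades de ONNX Runtime en lugar de 🤗 Optimum, se requiere `onnxruntime` instalado y se asume el archivo `onnx/model.onnx` que se exporta más adelante en esta guía):

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

# Quantize the exported weights to int8 to shrink the file and speed up CPU inference
quantize_dynamic(
    model_input="onnx/model.onnx",        # graph produced by `python -m transformers.onnx ...`
    model_output="onnx/model_quant.onnx",
    weight_type=QuantType.QInt8,
)
```

El modelo cuantizado resultante se carga con `InferenceSession` exactamente igual que el original.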
+ +## ONNX + +El proyecto [ONNX (Open Neural Network eXchange)](http://onnx.ai) es un +estándar abierto que define un conjunto común de operadores y un formato +de archivo común para representar modelos de aprendizaje profundo en una +amplia variedad de _frameworks_, incluidos PyTorch y TensorFlow. Cuando un modelo +se exporta al formato ONNX, estos operadores se usan para construir un +grafo computacional (a menudo llamado _representación intermedia_) que +representa el flujo de datos a través de la red neuronal. + +Al exponer un grafo con operadores y tipos de datos estandarizados, ONNX facilita +el cambio entre frameworks. Por ejemplo, un modelo entrenado en PyTorch se puede +exportar a formato ONNX y luego importar en TensorFlow (y viceversa). + +🤗 Transformers proporciona un paquete llamado `transformers.onnx`, el cual permite convertir +los checkpoints de un modelo en un grafo ONNX aprovechando los objetos de configuración. +Estos objetos de configuración están hechos a la medida de diferentes arquitecturas de modelos +y están diseñados para ser fácilmente extensibles a otras arquitecturas. + +Las configuraciones a la medida incluyen las siguientes arquitecturas: + + + +- ALBERT +- BART +- BEiT +- BERT +- BigBird +- BigBird-Pegasus +- Blenderbot +- BlenderbotSmall +- BLOOM +- CamemBERT +- CLIP +- CodeGen +- ConvBERT +- ConvNeXT +- ConvNeXTV2 +- Data2VecText +- Data2VecVision +- DeBERTa +- DeBERTa-v2 +- DeiT +- DETR +- DistilBERT +- ELECTRA +- FlauBERT +- GPT Neo +- GPT-J +- I-BERT +- LayoutLM +- LayoutLMv3 +- LeViT +- LongT5 +- M2M100 +- Marian +- mBART +- MobileBERT +- MobileViT +- MT5 +- OpenAI GPT-2 +- Perceiver +- PLBart +- ResNet +- RoBERTa +- RoFormer +- SqueezeBERT +- T5 +- ViT +- XLM +- XLM-RoBERTa +- XLM-RoBERTa-XL +- YOLOS + +En las próximas dos secciones, te mostraremos cómo: + +* Exportar un modelo compatible utilizando el paquete `transformers.onnx`. +* Exportar un modelo personalizado para una arquitectura no compatible. + +### Exportar un model a ONNX + +Para exportar un modelo 🤗 Transformers a ONNX, tienes que instalar primero algunas +dependencias extra: + +```bash +pip install transformers[onnx] +``` + +El paquete `transformers.onnx` puede ser usado luego como un módulo de Python: + +```bash +python -m transformers.onnx --help + +usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output + +positional arguments: + output Path indicating where to store generated ONNX model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model ID on huggingface.co or path on disk to load model from. + --feature {causal-lm, ...} + The type of features to export the model with. + --opset OPSET ONNX opset version to export the model with. + --atol ATOL Absolute difference tolerence when validating the model. +``` + +Exportar un checkpoint usando una configuración a la medida se puede hacer de la siguiente manera: + +```bash +python -m transformers.onnx --model=distilbert-base-uncased onnx/ +``` + +que debería mostrar los siguientes registros: + +```bash +Validating ONNX model... + -[✓] ONNX model output names match reference model ({'last_hidden_state'}) + - Validating ONNX Model output "last_hidden_state": + -[✓] (2, 8, 768) matches (2, 8, 768) + -[✓] all values close (atol: 1e-05) +All good, model saved at: onnx/model.onnx +``` + +Esto exporta un grafo ONNX del checkpoint definido por el argumento `--model`. 
+En este ejemplo, es un modelo `distilbert-base-uncased`, pero puede ser cualquier +checkpoint en Hugging Face Hub o que esté almacenado localmente. + +El archivo `model.onnx` resultante se puede ejecutar en uno de los +[muchos aceleradores](https://onnx.ai/supported-tools.html#deployModel) +que admiten el estándar ONNX. Por ejemplo, podemos cargar y ejecutar el +modelo con [ONNX Runtime](https://onnxruntime.ai/) de la siguiente manera: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # ONNX Runtime expects NumPy arrays as input +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +Los nombres necesarios de salida (es decir, `["last_hidden_state"]`) se pueden obtener +echando un vistazo a la configuración ONNX de cada modelo. Por ejemplo, para DistilBERT tenemos: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"]s +``` + +El proceso es idéntico para los checkpoints de TensorFlow en Hub. +Por ejemplo, podemos exportar un checkpoint puro de TensorFlow desde +[Keras](https://huggingface.co/keras-io) de la siguiente manera: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +Para exportar un modelo que está almacenado localmente, deberás tener los pesos +y tokenizadores del modelo almacenados en un directorio. Por ejemplo, podemos cargar +y guardar un checkpoint de la siguiente manera: + + + +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> # Load tokenizer and PyTorch weights form the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-pt-checkpoint") +>>> pt_model.save_pretrained("local-pt-checkpoint") +``` + +Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` +del paquete `transformers.onnx` al directorio deseado: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` + + +```python +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> # Load tokenizer and TensorFlow weights from the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-tf-checkpoint") +>>> tf_model.save_pretrained("local-tf-checkpoint") +``` + +Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` +del paquete `transformers.onnx` al directorio deseado: + +```bash +python -m transformers.onnx --model=local-tf-checkpoint onnx/ +``` + + + +### Seleccionar características para diferentes topologías de un modelo + +Cada configuración a la medida viene con un conjunto de _características_ que te permiten exportar +modelos para diferentes tipos de topologías o tareas. 
Como se muestra en la siguiente tabla, cada +función está asociada con una auto-clase de automóvil diferente: + +| Feature | Auto Class | +| ------------------------------------ | ------------------------------------ | +| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | +| `default`, `default-with-past` | `AutoModel` | +| `masked-lm` | `AutoModelForMaskedLM` | +| `question-answering` | `AutoModelForQuestionAnswering` | +| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | +| `sequence-classification` | `AutoModelForSequenceClassification` | +| `token-classification` | `AutoModelForTokenClassification` | + +Para cada configuración, puedes encontrar la lista de funciones admitidas a través de `FeaturesManager`. +Por ejemplo, para DistilBERT tenemos: + +```python +>>> from transformers.onnx.features import FeaturesManager + +>>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys()) +>>> print(distilbert_features) +["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"] +``` + +Le puedes pasar una de estas características al argumento `--feature` en el paquete `transformers.onnx`. +Por ejemplo, para exportar un modelo de clasificación de texto, podemos elegir un modelo ya ajustado del Hub y ejecutar: + +```bash +python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \ + --feature=sequence-classification onnx/ +``` + +que mostrará los siguientes registros: + +```bash +Validating ONNX model... + -[✓] ONNX model output names match reference model ({'logits'}) + - Validating ONNX Model output "logits": + -[✓] (2, 2) matches (2, 2) + -[✓] all values close (atol: 1e-05) +All good, model saved at: onnx/model.onnx +``` + +Ten en cuenta que, en este caso, los nombres de salida del modelo ajustado son `logits` en lugar de `last_hidden_state` +que vimos anteriormente con el checkpoint `distilbert-base-uncased`. Esto es de esperarse ya que el modelo ajustado +tiene un cabezal de clasificación secuencial. + + + +Las características que tienen un sufijo 'with-past' (por ejemplo, 'causal-lm-with-past') corresponden a topologías +de modelo con estados ocultos precalculados (clave y valores en los bloques de atención) que se pueden usar para una +decodificación autorregresiva más rápida. + + + + +### Exportar un modelo para una arquitectura no compatible + +Si deseas exportar un modelo cuya arquitectura no es compatible de forma nativa +con la biblioteca, debes seguir tres pasos principales: + +1. Implementa una configuración personalizada en ONNX. +2. Exporta el modelo a ONNX. +3. Valide los resultados de PyTorch y los modelos exportados. + +En esta sección, veremos cómo se implementó la serialización de DistilBERT +para mostrar lo que implica cada paso. + +#### Implementar una configuración personalizada en ONNX + +Comencemos con el objeto de configuración de ONNX. Proporcionamos tres clases abstractas +de las que debe heredar, según el tipo de arquitectura del modelo que quieras exportar: + +* Modelos basados en el _Encoder_ inherente de [`~onnx.config.OnnxConfig`] +* Modelos basados en el _Decoder_ inherente de [`~onnx.config.OnnxConfigWithPast`] +* Modelos _Encoder-decoder_ inherente de [`~onnx.config.OnnxSeq2SeqConfigWithPast`] + + + +Una buena manera de implementar una configuración personalizada en ONNX es observar la implementación +existente en el archivo `configuration_.py` de una arquitectura similar. 
+ + + +Dado que DistilBERT es un modelo de tipo _encoder_, su configuración se hereda de `OnnxConfig`: + +```python +>>> from typing import Mapping, OrderedDict +>>> from transformers.onnx import OnnxConfig + + +>>> class DistilBertOnnxConfig(OnnxConfig): +... @property +... def inputs(self) -> Mapping[str, Mapping[int, str]]: +... return OrderedDict( +... [ +... ("input_ids", {0: "batch", 1: "sequence"}), +... ("attention_mask", {0: "batch", 1: "sequence"}), +... ] +... ) +``` + +Cada objeto de configuración debe implementar la propiedad `inputs` y devolver un mapeo, +donde cada llave corresponde a una entrada esperada y cada valor indica el eje de esa entrada. +Para DistilBERT, podemos ver que se requieren dos entradas: `input_ids` y `attention_mask`. +Estas entradas tienen la misma forma de `(batch_size, sequence_length)`, es por lo que vemos +los mismos ejes utilizados en la configuración. + + + +Observa que la propiedad `inputs` para `DistilBertOnnxConfig` devuelve un `OrderedDict`. +Esto nos asegura que las entradas coincidan con su posición relativa dentro del método +`PreTrainedModel.forward()` al rastrear el grafo. Recomendamos usar un `OrderedDict` +para las propiedades `inputs` y `outputs` al implementar configuraciones ONNX personalizadas. + + + +Una vez que hayas implementado una configuración ONNX, puedes crear una +instancia proporcionando la configuración del modelo base de la siguiente manera: + +```python +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("distilbert-base-uncased") +>>> onnx_config = DistilBertOnnxConfig(config) +``` + +El objeto resultante tiene varias propiedades útiles. Por ejemplo, puedes ver el conjunto de operadores ONNX que se +utilizará durante la exportación: + +```python +>>> print(onnx_config.default_onnx_opset) +11 +``` + +También puedes ver los resultados asociados con el modelo de la siguiente manera: + +```python +>>> print(onnx_config.outputs) +OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) +``` + +Observa que la propiedad de salidas sigue la misma estructura que las entradas; +devuelve un objecto `OrderedDict` de salidas nombradas y sus formas. La estructura +de salida está vinculada a la elección de la función con la que se inicializa la configuración. +Por defecto, la configuración de ONNX se inicializa con la función `default` que +corresponde a exportar un modelo cargado con la clase `AutoModel`. Si quieres exportar +una topología de modelo diferente, simplemente proporciona una característica diferente +al argumento `task` cuando inicialices la configuración de ONNX. Por ejemplo, si quisiéramos +exportar DistilBERT con un cabezal de clasificación de secuencias, podríamos usar: + +```python +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("distilbert-base-uncased") +>>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") +>>> print(onnx_config_for_seq_clf.outputs) +OrderedDict([('logits', {0: 'batch'})]) +``` + + + +Todas las propiedades base y métodos asociados con [`~onnx.config.OnnxConfig`] y las +otras clases de configuración se pueden sobreescribir si es necesario. +Consulte [`BartOnnxConfig`] para ver un ejemplo avanzado. + + + +#### Exportar el modelo + +Una vez que hayas implementado la configuración de ONNX, el siguiente paso es exportar el modelo. +Aquí podemos usar la función `export()` proporcionada por el paquete `transformers.onnx`. 
+Esta función espera la configuración de ONNX, junto con el modelo base y el tokenizador, +y la ruta para guardar el archivo exportado: + +```python +>>> from pathlib import Path +>>> from transformers.onnx import export +>>> from transformers import AutoTokenizer, AutoModel + +>>> onnx_path = Path("model.onnx") +>>> model_ckpt = "distilbert-base-uncased" +>>> base_model = AutoModel.from_pretrained(model_ckpt) +>>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) + +>>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) +``` + +Los objetos `onnx_inputs` y `onnx_outputs` devueltos por la función `export()` +son listas de llaves definidas en las propiedades `inputs` y `outputs` de la configuración. +Una vez exportado el modelo, puedes probar que el modelo está bien formado de la siguiente manera: + +```python +>>> import onnx + +>>> onnx_model = onnx.load("model.onnx") +>>> onnx.checker.check_model(onnx_model) +``` + + + +Si tu modelo tiene más de 2GB, verás que se crean muchos archivos adicionales durante la exportación. +Esto es _esperado_ porque ONNX usa [Búferes de protocolo](https://developers.google.com/protocol-buffers/) +para almacenar el modelo y éstos tienen un límite de tamaño de 2 GB. Consulta la +[documentación de ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) para obtener +instrucciones sobre cómo cargar modelos con datos externos. + + + +#### Validar los resultados del modelo + +El paso final es validar que los resultados del modelo base y exportado coincidan dentro +de cierta tolerancia absoluta. Aquí podemos usar la función `validate_model_outputs()` +proporcionada por el paquete `transformers.onnx` de la siguiente manera: + +```python +>>> from transformers.onnx import validate_model_outputs + +>>> validate_model_outputs( +... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation +... ) +``` + +Esta función usa el método `OnnxConfig.generate_dummy_inputs()` para generar entradas para el modelo base +y exportado, y la tolerancia absoluta se puede definir en la configuración. En general, encontramos una +concordancia numérica en el rango de 1e-6 a 1e-4, aunque es probable que cualquier valor menor que 1e-3 esté bien. + +### Contribuir con una nueva configuración a 🤗 Transformers + +¡Estamos buscando expandir el conjunto de configuraciones a la medida para usar y agradecemos las contribuciones de la comunidad! +Si deseas contribuir con su colaboración a la biblioteca, deberás: + +* Implementa la configuración de ONNX en el archivo `configuration_.py` correspondiente +* Incluye la arquitectura del modelo y las características correspondientes en [`~onnx.features.FeatureManager`] +* Agrega tu arquitectura de modelo a las pruebas en `test_onnx_v2.py` + +Revisa cómo fue la contribución para la [configuración de IBERT](https://github.com/huggingface/transformers/pull/14868/files) +y así tener una idea de lo que necesito. + +## TorchScript + + + +Este es el comienzo de nuestros experimentos con TorchScript y todavía estamos explorando sus capacidades con modelos de +tamaño de entrada variable. Es un tema de interés y profundizaremos nuestro análisis en las próximas +versiones, con más ejemplos de código, una implementación más flexible y puntos de referencia que comparen códigos +basados en Python con TorchScript compilado. 
+ + + +Según la documentación de PyTorch: "TorchScript es una forma de crear modelos serializables y optimizables a partir del +código de PyTorch". Los dos módulos de Pytorch [JIT y TRACE](https://pytorch.org/docs/stable/jit.html) permiten al +desarrollador exportar su modelo para reutilizarlo en otros programas, como los programas C++ orientados a la eficiencia. + +Hemos proporcionado una interfaz que permite exportar modelos de 🤗 Transformers a TorchScript para que puedan reutilizarse +en un entorno diferente al de un programa Python basado en PyTorch. Aquí explicamos cómo exportar y usar nuestros modelos +usando TorchScript. + +Exportar un modelo requiere de dos cosas: + +- un pase hacia adelante con entradas ficticias. +- instanciación del modelo con la indicador `torchscript`. + +Estas necesidades implican varias cosas con las que los desarrolladores deben tener cuidado. Éstas se detallan a continuación. + +### Indicador de TorchScript y pesos atados + +Este indicador es necesario porque la mayoría de los modelos de lenguaje en este repositorio tienen pesos vinculados entre su capa +de `Embedding` y su capa de `Decoding`. TorchScript no permite la exportación de modelos que tengan pesos atados, por lo que es +necesario desvincular y clonar los pesos previamente. + +Esto implica que los modelos instanciados con el indicador `torchscript` tienen su capa `Embedding` y `Decoding` separadas, +lo que significa que no deben entrenarse más adelante. El entrenamiento desincronizaría las dos capas, lo que generaría +resultados inesperados. + +Este no es el caso de los modelos que no tienen un cabezal de modelo de lenguaje, ya que no tienen pesos atados. +Estos modelos se pueden exportar de forma segura sin el indicador `torchscript`. + +### Entradas ficticias y longitudes estándar + +Las entradas ficticias se utilizan para crear un modelo de pase hacia adelante. Mientras los valores de las entradas se +propagan a través de las capas, PyTorch realiza un seguimiento de las diferentes operaciones ejecutadas en cada tensor. +Estas operaciones registradas se utilizan luego para crear el "rastro" del modelo. + +El rastro se crea en relación con las dimensiones de las entradas. Por lo tanto, está limitado por las dimensiones de la +entrada ficticia y no funcionará para ninguna otra longitud de secuencia o tamaño de lote. Al intentar con un tamaño diferente, +un error como: + +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` + +aparecerá. Por lo tanto, se recomienda rastrear el modelo con un tamaño de entrada ficticia al menos tan grande como la +entrada más grande que se alimentará al modelo durante la inferencia. El _padding_ se puede realizar para completar los +valores que faltan. Sin embargo, como el modelo se habrá rastreado con un tamaño de entrada grande, las dimensiones de +las diferentes matrices también serán grandes, lo que dará como resultado más cálculos. + +Se recomienda tener cuidado con el número total de operaciones realizadas en cada entrada y seguir de cerca el rendimiento +al exportar modelos de longitud de secuencia variable. + +### Usar TorchScript en Python + +A continuación se muestra un ejemplo que muestra cómo guardar, cargar modelos y cómo usar el rastreo para la inferencia. + +#### Guardando un modelo + +Este fragmento muestra cómo usar TorchScript para exportar un `BertModel`. 
Aquí, el `BertModel` se instancia de acuerdo +con la clase `BertConfig` y luego se guarda en el disco con el nombre de archivo `traced_bert.pt` + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch + +enc = BertTokenizer.from_pretrained("bert-base-uncased") + +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) + +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] + +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. +config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) + +# Instantiating the model +model = BertModel(config) + +# The model needs to be in evaluation mode +model.eval() + +# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) + +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") +``` + +#### Cargar un modelo + +Este fragmento muestra cómo cargar el `BertModel` que se guardó previamente en el disco con el nombre `traced_bert.pt`. +Estamos reutilizando el `dummy_input` previamente inicializado. + +```python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() + +all_encoder_layers, pooled_output = loaded_model(*dummy_input) +``` + +#### Usar un modelo rastreado para la inferencia + +Usar el modelo rastreado para la inferencia es tan simple como usar su método `__call__`: + +```python +traced_model(tokens_tensor, segments_tensors) +``` + +### Implementar los modelos HuggingFace TorchScript en AWS mediante Neuron SDK + +AWS presentó la familia de instancias [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) para la inferencia +de aprendizaje automático de bajo costo y alto rendimiento en la nube. Las instancias Inf1 funcionan con el chip AWS +Inferentia, un acelerador de hardware personalizado, que se especializa en cargas de trabajo de inferencia de aprendizaje +profundo. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) es el kit de desarrollo para Inferentia +que admite el rastreo y la optimización de modelos de transformers para su implementación en Inf1. El SDK de Neuron proporciona: + + +1. API fácil de usar con una línea de cambio de código para rastrear y optimizar un modelo de TorchScript para la inferencia en la nube. +2. Optimizaciones de rendimiento listas para usar con un [costo-rendimiento mejorado](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>) +3. Soporte para modelos HuggingFace Transformers construidos con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) +o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). 
+ +#### Implicaciones + +Los modelos Transformers basados en la arquitectura +[BERT (Representaciones de _Enconder_ bidireccional de Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), +o sus variantes, como [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) y +[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), se ejecutarán mejor en Inf1 para tareas no +generativas, como la respuesta extractiva de preguntas, la clasificación de secuencias y la clasificación de tokens. +Como alternativa, las tareas de generación de texto se pueden adaptar para ejecutarse en Inf1, según este +[tutorial de AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). +Puedes encontrar más información sobre los modelos que están listos para usarse en Inferentia en la +[sección _Model Architecture Fit_ de la documentación de Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). + +#### Dependencias + +Usar AWS Neuron para convertir modelos requiere las siguientes dependencias y entornos: + +* Un [entorno Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), +que viene preconfigurado en [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +#### Convertir un modelo a AWS Neuron + +Con el mismo script usado en [Uso de TorchScript en Python](https://huggingface.co/docs/transformers/main/es/serialization#using-torchscript-in-python) +para rastrear un "BertModel", puedes importar la extensión del _framework_ `torch.neuron` para acceder a los componentes +del SDK de Neuron a través de una API de Python. + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` +Y modificando la línea de código de rastreo de: + +```python +torch.jit.trace(model, [tokens_tensor, segments_tensors]) +``` + +con lo siguiente: + +```python +torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +Este cambio permite a Neuron SDK rastrear el modelo y optimizarlo para ejecutarse en instancias Inf1. + +Para obtener más información sobre las funciones, las herramientas, los tutoriales de ejemplo y las últimas actualizaciones +de AWS Neuron SDK, consulte la [documentación de AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/es/serialization.mdx b/docs/source/es/serialization.mdx deleted file mode 100644 index 4c42fd5d830e..000000000000 --- a/docs/source/es/serialization.mdx +++ /dev/null @@ -1,669 +0,0 @@ - - -# Exportar modelos 🤗 Transformers - -Si necesitas implementar modelos 🤗 Transformers en entornos de producción, te -recomendamos exportarlos a un formato serializado que se pueda cargar y ejecutar -en tiempos de ejecución y hardware especializados. En esta guía, te mostraremos cómo -exportar modelos 🤗 Transformers en dos formatos ampliamente utilizados: ONNX y TorchScript. - -Una vez exportado, un modelo puede optimizarse para la inferencia a través de técnicas -como la cuantización y _pruning_. Si estás interesado en optimizar tus modelos para -que funcionen con la máxima eficiencia, consulta la -[biblioteca de 🤗 Optimum](https://github.com/huggingface/optimum). 
- -## ONNX - -El proyecto [ONNX (Open Neural Network eXchange)](http://onnx.ai) es un -estándar abierto que define un conjunto común de operadores y un formato -de archivo común para representar modelos de aprendizaje profundo en una -amplia variedad de _frameworks_, incluidos PyTorch y TensorFlow. Cuando un modelo -se exporta al formato ONNX, estos operadores se usan para construir un -grafo computacional (a menudo llamado _representación intermedia_) que -representa el flujo de datos a través de la red neuronal. - -Al exponer un grafo con operadores y tipos de datos estandarizados, ONNX facilita -el cambio entre frameworks. Por ejemplo, un modelo entrenado en PyTorch se puede -exportar a formato ONNX y luego importar en TensorFlow (y viceversa). - -🤗 Transformers proporciona un paquete llamado `transformers.onnx`, el cual permite convertir -los checkpoints de un modelo en un grafo ONNX aprovechando los objetos de configuración. -Estos objetos de configuración están hechos a la medida de diferentes arquitecturas de modelos -y están diseñados para ser fácilmente extensibles a otras arquitecturas. - -Las configuraciones a la medida incluyen las siguientes arquitecturas: - - - -- ALBERT -- BART -- BEiT -- BERT -- BigBird -- BigBird-Pegasus -- Blenderbot -- BlenderbotSmall -- BLOOM -- CamemBERT -- CLIP -- CodeGen -- ConvBERT -- ConvNeXT -- Data2VecText -- Data2VecVision -- DeBERTa -- DeBERTa-v2 -- DeiT -- DETR -- DistilBERT -- ELECTRA -- FlauBERT -- GPT Neo -- GPT-J -- I-BERT -- LayoutLM -- LayoutLMv3 -- LeViT -- LongT5 -- M2M100 -- Marian -- mBART -- MobileBERT -- MobileViT -- MT5 -- OpenAI GPT-2 -- Perceiver -- PLBart -- ResNet -- RoBERTa -- RoFormer -- SqueezeBERT -- T5 -- ViT -- XLM -- XLM-RoBERTa -- XLM-RoBERTa-XL -- YOLOS - -En las próximas dos secciones, te mostraremos cómo: - -* Exportar un modelo compatible utilizando el paquete `transformers.onnx`. -* Exportar un modelo personalizado para una arquitectura no compatible. - -### Exportar un model a ONNX - -Para exportar un modelo 🤗 Transformers a ONNX, tienes que instalar primero algunas -dependencias extra: - -```bash -pip install transformers[onnx] -``` - -El paquete `transformers.onnx` puede ser usado luego como un módulo de Python: - -```bash -python -m transformers.onnx --help - -usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output - -positional arguments: - output Path indicating where to store generated ONNX model. - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load model from. - --feature {causal-lm, ...} - The type of features to export the model with. - --opset OPSET ONNX opset version to export the model with. - --atol ATOL Absolute difference tolerence when validating the model. -``` - -Exportar un checkpoint usando una configuración a la medida se puede hacer de la siguiente manera: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased onnx/ -``` - -que debería mostrar los siguientes registros: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'last_hidden_state'}) - - Validating ONNX Model output "last_hidden_state": - -[✓] (2, 8, 768) matches (2, 8, 768) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Esto exporta un grafo ONNX del checkpoint definido por el argumento `--model`. 
-En este ejemplo, es un modelo `distilbert-base-uncased`, pero puede ser cualquier -checkpoint en Hugging Face Hub o que esté almacenado localmente. - -El archivo `model.onnx` resultante se puede ejecutar en uno de los -[muchos aceleradores](https://onnx.ai/supported-tools.html#deployModel) -que admiten el estándar ONNX. Por ejemplo, podemos cargar y ejecutar el -modelo con [ONNX Runtime](https://onnxruntime.ai/) de la siguiente manera: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -Los nombres necesarios de salida (es decir, `["last_hidden_state"]`) se pueden obtener -echando un vistazo a la configuración ONNX de cada modelo. Por ejemplo, para DistilBERT tenemos: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"]s -``` - -El proceso es idéntico para los checkpoints de TensorFlow en Hub. -Por ejemplo, podemos exportar un checkpoint puro de TensorFlow desde -[Keras](https://huggingface.co/keras-io) de la siguiente manera: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -Para exportar un modelo que está almacenado localmente, deberás tener los pesos -y tokenizadores del modelo almacenados en un directorio. Por ejemplo, podemos cargar -y guardar un checkpoint de la siguiente manera: - - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> # Load tokenizer and PyTorch weights form the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-pt-checkpoint") ->>> pt_model.save_pretrained("local-pt-checkpoint") -``` - -Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` -del paquete `transformers.onnx` al directorio deseado: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` - - -```python ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> # Load tokenizer and TensorFlow weights from the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-tf-checkpoint") ->>> tf_model.save_pretrained("local-tf-checkpoint") -``` - -Una vez que se guarda el checkpoint, podemos exportarlo a ONNX usando el argumento `--model` -del paquete `transformers.onnx` al directorio deseado: - -```bash -python -m transformers.onnx --model=local-tf-checkpoint onnx/ -``` - - - -### Seleccionar características para diferentes topologías de un modelo - -Cada configuración a la medida viene con un conjunto de _características_ que te permiten exportar -modelos para diferentes tipos de topologías o tareas. 
Como se muestra en la siguiente tabla, cada -función está asociada con una auto-clase de automóvil diferente: - -| Feature | Auto Class | -| ------------------------------------ | ------------------------------------ | -| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | -| `default`, `default-with-past` | `AutoModel` | -| `masked-lm` | `AutoModelForMaskedLM` | -| `question-answering` | `AutoModelForQuestionAnswering` | -| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | -| `sequence-classification` | `AutoModelForSequenceClassification` | -| `token-classification` | `AutoModelForTokenClassification` | - -Para cada configuración, puedes encontrar la lista de funciones admitidas a través de `FeaturesManager`. -Por ejemplo, para DistilBERT tenemos: - -```python ->>> from transformers.onnx.features import FeaturesManager - ->>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys()) ->>> print(distilbert_features) -["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"] -``` - -Le puedes pasar una de estas características al argumento `--feature` en el paquete `transformers.onnx`. -Por ejemplo, para exportar un modelo de clasificación de texto, podemos elegir un modelo ya ajustado del Hub y ejecutar: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \ - --feature=sequence-classification onnx/ -``` - -que mostrará los siguientes registros: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'logits'}) - - Validating ONNX Model output "logits": - -[✓] (2, 2) matches (2, 2) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Ten en cuenta que, en este caso, los nombres de salida del modelo ajustado son `logits` en lugar de `last_hidden_state` -que vimos anteriormente con el checkpoint `distilbert-base-uncased`. Esto es de esperarse ya que el modelo ajustado -tiene un cabezal de clasificación secuencial. - - - -Las características que tienen un sufijo 'with-past' (por ejemplo, 'causal-lm-with-past') corresponden a topologías -de modelo con estados ocultos precalculados (clave y valores en los bloques de atención) que se pueden usar para una -decodificación autorregresiva más rápida. - - - - -### Exportar un modelo para una arquitectura no compatible - -Si deseas exportar un modelo cuya arquitectura no es compatible de forma nativa -con la biblioteca, debes seguir tres pasos principales: - -1. Implementa una configuración personalizada en ONNX. -2. Exporta el modelo a ONNX. -3. Valide los resultados de PyTorch y los modelos exportados. - -En esta sección, veremos cómo se implementó la serialización de DistilBERT -para mostrar lo que implica cada paso. - -#### Implementar una configuración personalizada en ONNX - -Comencemos con el objeto de configuración de ONNX. Proporcionamos tres clases abstractas -de las que debe heredar, según el tipo de arquitectura del modelo que quieras exportar: - -* Modelos basados en el _Encoder_ inherente de [`~onnx.config.OnnxConfig`] -* Modelos basados en el _Decoder_ inherente de [`~onnx.config.OnnxConfigWithPast`] -* Modelos _Encoder-decoder_ inherente de [`~onnx.config.OnnxSeq2SeqConfigWithPast`] - - - -Una buena manera de implementar una configuración personalizada en ONNX es observar la implementación -existente en el archivo `configuration_.py` de una arquitectura similar. 
- - - -Dado que DistilBERT es un modelo de tipo _encoder_, su configuración se hereda de `OnnxConfig`: - -```python ->>> from typing import Mapping, OrderedDict ->>> from transformers.onnx import OnnxConfig - - ->>> class DistilBertOnnxConfig(OnnxConfig): -... @property -... def inputs(self) -> Mapping[str, Mapping[int, str]]: -... return OrderedDict( -... [ -... ("input_ids", {0: "batch", 1: "sequence"}), -... ("attention_mask", {0: "batch", 1: "sequence"}), -... ] -... ) -``` - -Cada objeto de configuración debe implementar la propiedad `inputs` y devolver un mapeo, -donde cada llave corresponde a una entrada esperada y cada valor indica el eje de esa entrada. -Para DistilBERT, podemos ver que se requieren dos entradas: `input_ids` y `attention_mask`. -Estas entradas tienen la misma forma de `(batch_size, sequence_length)`, es por lo que vemos -los mismos ejes utilizados en la configuración. - - - -Observa que la propiedad `inputs` para `DistilBertOnnxConfig` devuelve un `OrderedDict`. -Esto nos asegura que las entradas coincidan con su posición relativa dentro del método -`PreTrainedModel.forward()` al rastrear el grafo. Recomendamos usar un `OrderedDict` -para las propiedades `inputs` y `outputs` al implementar configuraciones ONNX personalizadas. - - - -Una vez que hayas implementado una configuración ONNX, puedes crear una -instancia proporcionando la configuración del modelo base de la siguiente manera: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config = DistilBertOnnxConfig(config) -``` - -El objeto resultante tiene varias propiedades útiles. Por ejemplo, puedes ver el conjunto de operadores ONNX que se -utilizará durante la exportación: - -```python ->>> print(onnx_config.default_onnx_opset) -11 -``` - -También puedes ver los resultados asociados con el modelo de la siguiente manera: - -```python ->>> print(onnx_config.outputs) -OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) -``` - -Observa que la propiedad de salidas sigue la misma estructura que las entradas; -devuelve un objecto `OrderedDict` de salidas nombradas y sus formas. La estructura -de salida está vinculada a la elección de la función con la que se inicializa la configuración. -Por defecto, la configuración de ONNX se inicializa con la función `default` que -corresponde a exportar un modelo cargado con la clase `AutoModel`. Si quieres exportar -una topología de modelo diferente, simplemente proporciona una característica diferente -al argumento `task` cuando inicialices la configuración de ONNX. Por ejemplo, si quisiéramos -exportar DistilBERT con un cabezal de clasificación de secuencias, podríamos usar: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") ->>> print(onnx_config_for_seq_clf.outputs) -OrderedDict([('logits', {0: 'batch'})]) -``` - - - -Todas las propiedades base y métodos asociados con [`~onnx.config.OnnxConfig`] y las -otras clases de configuración se pueden sobreescribir si es necesario. -Consulte [`BartOnnxConfig`] para ver un ejemplo avanzado. - - - -#### Exportar el modelo - -Una vez que hayas implementado la configuración de ONNX, el siguiente paso es exportar el modelo. -Aquí podemos usar la función `export()` proporcionada por el paquete `transformers.onnx`. 
-Esta función espera la configuración de ONNX, junto con el modelo base y el tokenizador, -y la ruta para guardar el archivo exportado: - -```python ->>> from pathlib import Path ->>> from transformers.onnx import export ->>> from transformers import AutoTokenizer, AutoModel - ->>> onnx_path = Path("model.onnx") ->>> model_ckpt = "distilbert-base-uncased" ->>> base_model = AutoModel.from_pretrained(model_ckpt) ->>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) - ->>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) -``` - -Los objetos `onnx_inputs` y `onnx_outputs` devueltos por la función `export()` -son listas de llaves definidas en las propiedades `inputs` y `outputs` de la configuración. -Una vez exportado el modelo, puedes probar que el modelo está bien formado de la siguiente manera: - -```python ->>> import onnx - ->>> onnx_model = onnx.load("model.onnx") ->>> onnx.checker.check_model(onnx_model) -``` - - - -Si tu modelo tiene más de 2GB, verás que se crean muchos archivos adicionales durante la exportación. -Esto es _esperado_ porque ONNX usa [Búferes de protocolo](https://developers.google.com/protocol-buffers/) -para almacenar el modelo y éstos tienen un límite de tamaño de 2 GB. Consulta la -[documentación de ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) para obtener -instrucciones sobre cómo cargar modelos con datos externos. - - - -#### Validar los resultados del modelo - -El paso final es validar que los resultados del modelo base y exportado coincidan dentro -de cierta tolerancia absoluta. Aquí podemos usar la función `validate_model_outputs()` -proporcionada por el paquete `transformers.onnx` de la siguiente manera: - -```python ->>> from transformers.onnx import validate_model_outputs - ->>> validate_model_outputs( -... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation -... ) -``` - -Esta función usa el método `OnnxConfig.generate_dummy_inputs()` para generar entradas para el modelo base -y exportado, y la tolerancia absoluta se puede definir en la configuración. En general, encontramos una -concordancia numérica en el rango de 1e-6 a 1e-4, aunque es probable que cualquier valor menor que 1e-3 esté bien. - -### Contribuir con una nueva configuración a 🤗 Transformers - -¡Estamos buscando expandir el conjunto de configuraciones a la medida para usar y agradecemos las contribuciones de la comunidad! -Si deseas contribuir con su colaboración a la biblioteca, deberás: - -* Implementa la configuración de ONNX en el archivo `configuration_.py` correspondiente -* Incluye la arquitectura del modelo y las características correspondientes en [`~onnx.features.FeatureManager`] -* Agrega tu arquitectura de modelo a las pruebas en `test_onnx_v2.py` - -Revisa cómo fue la contribución para la [configuración de IBERT](https://github.com/huggingface/transformers/pull/14868/files) -y así tener una idea de lo que necesito. - -## TorchScript - - - -Este es el comienzo de nuestros experimentos con TorchScript y todavía estamos explorando sus capacidades con modelos de -tamaño de entrada variable. Es un tema de interés y profundizaremos nuestro análisis en las próximas -versiones, con más ejemplos de código, una implementación más flexible y puntos de referencia que comparen códigos -basados en Python con TorchScript compilado. 
- - - -Según la documentación de PyTorch: "TorchScript es una forma de crear modelos serializables y optimizables a partir del -código de PyTorch". Los dos módulos de Pytorch [JIT y TRACE](https://pytorch.org/docs/stable/jit.html) permiten al -desarrollador exportar su modelo para reutilizarlo en otros programas, como los programas C++ orientados a la eficiencia. - -Hemos proporcionado una interfaz que permite exportar modelos de 🤗 Transformers a TorchScript para que puedan reutilizarse -en un entorno diferente al de un programa Python basado en PyTorch. Aquí explicamos cómo exportar y usar nuestros modelos -usando TorchScript. - -Exportar un modelo requiere de dos cosas: - -- un pase hacia adelante con entradas ficticias. -- instanciación del modelo con la indicador `torchscript`. - -Estas necesidades implican varias cosas con las que los desarrolladores deben tener cuidado. Éstas se detallan a continuación. - -### Indicador de TorchScript y pesos atados - -Este indicador es necesario porque la mayoría de los modelos de lenguaje en este repositorio tienen pesos vinculados entre su capa -de `Embedding` y su capa de `Decoding`. TorchScript no permite la exportación de modelos que tengan pesos atados, por lo que es -necesario desvincular y clonar los pesos previamente. - -Esto implica que los modelos instanciados con el indicador `torchscript` tienen su capa `Embedding` y `Decoding` separadas, -lo que significa que no deben entrenarse más adelante. El entrenamiento desincronizaría las dos capas, lo que generaría -resultados inesperados. - -Este no es el caso de los modelos que no tienen un cabezal de modelo de lenguaje, ya que no tienen pesos atados. -Estos modelos se pueden exportar de forma segura sin el indicador `torchscript`. - -### Entradas ficticias y longitudes estándar - -Las entradas ficticias se utilizan para crear un modelo de pase hacia adelante. Mientras los valores de las entradas se -propagan a través de las capas, PyTorch realiza un seguimiento de las diferentes operaciones ejecutadas en cada tensor. -Estas operaciones registradas se utilizan luego para crear el "rastro" del modelo. - -El rastro se crea en relación con las dimensiones de las entradas. Por lo tanto, está limitado por las dimensiones de la -entrada ficticia y no funcionará para ninguna otra longitud de secuencia o tamaño de lote. Al intentar con un tamaño diferente, -un error como: - -`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` - -aparecerá. Por lo tanto, se recomienda rastrear el modelo con un tamaño de entrada ficticia al menos tan grande como la -entrada más grande que se alimentará al modelo durante la inferencia. El _padding_ se puede realizar para completar los -valores que faltan. Sin embargo, como el modelo se habrá rastreado con un tamaño de entrada grande, las dimensiones de -las diferentes matrices también serán grandes, lo que dará como resultado más cálculos. - -Se recomienda tener cuidado con el número total de operaciones realizadas en cada entrada y seguir de cerca el rendimiento -al exportar modelos de longitud de secuencia variable. - -### Usar TorchScript en Python - -A continuación se muestra un ejemplo que muestra cómo guardar, cargar modelos y cómo usar el rastreo para la inferencia. - -#### Guardando un modelo - -Este fragmento muestra cómo usar TorchScript para exportar un `BertModel`. 
Aquí, el `BertModel` se instancia de acuerdo -con la clase `BertConfig` y luego se guarda en el disco con el nombre de archivo `traced_bert.pt` - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch - -enc = BertTokenizer.from_pretrained("bert-base-uncased") - -# Tokenizing input text -text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) - -# Masking one of the input tokens -masked_index = 8 -tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - -# Creating a dummy input -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) -dummy_input = [tokens_tensor, segments_tensors] - -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. -config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, -) - -# Instantiating the model -model = BertModel(config) - -# The model needs to be in evaluation mode -model.eval() - -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag -model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) - -# Creating the trace -traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) -torch.jit.save(traced_model, "traced_bert.pt") -``` - -#### Cargar un modelo - -Este fragmento muestra cómo cargar el `BertModel` que se guardó previamente en el disco con el nombre `traced_bert.pt`. -Estamos reutilizando el `dummy_input` previamente inicializado. - -```python -loaded_model = torch.jit.load("traced_bert.pt") -loaded_model.eval() - -all_encoder_layers, pooled_output = loaded_model(*dummy_input) -``` - -#### Usar un modelo rastreado para la inferencia - -Usar el modelo rastreado para la inferencia es tan simple como usar su método `__call__`: - -```python -traced_model(tokens_tensor, segments_tensors) -``` - -### Implementar los modelos HuggingFace TorchScript en AWS mediante Neuron SDK - -AWS presentó la familia de instancias [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) para la inferencia -de aprendizaje automático de bajo costo y alto rendimiento en la nube. Las instancias Inf1 funcionan con el chip AWS -Inferentia, un acelerador de hardware personalizado, que se especializa en cargas de trabajo de inferencia de aprendizaje -profundo. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) es el kit de desarrollo para Inferentia -que admite el rastreo y la optimización de modelos de transformers para su implementación en Inf1. El SDK de Neuron proporciona: - - -1. API fácil de usar con una línea de cambio de código para rastrear y optimizar un modelo de TorchScript para la inferencia en la nube. -2. Optimizaciones de rendimiento listas para usar con un [costo-rendimiento mejorado](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>) -3. Soporte para modelos HuggingFace Transformers construidos con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) -o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). 
- -#### Implicaciones - -Los modelos Transformers basados en la arquitectura -[BERT (Representaciones de _Enconder_ bidireccional de Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), -o sus variantes, como [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) y -[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), se ejecutarán mejor en Inf1 para tareas no -generativas, como la respuesta extractiva de preguntas, la clasificación de secuencias y la clasificación de tokens. -Como alternativa, las tareas de generación de texto se pueden adaptar para ejecutarse en Inf1, según este -[tutorial de AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -Puedes encontrar más información sobre los modelos que están listos para usarse en Inferentia en la -[sección _Model Architecture Fit_ de la documentación de Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). - -#### Dependencias - -Usar AWS Neuron para convertir modelos requiere las siguientes dependencias y entornos: - -* Un [entorno Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), -que viene preconfigurado en [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). - -#### Convertir un modelo a AWS Neuron - -Con el mismo script usado en [Uso de TorchScript en Python](https://huggingface.co/docs/transformers/main/es/serialization#using-torchscript-in-python) -para rastrear un "BertModel", puedes importar la extensión del _framework_ `torch.neuron` para acceder a los componentes -del SDK de Neuron a través de una API de Python. - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch -import torch.neuron -``` -Y modificando la línea de código de rastreo de: - -```python -torch.jit.trace(model, [tokens_tensor, segments_tensors]) -``` - -con lo siguiente: - -```python -torch.neuron.trace(model, [token_tensor, segments_tensors]) -``` - -Este cambio permite a Neuron SDK rastrear el modelo y optimizarlo para ejecutarse en instancias Inf1. - -Para obtener más información sobre las funciones, las herramientas, los tutoriales de ejemplo y las últimas actualizaciones -de AWS Neuron SDK, consulte la [documentación de AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/es/tasks/asr.md b/docs/source/es/tasks/asr.md new file mode 100644 index 000000000000..850bdfd711e7 --- /dev/null +++ b/docs/source/es/tasks/asr.md @@ -0,0 +1,366 @@ + + +# Reconocimiento automático del habla + + + +El reconocimiento automático del habla (ASR, por sus siglas en inglés) convierte una señal de habla en texto y mapea una secuencia de entradas de audio en salidas en forma de texto. Los asistentes virtuales como Siri y Alexa usan modelos de ASR para ayudar a sus usuarios todos los días. De igual forma, hay muchas otras aplicaciones, como la transcripción de contenidos en vivo y la toma automática de notas durante reuniones. + +En esta guía te mostraremos como: + +1. Hacer fine-tuning al modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) con el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) para transcribir audio a texto. +2. Usar tu modelo ajustado para tareas de inferencia. 
+ + + +Revisa la [página de la tarea](https://huggingface.co/tasks/automatic-speech-recognition) de reconocimiento automático del habla para acceder a más información sobre los modelos, datasets y métricas asociados. + + + +Antes de comenzar, asegúrate de haber instalado todas las librerías necesarias: + +```bash +pip install transformers datasets evaluate jiwer +``` + +Te aconsejamos iniciar sesión con tu cuenta de Hugging Face para que puedas subir tu modelo y comartirlo con la comunidad. Cuando te sea solicitado, ingresa tu token para iniciar sesión: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Cargar el dataset MInDS-14 + +Comencemos cargando un subconjunto más pequeño del dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) desde la biblioteca 🤗 Datasets. De esta forma, tendrás la oportunidad de experimentar y asegurarte de que todo funcione antes de invertir más tiempo entrenando con el dataset entero. + +```py +>>> from datasets import load_dataset, Audio + +>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") +``` +Divide la partición `train` (entrenamiento) en una partición de entrenamiento y una de prueba usando el método [`~Dataset.train_test_split`]: + +```py +>>> minds = minds.train_test_split(test_size=0.2) +``` + +Ahora échale un vistazo al dataset: + +```py +>>> minds +DatasetDict({ + train: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 16 + }) + test: Dataset({ + features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], + num_rows: 4 + }) +}) +``` + +Aunque el dataset contiene mucha información útil, como los campos `lang_id` (identificador del lenguaje) y `english_transcription` (transcripción al inglés), en esta guía nos enfocaremos en los campos `audio` y `transcription`. Puedes quitar las otras columnas con el método [`~datasets.Dataset.remove_columns`]: + +```py +>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) +``` + +Vuelve a echarle un vistazo al ejemplo: + +```py +>>> minds["train"][0] +{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, + 0.00024414, 0.00024414], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'sampling_rate': 8000}, + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', + 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} +``` + +Hay dos campos: + +- `audio`: un `array` (arreglo) unidimensional de la señal de habla que debe ser invocado para cargar y re-muestrear el archivo de audio. +- `transcription`: el texto objetivo. 
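+
+Si quieres hacer una comprobación rápida, un sketch mínimo (asumiendo que `minds` ya está cargado como arriba) para estimar la duración de un clip consiste en dividir el número de muestras del arreglo entre su tasa de muestreo:
+
+```py
+>>> sample = minds["train"][0]["audio"]
+>>> # duración aproximada en segundos = número de muestras / tasa de muestreo
+>>> duracion_en_segundos = len(sample["array"]) / sample["sampling_rate"]
+```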
+
+## Preprocesamiento
+
+El siguiente paso es cargar un procesador Wav2Vec2 para procesar la señal de audio:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
+```
+El dataset MInDS-14 tiene una tasa de muestreo de 8000 Hz (puedes encontrar esta información en su [tarjeta de dataset](https://huggingface.co/datasets/PolyAI/minds14)), lo que significa que tendrás que re-muestrear el dataset a 16000 Hz para poder usar el modelo Wav2Vec2 pre-entrenado:
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ...,
+         2.78103951e-04,  2.38446111e-04,  1.18740834e-04], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+  'sampling_rate': 16000},
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+ 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"}
+```
+
+Como puedes ver en el campo `transcription`, el texto contiene una mezcla de caracteres en mayúsculas y en minúsculas. El tokenizer Wav2Vec2 fue entrenado únicamente con caracteres en mayúsculas, así que tendrás que asegurarte de que el texto se ajuste al vocabulario del tokenizer:
+
+```py
+>>> def uppercase(example):
+...     return {"transcription": example["transcription"].upper()}
+
+
+>>> minds = minds.map(uppercase)
+```
+
+Ahora vamos a crear una función de preprocesamiento que:
+
+1. Invoque la columna `audio` para cargar y re-muestrear el archivo de audio.
+2. Extraiga el campo `input_values` (valores de entrada) del archivo de audio y haga la tokenización de la columna `transcription` con el procesador.
+
+```py
+>>> def prepare_dataset(batch):
+...     audio = batch["audio"]
+...     batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
+...     batch["input_length"] = len(batch["input_values"][0])
+...     return batch
+```
+
+Para aplicar la función de preprocesamiento a todo el dataset, puedes usar la función [`~datasets.Dataset.map`] de 🤗 Datasets. Para acelerar la función `map` puedes incrementar el número de procesos con el parámetro `num_proc`. Quita las columnas que no necesites con el método [`~datasets.Dataset.remove_columns`]:
+
+```py
+>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)
+```
+
+🤗 Transformers no tiene un collator de datos para la tarea de ASR, así que tendrás que adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos. El collator también le aplicará padding dinámico a tu texto y etiquetas para que tengan la longitud del elemento más largo en su lote (en vez de la mayor longitud en el dataset entero), de forma que todas las muestras tengan una longitud uniforme. Aunque es posible hacerle padding a tu texto con el `tokenizer` haciendo `padding=True`, el padding dinámico es más eficiente.
+ +A diferencia de otros collators de datos, este tiene que aplicarle un método de padding distinto a los campos `input_values` (valores de entrada) y `labels` (etiquetas): + +```py +>>> import torch + +>>> from dataclasses import dataclass, field +>>> from typing import Any, Dict, List, Optional, Union + + +>>> @dataclass +... class DataCollatorCTCWithPadding: +... processor: AutoProcessor +... padding: Union[bool, str] = "longest" + +... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +... # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y +... # requieren métodos de padding diferentes +... input_features = [{"input_values": feature["input_values"][0]} for feature in features] +... label_features = [{"input_ids": feature["labels"]} for feature in features] + +... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") + +... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") + +... # remplaza el padding con -100 para ignorar la pérdida de forma correcta +... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + +... batch["labels"] = labels + +... return batch +``` + +Ahora puedes instanciar tu `DataCollatorForCTCWithPadding`: + +```py +>>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") +``` + +## Evaluación + +A menudo es útil incluir una métrica durante el entrenamiento para evaluar el rendimiento de tu modelo. Puedes cargar un método de evaluación rápidamente con la biblioteca 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). Para esta tarea, puedes usar la métrica de [tasa de error por palabra](https://huggingface.co/spaces/evaluate-metric/wer) (WER, por sus siglas en inglés). Puedes ver la [guía rápida](https://huggingface.co/docs/evaluate/a_quick_tour) de 🤗 Evaluate para aprender más acerca de cómo cargar y computar una métrica. + +```py +>>> import evaluate + +>>> wer = evaluate.load("wer") +``` + +Ahora crea una función que le pase tus predicciones y etiquetas a [`~evaluate.EvaluationModule.compute`] para calcular la WER: + +```py +>>> import numpy as np + + +>>> def compute_metrics(pred): +... pred_logits = pred.predictions +... pred_ids = np.argmax(pred_logits, axis=-1) + +... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id + +... pred_str = processor.batch_decode(pred_ids) +... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) + +... wer = wer.compute(predictions=pred_str, references=label_str) + +... return {"wer": wer} +``` + +Ahora tu función `compute_metrics` (computar métricas) está lista y podrás usarla cuando estés preparando tu entrenamiento. + +## Entrenamiento + + + + + +Si no tienes experiencia haciéndole fine-tuning a un modelo con el [`Trainer`], ¡échale un vistazo al tutorial básico [aquí](../training#train-with-pytorch-trainer)! + + + +¡Ya puedes empezar a entrenar tu modelo! Para ello, carga Wav2Vec2 con [`AutoModelForCTC`]. Especifica la reducción que quieres aplicar con el parámetro `ctc_loss_reduction`. A menudo, es mejor usar el promedio en lugar de la sumatoria que se hace por defecto. + +```py +>>> from transformers import AutoModelForCTC, TrainingArguments, Trainer + +>>> model = AutoModelForCTC.from_pretrained( +... "facebook/wav2vec2-base", +... ctc_loss_reduction="mean", +... pad_token_id=processor.tokenizer.pad_token_id, +... 
) +``` +En este punto, solo quedan tres pasos: + +1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. El único parámetro obligatorio es `output_dir` (carpeta de salida), el cual especifica dónde guardar tu modelo. Puedes subir este modelo al Hub haciendo `push_to_hub=True` (debes haber iniciado sesión en Hugging Face para subir tu modelo). Al final de cada época, el [`Trainer`] evaluará la WER y guardará el punto de control del entrenamiento. +2. Pásale los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer, el collator de datos y la función `compute_metrics`. +3. Llama el método [`~Trainer.train`] para hacerle fine-tuning a tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_asr_mind_model", +... per_device_train_batch_size=8, +... gradient_accumulation_steps=2, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=2000, +... gradient_checkpointing=True, +... fp16=True, +... group_by_length=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=8, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... load_best_model_at_end=True, +... metric_for_best_model="wer", +... greater_is_better=False, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=encoded_minds["train"], +... eval_dataset=encoded_minds["test"], +... tokenizer=processor.feature_extractor, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +Una vez que el entrenamiento haya sido completado, comparte tu modelo en el Hub con el método [`~transformers.Trainer.push_to_hub`] para que todo el mundo pueda usar tu modelo: + +```py +>>> trainer.push_to_hub() +``` + + + + + +Para ver un ejemplo más detallado de cómo hacerle fine-tuning a un modelo para reconocimiento automático del habla, échale un vistazo a esta [entrada de blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) para ASR en inglés y a esta [entrada](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) para ASR multilingüe. + + + +## Inferencia + +¡Genial, ahora que le has hecho fine-tuning a un modelo, puedes usarlo para inferencia! + +Carga el archivo de audio sobre el cual quieras correr la inferencia. ¡Recuerda re-muestrar la tasa de muestreo del archivo de audio para que sea la misma del modelo si es necesario! + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +>>> sampling_rate = dataset.features["audio"].sampling_rate +>>> audio_file = dataset[0]["audio"]["path"] +``` + +La manera más simple de probar tu modelo para hacer inferencia es usarlo en un [`pipeline`]. Puedes instanciar un `pipeline` para reconocimiento automático del habla con tu modelo y pasarle tu archivo de audio: + +```py +>>> from transformers import pipeline + +>>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") +>>> transcriber(audio_file) +{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} +``` + + + +La transcripción es decente, pero podría ser mejor. ¡Intenta hacerle fine-tuning a tu modelo con más ejemplos para obtener resultados aún mejores! 
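+
+Como alternativa, este sketch mínimo le pasa al `pipeline` el audio ya cargado en memoria en lugar de la ruta del archivo (el diccionario con las claves `raw` y `sampling_rate` es uno de los formatos de entrada que acepta el pipeline de ASR; revisa la documentación de tu versión de 🤗 Transformers):
+
+```py
+>>> # el arreglo ya fue re-muestreado a 16 kHz con cast_column más arriba
+>>> resultado = transcriber({"raw": dataset[0]["audio"]["array"], "sampling_rate": sampling_rate})
+```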
+ + + +También puedes replicar de forma manual los resultados del `pipeline` si lo deseas: + + + +Carga un procesador para preprocesar el archivo de audio y la transcripción y devuelve el `input` como un tensor de PyTorch: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") +``` + +Pásale tus entradas al modelo y devuelve los logits: + +```py +>>> from transformers import AutoModelForCTC + +>>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +Obtén los identificadores de los tokens con mayor probabilidad en las predicciones y usa el procesador para decodificarlos y transformarlos en texto: + +```py +>>> import torch + +>>> predicted_ids = torch.argmax(logits, dim=-1) +>>> transcription = processor.batch_decode(predicted_ids) +>>> transcription +['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] +``` + + diff --git a/docs/source/es/tasks/asr.mdx b/docs/source/es/tasks/asr.mdx deleted file mode 100644 index 7d331b11f7ea..000000000000 --- a/docs/source/es/tasks/asr.mdx +++ /dev/null @@ -1,363 +0,0 @@ - - -# Reconocimiento automático del habla - - - -El reconocimiento automático del habla (ASR, por sus siglas en inglés) convierte una señal de habla en texto y mapea una secuencia de entradas de audio en salidas en forma de texto. Los asistentes virtuales como Siri y Alexa usan modelos de ASR para ayudar a sus usuarios todos los días. De igual forma, hay muchas otras aplicaciones, como la transcripción de contenidos en vivo y la toma automática de notas durante reuniones. - -En esta guía te mostraremos como: - -1. Hacer fine-tuning al modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) con el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) para transcribir audio a texto. -2. Usar tu modelo ajustado para tareas de inferencia. - - - -Revisa la [página de la tarea](https://huggingface.co/tasks/automatic-speech-recognition) de reconocimiento automático del habla para acceder a más información sobre los modelos, datasets y métricas asociados. - - - -Antes de comenzar, asegúrate de haber instalado todas las librerías necesarias: - -```bash -pip install transformers datasets evaluate jiwer -``` - -Te aconsejamos iniciar sesión con tu cuenta de Hugging Face para que puedas subir tu modelo y comartirlo con la comunidad. Cuando te sea solicitado, ingresa tu token para iniciar sesión: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Cargar el dataset MInDS-14 - -Comencemos cargando un subconjunto más pequeño del dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) desde la biblioteca 🤗 Datasets. De esta forma, tendrás la oportunidad de experimentar y asegurarte de que todo funcione antes de invertir más tiempo entrenando con el dataset entero. 
- -```py ->>> from datasets import load_dataset, Audio - ->>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]") -``` -Divide la partición `train` (entrenamiento) en una partición de entrenamiento y una de prueba usando el método [`~Dataset.train_test_split`]: - -```py ->>> minds = minds.train_test_split(test_size=0.2) -``` - -Ahora échale un vistazo al dataset: - -```py ->>> minds -DatasetDict({ - train: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 16 - }) - test: Dataset({ - features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'], - num_rows: 4 - }) -}) -``` - -Aunque el dataset contiene mucha información útil, como los campos `lang_id` (identificador del lenguaje) y `english_transcription` (transcripción al inglés), en esta guía nos enfocaremos en los campos `audio` y `transcription`. Puedes quitar las otras columnas con el método [`~datasets.Dataset.remove_columns`]: - -```py ->>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) -``` - -Vuelve a echarle un vistazo al ejemplo: - -```py ->>> minds["train"][0] -{'audio': {'array': array([-0.00024414, 0. , 0. , ..., 0.00024414, - 0.00024414, 0.00024414], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 8000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -Hay dos campos: - -- `audio`: un `array` (arreglo) unidimensional de la señal de habla que debe ser invocado para cargar y re-muestrear el archivo de audio. -- `transcription`: el texto objetivo. - -## Preprocesamiento - -El siguiente paso es cargar un procesador Wav2Vec2 para procesar la señal de audio: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") -``` -El dataset MInDS-14 tiene una tasa de muestreo de 8000kHz (puedes encontrar esta información en su [tarjeta de dataset](https://huggingface.co/datasets/PolyAI/minds14)), lo que significa que tendrás que re-muestrear el dataset a 16000kHz para poder usar el modelo Wav2Vec2 pre-entrenado: - -```py ->>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) ->>> minds["train"][0] -{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ..., - 2.78103951e-04, 2.38446111e-04, 1.18740834e-04], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'sampling_rate': 16000}, - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav', - 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} -``` - -Como puedes ver en el campo `transcription`, el texto contiene una mezcla de carácteres en mayúsculas y en minúsculas. 
El tokenizer Wav2Vec2 fue entrenado únicamente con carácteres en mayúsculas, así que tendrás que asegurarte de que el texto se ajuste al vocabulario del tokenizer: - -```py ->>> def uppercase(example): -... return {"transcription": example["transcription"].upper()} - - ->>> minds = minds.map(uppercase) -``` - -Ahora vamos a crear una función de preprocesamiento que: - -1. Invoque la columna `audio` para cargar y re-muestrear el archivo de audio. -2. Extraiga el campo `input_values` (valores de entrada) del archivo de audio y haga la tokenización de la columna `transcription` con el procesador. - -```py ->>> def prepare_dataset(batch): -... audio = batch["audio"] -... batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"]) -... batch["input_length"] = len(batch["input_values"][0]) -... return batch -``` - -Para aplicar la función de preprocesamiento a todo el dataset, puedes usar la función [`~datasets.Dataset.map`] de 🤗 Datasets. Para acelerar la función `map` puedes incrementar el número de procesos con el parámetro `num_proc`. Quita las columnas que no necesites con el método [`~datasets.Dataset.remove_columns`]: - -```py ->>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4) -``` - -🤗 Transformers no tiene un collator de datos para la tarea de ASR, así que tendrás que adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos. El collator también le aplicará padding dinámico a tu texto y etiquetas para que tengan la longitud del elemento más largo en su lote (en vez de la mayor longitud en el dataset entero), de forma que todas las muestras tengan una longitud uniforme. Aunque es posible hacerle padding a tu texto con el `tokenizer` haciendo `padding=True`, el padding dinámico es más eficiente. - -A diferencia de otros collators de datos, este tiene que aplicarle un método de padding distinto a los campos `input_values` (valores de entrada) y `labels` (etiquetas): - -```py ->>> import torch - ->>> from dataclasses import dataclass, field ->>> from typing import Any, Dict, List, Optional, Union - - ->>> @dataclass -... class DataCollatorCTCWithPadding: - -... processor: AutoProcessor -... padding: Union[bool, str] = "longest" - -... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: -... # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y -... # requieren métodos de padding diferentes -... input_features = [{"input_values": feature["input_values"][0]} for feature in features] -... label_features = [{"input_ids": feature["labels"]} for feature in features] - -... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") - -... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") - -... # remplaza el padding con -100 para ignorar la pérdida de forma correcta -... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) - -... batch["labels"] = labels - -... return batch -``` - -Ahora puedes instanciar tu `DataCollatorForCTCWithPadding`: - -```py ->>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest") -``` - -## Evaluación - -A menudo es útil incluir una métrica durante el entrenamiento para evaluar el rendimiento de tu modelo. Puedes cargar un método de evaluación rápidamente con la biblioteca 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). 
Para esta tarea, puedes usar la métrica de [tasa de error por palabra](https://huggingface.co/spaces/evaluate-metric/wer) (WER, por sus siglas en inglés). Puedes ver la [guía rápida](https://huggingface.co/docs/evaluate/a_quick_tour) de 🤗 Evaluate para aprender más acerca de cómo cargar y computar una métrica. - -```py ->>> import evaluate - ->>> wer = evaluate.load("wer") -``` - -Ahora crea una función que le pase tus predicciones y etiquetas a [`~evaluate.EvaluationModule.compute`] para calcular la WER: - -```py ->>> import numpy as np - - ->>> def compute_metrics(pred): -... pred_logits = pred.predictions -... pred_ids = np.argmax(pred_logits, axis=-1) - -... pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id - -... pred_str = processor.batch_decode(pred_ids) -... label_str = processor.batch_decode(pred.label_ids, group_tokens=False) - -... wer = wer.compute(predictions=pred_str, references=label_str) - -... return {"wer": wer} -``` - -Ahora tu función `compute_metrics` (computar métricas) está lista y podrás usarla cuando estés preparando tu entrenamiento. - -## Entrenamiento - - - - - -Si no tienes experiencia haciéndole fine-tuning a un modelo con el [`Trainer`], ¡échale un vistazo al tutorial básico [aquí](../training#train-with-pytorch-trainer)! - - - -¡Ya puedes empezar a entrenar tu modelo! Para ello, carga Wav2Vec2 con [`AutoModelForCTC`]. Especifica la reducción que quieres aplicar con el parámetro `ctc_loss_reduction`. A menudo, es mejor usar el promedio en lugar de la sumatoria que se hace por defecto. - -```py ->>> from transformers import AutoModelForCTC, TrainingArguments, Trainer - ->>> model = AutoModelForCTC.from_pretrained( -... "facebook/wav2vec2-base", -... ctc_loss_reduction="mean", -... pad_token_id=processor.tokenizer.pad_token_id, -... ) -``` -En este punto, solo quedan tres pasos: - -1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. El único parámetro obligatorio es `output_dir` (carpeta de salida), el cual especifica dónde guardar tu modelo. Puedes subir este modelo al Hub haciendo `push_to_hub=True` (debes haber iniciado sesión en Hugging Face para subir tu modelo). Al final de cada época, el [`Trainer`] evaluará la WER y guardará el punto de control del entrenamiento. -2. Pásale los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer, el collator de datos y la función `compute_metrics`. -3. Llama el método [`~Trainer.train`] para hacerle fine-tuning a tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="my_awesome_asr_mind_model", -... per_device_train_batch_size=8, -... gradient_accumulation_steps=2, -... learning_rate=1e-5, -... warmup_steps=500, -... max_steps=2000, -... gradient_checkpointing=True, -... fp16=True, -... group_by_length=True, -... evaluation_strategy="steps", -... per_device_eval_batch_size=8, -... save_steps=1000, -... eval_steps=1000, -... logging_steps=25, -... load_best_model_at_end=True, -... metric_for_best_model="wer", -... greater_is_better=False, -... push_to_hub=True, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=encoded_minds["train"], -... eval_dataset=encoded_minds["test"], -... tokenizer=processor.feature_extractor, -... data_collator=data_collator, -... compute_metrics=compute_metrics, -... 
) - ->>> trainer.train() -``` - -Una vez que el entrenamiento haya sido completado, comparte tu modelo en el Hub con el método [`~transformers.Trainer.push_to_hub`] para que todo el mundo pueda usar tu modelo: - -```py ->>> trainer.push_to_hub() -``` - - - - - -Para ver un ejemplo más detallado de cómo hacerle fine-tuning a un modelo para reconocimiento automático del habla, échale un vistazo a esta [entrada de blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) para ASR en inglés y a esta [entrada](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) para ASR multilingüe. - - - -## Inferencia - -¡Genial, ahora que le has hecho fine-tuning a un modelo, puedes usarlo para inferencia! - -Carga el archivo de audio sobre el cual quieras correr la inferencia. ¡Recuerda re-muestrar la tasa de muestreo del archivo de audio para que sea la misma del modelo si es necesario! - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train") ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ->>> sampling_rate = dataset.features["audio"].sampling_rate ->>> audio_file = dataset[0]["audio"]["path"] -``` - -La manera más simple de probar tu modelo para hacer inferencia es usarlo en un [`pipeline`]. Puedes instanciar un `pipeline` para reconocimiento automático del habla con tu modelo y pasarle tu archivo de audio: - -```py ->>> from transformers import pipeline - ->>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") ->>> transcriber(audio_file) -{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} -``` - - - -La transcripción es decente, pero podría ser mejor. ¡Intenta hacerle fine-tuning a tu modelo con más ejemplos para obtener resultados aún mejores! - - - -También puedes replicar de forma manual los resultados del `pipeline` si lo deseas: - - - -Carga un procesador para preprocesar el archivo de audio y la transcripción y devuelve el `input` como un tensor de PyTorch: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") -``` - -Pásale tus entradas al modelo y devuelve los logits: - -```py ->>> from transformers import AutoModelForCTC - ->>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") ->>> with torch.no_grad(): -... logits = model(**inputs).logits -``` - -Obtén los identificadores de los tokens con mayor probabilidad en las predicciones y usa el procesador para decodificarlos y transformarlos en texto: - -```py ->>> import torch - ->>> predicted_ids = torch.argmax(logits, dim=-1) ->>> transcription = processor.batch_decode(predicted_ids) ->>> transcription -['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] -``` - - diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md new file mode 100644 index 000000000000..3a959aa934ff --- /dev/null +++ b/docs/source/es/tasks/image_classification.md @@ -0,0 +1,173 @@ + + +# Clasificación de imágenes + + + +La clasificación de imágenes asigna una etiqueta o clase a una imagen. A diferencia de la clasificación de texto o audio, las entradas son los valores de los píxeles que representan una imagen. 
La clasificación de imágenes tiene muchos usos, como la detección de daños tras una catástrofe, el control de la salud de los cultivos o la búsqueda de signos de enfermedad en imágenes médicas. + +Esta guía te mostrará cómo hacer fine-tune al [ViT](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vit) en el dataset [Food-101](https://huggingface.co/datasets/food101) para clasificar un alimento en una imagen. + + + +Consulta la [página de la tarea](https://huggingface.co/tasks/image-classification) de clasificación de imágenes para obtener más información sobre sus modelos, datasets y métricas asociadas. + + + +## Carga el dataset Food-101 + +Carga solo las primeras 5000 imágenes del dataset Food-101 de la biblioteca 🤗 Datasets ya que es bastante grande: + +```py +>>> from datasets import load_dataset + +>>> food = load_dataset("food101", split="train[:5000]") +``` + +Divide el dataset en un train y un test set: + +```py +>>> food = food.train_test_split(test_size=0.2) +``` + +A continuación, observa un ejemplo: + +```py +>>> food["train"][0] +{'image': , + 'label': 79} +``` + +El campo `image` contiene una imagen PIL, y cada `label` es un número entero que representa una clase. Crea un diccionario que asigne un nombre de label a un entero y viceversa. El mapeo ayudará al modelo a recuperar el nombre de label a partir del número de la misma: + +```py +>>> labels = food["train"].features["label"].names +>>> label2id, id2label = dict(), dict() +>>> for i, label in enumerate(labels): +... label2id[label] = str(i) +... id2label[str(i)] = label +``` + +Ahora puedes convertir el número de label en un nombre de label para obtener más información: + +```py +>>> id2label[str(79)] +'prime_rib' +``` + +Cada clase de alimento - o label - corresponde a un número; `79` indica una costilla de primera en el ejemplo anterior. + +## Preprocesa + +Carga el image processor de ViT para procesar la imagen en un tensor: + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +``` + +Aplica varias transformaciones de imagen al dataset para hacer el modelo más robusto contra el overfitting. En este caso se utilizará el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. Recorta una parte aleatoria de la imagen, cambia su tamaño y normalízala con la media y la desviación estándar de la imagen: + +```py +>>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor + +>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +>>> _transforms = Compose([RandomResizedCrop(image_processor.size["height"]), ToTensor(), normalize]) +``` + +Crea una función de preprocesamiento que aplique las transformaciones y devuelva los `pixel_values` - los inputs al modelo - de la imagen: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] +... del examples["image"] +... return examples +``` + +Utiliza el método [`with_transform`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?#datasets.Dataset.with_transform) de 🤗 Dataset para aplicar las transformaciones sobre todo el dataset. Las transformaciones se aplican sobre la marcha cuando se carga un elemento del dataset: + +```py +>>> food = food.with_transform(transforms) +``` + +Utiliza [`DefaultDataCollator`] para crear un batch de ejemplos.
A diferencia de otros data collators en 🤗 Transformers, el DefaultDataCollator no aplica un preprocesamiento adicional como el padding. + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + +## Entrena +Carga ViT con [`AutoModelForImageClassification`]. Especifica el número de labels, y pasa al modelo el mapping entre el número de label y la clase de label: + +```py +>>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer + +>>> model = AutoModelForImageClassification.from_pretrained( +... "google/vit-base-patch16-224-in21k", +... num_labels=len(labels), +... id2label=id2label, +... label2id=label2id, +... ) +``` + + + +Si no estás familiarizado con el fine-tuning de un modelo con el [`Trainer`], echa un vistazo al tutorial básico [aquí](../training#finetune-with-trainer)! + + + +Al llegar a este punto, solo quedan tres pasos: + +1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. Es importante que no elimines las columnas que no se utilicen, ya que esto hará que desaparezca la columna `image`. Sin la columna `image` no puedes crear `pixel_values`. Establece `remove_unused_columns=False` para evitar este comportamiento. +2. Pasa los training arguments al [`Trainer`] junto con el modelo, los datasets, tokenizer y data collator. +3. Llama [`~Trainer.train`] para hacer fine-tune de tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... per_device_train_batch_size=16, +... evaluation_strategy="steps", +... num_train_epochs=4, +... fp16=True, +... save_steps=100, +... eval_steps=100, +... logging_steps=10, +... learning_rate=2e-4, +... save_total_limit=2, +... remove_unused_columns=False, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... data_collator=data_collator, +... train_dataset=food["train"], +... eval_dataset=food["test"], +... tokenizer=image_processor, +... ) + +>>> trainer.train() +``` + + + +Para ver un ejemplo más a profundidad de cómo hacer fine-tune a un modelo para clasificación de imágenes, echa un vistazo al correspondiente [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). + + diff --git a/docs/source/es/tasks/image_classification.mdx b/docs/source/es/tasks/image_classification.mdx deleted file mode 100644 index 9b8b03207d08..000000000000 --- a/docs/source/es/tasks/image_classification.mdx +++ /dev/null @@ -1,169 +0,0 @@ - - -# Clasificación de imágenes - - - -La clasificación de imágenes asigna una etiqueta o clase a una imagen. A diferencia de la clasificación de texto o audio, las entradas son los valores de los píxeles que representan una imagen. La clasificación de imágenes tiene muchos usos, como la detección de daños tras una catástrofe, el control de la salud de los cultivos o la búsqueda de signos de enfermedad en imágenes médicas. - -Esta guía te mostrará como hacer fine-tune al [ViT](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/vit) en el dataset [Food-101](https://huggingface.co/datasets/food101) para clasificar un alimento en una imagen. - - - -Consulta la [página de la tarea](https://huggingface.co/tasks/audio-classification) de clasificación de imágenes para obtener más información sobre sus modelos, datasets y métricas asociadas. 
- - - -## Carga el dataset Food-101 - -Carga solo las primeras 5000 imágenes del dataset Food-101 de la biblioteca 🤗 de Datasets ya que es bastante grande: - -```py ->>> from datasets import load_dataset - ->>> food = load_dataset("food101", split="train[:5000]") -``` - -Divide el dataset en un train y un test set: - -```py ->>> food = food.train_test_split(test_size=0.2) -``` - -A continuación, observa un ejemplo: - -```py ->>> food["train"][0] -{'image': , - 'label': 79} -``` - -El campo `image` contiene una imagen PIL, y cada `label` es un número entero que representa una clase. Crea un diccionario que asigne un nombre de label a un entero y viceversa. El mapeo ayudará al modelo a recuperar el nombre de label a partir del número de la misma: - -```py ->>> labels = food["train"].features["label"].names ->>> label2id, id2label = dict(), dict() ->>> for i, label in enumerate(labels): -... label2id[label] = str(i) -... id2label[str(i)] = label -``` - -Ahora puedes convertir el número de label en un nombre de label para obtener más información: - -```py ->>> id2label[str(79)] -'prime_rib' -``` - -Cada clase de alimento - o label - corresponde a un número; `79` indica una costilla de primera en el ejemplo anterior. - -## Preprocesa - -Carga el feature extractor de ViT para procesar la imagen en un tensor: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") -``` - -Aplica varias transformaciones de imagen al dataset para hacer el modelo más robusto contra el overfitting. En este caso se utilizará el módulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) de torchvision. Recorta una parte aleatoria de la imagen, cambia su tamaño y normalízala con la media y la desviación estándar de la imagen: - -```py ->>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose([RandomResizedCrop(feature_extractor.size), ToTensor(), normalize]) -``` - -Crea una función de preprocesamiento que aplique las transformaciones y devuelva los `pixel_values` - los inputs al modelo - de la imagen: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]] -... del examples["image"] -... return examples -``` - -Utiliza el método [`with_transform`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?#datasets.Dataset.with_transform) de 🤗 Dataset para aplicar las transformaciones sobre todo el dataset. Las transformaciones se aplican sobre la marcha cuando se carga un elemento del dataset: - -```py ->>> food = food.with_transform(transforms) -``` - -Utiliza [`DefaultDataCollator`] para crear un batch de ejemplos. A diferencia de otros data collators en 🤗 Transformers, el DefaultDataCollator no aplica un preprocesamiento adicional como el padding. - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - -## Entrena -Carga ViT con [`AutoModelForImageClassification`]. Especifica el número de labels, y pasa al modelo el mapping entre el número de label y la clase de label: - -```py ->>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer - ->>> model = AutoModelForImageClassification.from_pretrained( -... "google/vit-base-patch16-224-in21k", -... 
num_labels=len(labels), -... id2label=id2label, -... label2id=label2id, -... ) -``` - - - -Si no estás familiarizado con el fine-tuning de un modelo con el [`Trainer`], echa un vistazo al tutorial básico [aquí](../training#finetune-with-trainer)! - - - -Al llegar a este punto, solo quedan tres pasos: - -1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. Es importante que no elimines las columnas que no se utilicen, ya que esto hará que desaparezca la columna `image`. Sin la columna `image` no puedes crear `pixel_values`. Establece `remove_unused_columns=False` para evitar este comportamiento. -2. Pasa los training arguments al [`Trainer`] junto con el modelo, los datasets, tokenizer y data collator. -3. Llama [`~Trainer.train`] para hacer fine-tune de tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... per_device_train_batch_size=16, -... evaluation_strategy="steps", -... num_train_epochs=4, -... fp16=True, -... save_steps=100, -... eval_steps=100, -... logging_steps=10, -... learning_rate=2e-4, -... save_total_limit=2, -... remove_unused_columns=False, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... data_collator=data_collator, -... train_dataset=food["train"], -... eval_dataset=food["test"], -... tokenizer=feature_extractor, -... ) - ->>> trainer.train() -``` - - - -Para ver un ejemplo más a profundidad de cómo hacer fine-tune a un modelo para clasificación de imágenes, echa un vistazo al correspondiente [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). - - diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md new file mode 100644 index 000000000000..66ac8fb0d4b5 --- /dev/null +++ b/docs/source/es/tasks/language_modeling.md @@ -0,0 +1,423 @@ + + +# Modelado de lenguaje + +El modelado de lenguaje predice palabras en un enunciado. Hay dos formas de modelado de lenguaje. + + + +El modelado de lenguaje causal predice el siguiente token en una secuencia de tokens, y el modelo solo puede considerar los tokens a la izquierda. + + + +El modelado de lenguaje por enmascaramiento predice un token enmascarado en una secuencia, y el modelo puede considerar los tokens bidireccionalmente. + +Esta guía te mostrará cómo realizar fine-tuning [DistilGPT2](https://huggingface.co/distilgpt2) para modelos de lenguaje causales y [DistilRoBERTa](https://huggingface.co/distilroberta-base) para modelos de lenguaje por enmascaramiento en el [r/askscience](https://www.reddit.com/r/askscience/) subdataset [ELI5](https://huggingface.co/datasets/eli5). + + + +Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/bert-base-uncased) siguiendo los mismos pasos presentados en esta guía! + +Mira la [página de tarea](https://huggingface.co/tasks/text-generation) para generación de texto y la [página de tarea](https://huggingface.co/tasks/fill-mask) para modelos de lenguajes por enmascaramiento para obtener más información sobre los modelos, datasets, y métricas asociadas. 
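Como referencia rápida, el siguiente esbozo mínimo muestra cómo se ven ambas variantes en inferencia con [`pipeline`], usando los mismos checkpoints preentrenados que se ajustan más adelante en esta guía (la frase de entrada y el número de tokens generados son solo ejemplos ilustrativos, no forman parte del dataset de la guía):

```py
>>> from transformers import pipeline

>>> # Modelado causal: el modelo continúa el texto usando solo el contexto a la izquierda
>>> generator = pipeline("text-generation", model="distilgpt2")
>>> generator("Somatic hypermutation allows the immune system to", max_new_tokens=20)

>>> # Modelado por enmascaramiento: el modelo predice el token <mask> usando el contexto en ambas direcciones
>>> fill_masker = pipeline("fill-mask", model="distilroberta-base")
>>> fill_masker("The Milky Way is a <mask> galaxy.")
```

Después del fine-tuning de las siguientes secciones puedes reutilizar estos mismos pipelines apuntando al directorio donde guardes tu modelo.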
+ + + +## Carga el dataset ELI5 + +Carga solo los primeros 5000 registros desde la biblioteca 🤗 Datasets, dado que es bastante grande: + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5", split="train_asks[:5000]") +``` + +Divide este dataset en subdatasets para el entrenamiento y el test: + +```py +eli5 = eli5.train_test_split(test_size=0.2) +``` + +Luego observa un ejemplo: + +```py +>>> eli5["train"][0] +{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], + 'score': [6, 3], + 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, + 'answers_urls': {'url': []}, + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls': {'url': []}} +``` + +Observa que `text` es un subcampo anidado dentro del diccionario `answers`. Cuando preproceses el dataset, deberás extraer el subcampo `text` en una columna aparte. + +## Preprocesamiento + + + +Para modelados de lenguaje causales carga el tokenizador DistilGPT2 para procesar el subcampo `text`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +``` + + + +Para modelados de lenguaje por enmascaramiento carga el tokenizador DistilRoBERTa, en lugar de DistilGPT2: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") +``` + +Extrae el subcampo `text` desde su estructura anidado con el método [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten): + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'answers.a_id': ['c3d1aib', 'c3d4lya'], + 'answers.score': [6, 3], + 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. 
If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", + "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], + 'answers_urls.url': [], + 'document': '', + 'q_id': 'nyxfp', + 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', + 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], + 'subreddit': 'askscience', + 'title': 'Few questions about this space walk photograph.', + 'title_urls.url': []} +``` + +Cada subcampo es ahora una columna separada, como lo indica el prefijo `answers`. Observa que `answers.text` es una lista. En lugar de tokenizar cada enunciado por separado, convierte la lista en un string para tokenizarlos conjuntamente. + +Así es como puedes crear una función de preprocesamiento para convertir la lista en una cadena y truncar las secuencias para que no superen la longitud máxima de input de DistilGPT2: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) +``` + +Usa de 🤗 Datasets la función [`map`](https://huggingface.co/docs/datasets/process#map) para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez y aumentar la cantidad de procesos con `num_proc`. Elimina las columnas que no necesitas: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +Ahora necesitas una segunda función de preprocesamiento para capturar el texto truncado de cualquier ejemplo demasiado largo para evitar cualquier pérdida de información. Esta función de preprocesamiento debería: + +- Concatenar todo el texto. +- Dividir el texto concatenado en trozos más pequeños definidos por un `block_size`. + +```py +>>> block_size = 128 + + +>>> def group_texts(examples): +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... total_length = (total_length // block_size) * block_size +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... result["labels"] = result["input_ids"].copy() +... return result +``` + +Aplica la función `group_texts` sobre todo el dataset: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +Para modelados de lenguaje causales, usa [`DataCollatorForLanguageModeling`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto a la dimensión del elemento más largo del lote para que de esta manera tengan largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. + + + +Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. 
Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) +``` + +Para modelados de lenguaje por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) +``` + + +Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") +``` + +Para modelados de lenguajes por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") +``` + + + +## Modelado de lenguaje causal + +El modelado de lenguaje causal es frecuentemente utilizado para generación de texto. Esta sección te muestra cómo realizar fine-tuning a [DistilGPT2](https://huggingface.co/distilgpt2) para generar nuevo texto. + +### Entrenamiento + + + +Carga DistilGPT2 con [`AutoModelForCausalLM`]: + +```py +>>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + + + +Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +A este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. +3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y las etiquetas en `columns`, si deseas mezclar el dataset, el tamaño de lote y el data collator: + +```py +>>> tf_train_set = lm_dataset["train"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = lm_dataset["test"].to_tf_dataset( +...
columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! + + + +Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Carga DistilGPT2 con [`TFAutoModelForCausalLM`]: + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) +``` + + + +## Modelado de lenguaje por enmascaramiento + +El modelado de lenguaje por enmascaramiento es también conocido como una tarea de rellenar la máscara, pues predice un token enmascarado dada una secuencia. Los modelos de lenguaje por enmascaramiento requieren una buena comprensión del contexto de una secuencia entera, en lugar de solo el contexto a la izquierda. Esta sección te enseña como realizar el fine-tuning de [DistilRoBERTa](https://huggingface.co/distilroberta-base) para predecir una palabra enmascarada. + +### Entrenamiento + + + +Carga DistilRoBERTa con [`AutoModelForMaskedlM`]: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + + + +Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +A este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. +3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning de tu modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... num_train_epochs=3, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator: + +```py +>>> tf_train_set = lm_dataset["train"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = lm_dataset["test"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... dummy_labels=True, +... 
shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! + + + +Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Carga DistilRoBERTa con [`TFAutoModelForMaskedLM`]: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base") +``` + +Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) +``` + + + + + +Para un ejemplo más profundo sobre cómo realizar el fine-tuning sobre un modelo de lenguaje causal, considera +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +o [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + \ No newline at end of file diff --git a/docs/source/es/tasks/language_modeling.mdx b/docs/source/es/tasks/language_modeling.mdx deleted file mode 100644 index 565185072a11..000000000000 --- a/docs/source/es/tasks/language_modeling.mdx +++ /dev/null @@ -1,419 +0,0 @@ - - -# Modelado de lenguaje - -El modelado de lenguaje predice palabras en un enunciado. Hay dos formas de modelado de lenguaje. - - - -El modelado de lenguaje causal predice el siguiente token en una secuencia de tokens, y el modelo solo puede considerar los tokens a la izquierda. - - - -El modelado de lenguaje por enmascaramiento predice un token enmascarado en una secuencia, y el modelo puede considerar los tokens bidireccionalmente. - -Esta guía te mostrará cómo realizar fine-tuning [DistilGPT2](https://huggingface.co/distilgpt2) para modelos de lenguaje causales y [DistilRoBERTa](https://huggingface.co/distilroberta-base) para modelos de lenguaje por enmascaramiento en el [r/askscience](https://www.reddit.com/r/askscience/) subdataset [ELI5](https://huggingface.co/datasets/eli5). - - - -Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/bert-base-uncased) siguiendo los mismos pasos presentados en esta guía! - -Mira la [página de tarea](https://huggingface.co/tasks/text-generation) para generación de texto y la [página de tarea](https://huggingface.co/tasks/fill-mask) para modelos de lenguajes por enmascaramiento para obtener más información sobre los modelos, datasets, y métricas asociadas.
- - - -## Carga el dataset ELI5 - -Carga solo los primeros 5000 registros desde la biblioteca 🤗 Datasets, dado que es bastante grande: - -```py ->>> from datasets import load_dataset - ->>> eli5 = load_dataset("eli5", split="train_asks[:5000]") -``` - -Divide este dataset en subdatasets para el entrenamiento y el test: - -```py -eli5 = eli5.train_test_split(test_size=0.2) -``` - -Luego observa un ejemplo: - -```py ->>> eli5["train"][0] -{'answers': {'a_id': ['c3d1aib', 'c3d4lya'], - 'score': [6, 3], - 'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]}, - 'answers_urls': {'url': []}, - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']}, - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls': {'url': []}} -``` - -Observa que `text` es un subcampo anidado dentro del diccionario `answers`. Cuando preproceses el dataset, deberás extraer el subcampo `text` en una columna aparte. - -## Preprocesamiento - - - -Para modelados de lenguaje causales carga el tokenizador DistilGPT2 para procesar el subcampo `text`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") -``` - - - -Para modelados de lenguaje por enmascaramiento carga el tokenizador DistilRoBERTa, en lugar de DistilGPT2: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") -``` - -Extrae el subcampo `text` desde su estructura anidado con el método [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten): - -```py ->>> eli5 = eli5.flatten() ->>> eli5["train"][0] -{'answers.a_id': ['c3d1aib', 'c3d4lya'], - 'answers.score': [6, 3], - 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. 
If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.", - "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"], - 'answers_urls.url': [], - 'document': '', - 'q_id': 'nyxfp', - 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?', - 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'], - 'subreddit': 'askscience', - 'title': 'Few questions about this space walk photograph.', - 'title_urls.url': []} -``` - -Cada subcampo es ahora una columna separada, como lo indica el prefijo `answers`. Observa que `answers.text` es una lista. En lugar de tokenizar cada enunciado por separado, convierte la lista en un string para tokenizarlos conjuntamente. - -Así es como puedes crear una función de preprocesamiento para convertir la lista en una cadena y truncar las secuencias para que no superen la longitud máxima de input de DistilGPT2: - -```py ->>> def preprocess_function(examples): -... return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True) -``` - -Usa de 🤗 Datasets la función [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez y aumentar la cantidad de procesos con `num_proc`. Elimina las columnas que no necesitas: - -```py ->>> tokenized_eli5 = eli5.map( -... preprocess_function, -... batched=True, -... num_proc=4, -... remove_columns=eli5["train"].column_names, -... ) -``` - -Ahora necesitas una segunda función de preprocesamiento para capturar el texto truncado de cualquier ejemplo demasiado largo para evitar cualquier pérdida de información. Esta función de preprocesamiento debería: - -- Concatenar todo el texto. -- Dividir el texto concatenado en trozos más pequeños definidos por un `block_size`. - -```py ->>> block_size = 128 - - ->>> def group_texts(examples): -... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} -... total_length = len(concatenated_examples[list(examples.keys())[0]]) -... total_length = (total_length // block_size) * block_size -... result = { -... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] -... for k, t in concatenated_examples.items() -... } -... result["labels"] = result["input_ids"].copy() -... return result -``` - -Aplica la función `group_texts` sobre todo el dataset: - -```py ->>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) -``` - -Para modelados de lenguaje causales, usa [`DataCollatorForLanguageModeling`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto a la dimensión del elemento más largo del lote para que de esta manera tengan largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. 
- - - -Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) -``` - -Para modelados de lenguaje por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> tokenizer.pad_token = tokenizer.eos_token ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) -``` - - -Puedes usar el token de final de secuencia como el token de relleno y asignar `mlm=False`. Esto usará los inputs como etiquetas movidas un elemento hacia la derecha: - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") -``` - -Para modelados de lenguajes por enmascaramiento usa el mismo [`DataCollatorForLanguageModeling`] excepto que deberás especificar `mlm_probability` para enmascarar tokens aleatoriamente cada vez que iteras sobre los datos. - -```py ->>> from transformers import DataCollatorForLanguageModeling - ->>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") -``` - - - -## Modelado de lenguaje causal - -El modelado de lenguaje causal es frecuentemente utilizado para generación de texto. Esta sección te muestra cómo realizar fine-tuning a [DistilGPT2](https://huggingface.co/distilgpt2) para generar nuevo texto. - -### Entrenamiento - - - -Carga DistilGPT2 con [`AutoModelForCausalLM`]: - -```py ->>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer - ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - - - -Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -A este punto, solo faltan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. -3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator: - -```py ->>> tf_train_set = lm_dataset["train"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... 
) - ->>> tf_test_set = lm_dataset["test"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! - - - -Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Carga DistilGPT2 con [`TFAutoModelForCausalLM`]: - -```py ->>> from transformers import TFAutoModelForCausalLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) -``` - - - -## Modelado de lenguaje por enmascaramiento - -El modelado de lenguaje por enmascaramiento es también conocido como una tarea de rellenar la máscara, pues predice un token enmascarado dada una secuencia. Los modelos de lenguaje por enmascaramiento requieren una buena comprensión del contexto de una secuencia entera, en lugar de solo el contexto a la izquierda. Esta sección te enseña como realizar el fine-tuning de [DistilRoBERTa](https://huggingface.co/distilroberta-base) para predecir una palabra enmascarada. - -### Entrenamiento - - - -Carga DistilRoBERTa con [`AutoModelForMaskedlM`]: - -```py ->>> from transformers import AutoModelForMaskedLM - ->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base") -``` - - - -Si no estás familiarizado con el proceso de realizar fine-tuning sobre un modelo con [`Trainer`], considera el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -A este punto, solo faltan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos de entrenamiento a [`Trainer`] junto con el modelo, dataset, y el data collator. -3. Realiza la llamada [`~Trainer.train`] para realizar el fine-tuning de tu modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... num_train_epochs=3, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=lm_dataset["train"], -... eval_dataset=lm_dataset["test"], -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator: - -```py ->>> tf_train_set = lm_dataset["train"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = lm_dataset["test"].to_tf_dataset( -... 
columns=["attention_mask", "input_ids", "labels"], -... dummy_labels=True, -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Si no estás familiarizado con realizar fine-tuning de tus modelos con Keras, considera el tutorial básico [aquí](training#finetune-with-keras)! - - - -Crea la función optimizadora, la tasa de aprendizaje, y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Carga DistilRoBERTa con [`TFAutoModelForMaskedLM`]: - -```py ->>> from transformers import TFAutoModelForMaskedLM - ->>> model = TFAutoModelForCausalLM.from_pretrained("distilroberta-base") -``` - -Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) -``` - - - - - -Para un ejemplo más profundo sobre cómo realizar el fine-tuning sobre un modelo de lenguaje causal, considera -[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) -o [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). - - \ No newline at end of file diff --git a/docs/source/es/tasks/multiple_choice.md b/docs/source/es/tasks/multiple_choice.md new file mode 100644 index 000000000000..8391dcbdd5eb --- /dev/null +++ b/docs/source/es/tasks/multiple_choice.md @@ -0,0 +1,292 @@ + + +# Selección múltiple + +La tarea de selección múltiple es parecida a la de responder preguntas, con la excepción de que se dan varias opciones de respuesta junto con el contexto. El modelo se entrena para escoger la respuesta correcta +entre varias opciones a partir del contexto dado. + +Esta guía te mostrará como hacerle fine-tuning a [BERT](https://huggingface.co/bert-base-uncased) en la configuración `regular` del dataset [SWAG](https://huggingface.co/datasets/swag), de forma +que seleccione la mejor respuesta a partir de varias opciones y algún contexto. + +## Cargar el dataset SWAG + +Carga el dataset SWAG con la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +Ahora, échale un vistazo a un ejemplo del dataset: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +Los campos `sent1` y `sent2` muestran cómo comienza una oración, y cada campo `ending` indica cómo podría terminar. Dado el comienzo de la oración, el modelo debe escoger el final de oración correcto indicado por el campo `label`. 
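Para ver con más claridad qué debe escoger el modelo, este pequeño esbozo (usa únicamente los campos del ejemplo mostrado arriba) reconstruye las cuatro oraciones candidatas del primer ejemplo y marca la correcta según `label`:

```py
>>> example = swag["train"][0]

>>> # El comienzo compartido por las cuatro opciones
>>> start = example["sent1"] + " " + example["sent2"]

>>> # Cada opción completa ese comienzo con uno de los cuatro finales
>>> for i in range(4):
...     ending = example["ending" + str(i)]
...     marca = " <- correcta" if i == example["label"] else ""
...     print(f"{i}: {start} {ending}{marca}")
```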
+ +## Preprocesamiento + +Carga el tokenizer de BERT para procesar el comienzo de cada oración y los cuatro finales posibles: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +La función de preprocesamiento debe hacer lo siguiente: + +1. Hacer cuatro copias del campo `sent1` de forma que se pueda combinar cada una con el campo `sent2` para recrear la forma en que empieza la oración. +2. Combinar `sent2` con cada uno de los cuatro finales de oración posibles. +3. Aplanar las dos listas para que puedas tokenizarlas, y luego des-aplanarlas para que cada ejemplo tenga los campos `input_ids`, `attention_mask` y `labels` correspondientes. + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +🤗 Transformers no tiene un collator de datos para la tarea de selección múltiple, así que tendrías que crear uno. Puedes adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos para selección múltiple. Este también +le *añadirá relleno de manera dinámica* a tu texto y a las etiquetas para que tengan la longitud del elemento más largo en su lote, de forma que tengan una longitud uniforme. Aunque es posible rellenar el texto en la función `tokenizer` haciendo +`padding=True`, el rellenado dinámico es más eficiente. + +El `DataCollatorForMultipleChoice` aplanará todas las entradas del modelo, les aplicará relleno y luego des-aplanará los resultados: + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Collator de datos que le añadirá relleno de forma automática a las entradas recibidas para +... una tarea de selección múltiple. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +...
pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... return batch +``` + + + +## Entrenamiento + + + +Carga el modelo BERT con [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + + + +Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo quedan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos del entrenamiento al [`Trainer`] jnto con el modelo, el dataset, el tokenizer y el collator de datos. +3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... tokenizer=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... 
) +``` + + + +Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! + + + +Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +Carga el modelo BERT con [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> model.compile(optimizer=optimizer) +``` + +Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2) +``` + + diff --git a/docs/source/es/tasks/multiple_choice.mdx b/docs/source/es/tasks/multiple_choice.mdx deleted file mode 100644 index 2ece0969bf96..000000000000 --- a/docs/source/es/tasks/multiple_choice.mdx +++ /dev/null @@ -1,288 +0,0 @@ - - -# Selección múltiple - -La tarea de selección múltiple es parecida a la de responder preguntas, con la excepción de que se dan varias opciones de respuesta junto con el contexto. El modelo se entrena para escoger la respuesta correcta -entre varias opciones a partir del contexto dado. - -Esta guía te mostrará como hacerle fine-tuning a [BERT](https://huggingface.co/bert-base-uncased) en la configuración `regular` del dataset [SWAG](https://huggingface.co/datasets/swag), de forma -que seleccione la mejor respuesta a partir de varias opciones y algún contexto. - -## Cargar el dataset SWAG - -Carga el dataset SWAG con la biblioteca 🤗 Datasets: - -```py ->>> from datasets import load_dataset - ->>> swag = load_dataset("swag", "regular") -``` - -Ahora, échale un vistazo a un ejemplo del dataset: - -```py ->>> swag["train"][0] -{'ending0': 'passes by walking down the street playing their instruments.', - 'ending1': 'has heard approaching them.', - 'ending2': "arrives and they're outside dancing and asleep.", - 'ending3': 'turns the lead singer watches the performance.', - 'fold-ind': '3416', - 'gold-source': 'gold', - 'label': 0, - 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', - 'sent2': 'A drum line', - 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', - 'video-id': 'anetv_jkn6uvmqwh4'} -``` - -Los campos `sent1` y `sent2` muestran cómo comienza una oración, y cada campo `ending` indica cómo podría terminar. Dado el comienzo de la oración, el modelo debe escoger el final de oración correcto indicado por el campo `label`. - -## Preprocesmaiento - -Carga el tokenizer de BERT para procesar el comienzo de cada oración y los cuatro finales posibles: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -``` - -La función de preprocesmaiento debe hacer lo siguiente: - -1. Hacer cuatro copias del campo `sent1` de forma que se pueda combinar cada una con el campo `sent2` para recrear la forma en que empieza la oración. -2. 
Combinar `sent2` con cada uno de los cuatro finales de oración posibles. -3. Aplanar las dos listas para que puedas tokenizarlas, y luego des-aplanarlas para que cada ejemplo tenga los campos `input_ids`, `attention_mask` y `labels` correspondientes. - -```py ->>> ending_names = ["ending0", "ending1", "ending2", "ending3"] - - ->>> def preprocess_function(examples): -... first_sentences = [[context] * 4 for context in examples["sent1"]] -... question_headers = examples["sent2"] -... second_sentences = [ -... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) -... ] - -... first_sentences = sum(first_sentences, []) -... second_sentences = sum(second_sentences, []) - -... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) -... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} -``` - -Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. - -```py -tokenized_swag = swag.map(preprocess_function, batched=True) -``` - -🤗 Transformers no tiene un collator de datos para la tarea de selección múltiple, así que tendrías que crear uno. Puedes adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos para selección múltiple. Este también -le *añadirá relleno de manera dinámica* a tu texto y a las etiquetas para que tengan la longitud del elemento más largo en su lote, de forma que tengan una longitud uniforme. Aunque es posible rellenar el texto en la función `tokenizer` haciendo -`padding=True`, el rellenado dinámico es más eficiente. - -El `DataCollatorForMultipleChoice` aplanará todas las entradas del modelo, les aplicará relleno y luego des-aplanará los resultados: - - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import torch - - ->>> @dataclass -... class DataCollatorForMultipleChoice: -... """ -... Collator de datos que le añadirá relleno de forma automática a las entradas recibidas para -... una tarea de selección múltiple. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="pt", -... ) - -... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} -... batch["labels"] = torch.tensor(labels, dtype=torch.int64) -... return batch -``` - - -```py ->>> from dataclasses import dataclass ->>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy ->>> from typing import Optional, Union ->>> import tensorflow as tf - - ->>> @dataclass -... 
class DataCollatorForMultipleChoice: -... """ -... Data collator that will dynamically pad the inputs for multiple choice received. -... """ - -... tokenizer: PreTrainedTokenizerBase -... padding: Union[bool, str, PaddingStrategy] = True -... max_length: Optional[int] = None -... pad_to_multiple_of: Optional[int] = None - -... def __call__(self, features): -... label_name = "label" if "label" in features[0].keys() else "labels" -... labels = [feature.pop(label_name) for feature in features] -... batch_size = len(features) -... num_choices = len(features[0]["input_ids"]) -... flattened_features = [ -... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features -... ] -... flattened_features = sum(flattened_features, []) - -... batch = self.tokenizer.pad( -... flattened_features, -... padding=self.padding, -... max_length=self.max_length, -... pad_to_multiple_of=self.pad_to_multiple_of, -... return_tensors="tf", -... ) - -... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} -... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) -... return batch -``` - - - -## Entrenamiento - - - -Carga el modelo BERT con [`AutoModelForMultipleChoice`]: - -```py ->>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer - ->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - - - -Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -En este punto, solo quedan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos del entrenamiento al [`Trainer`] jnto con el modelo, el dataset, el tokenizer y el collator de datos. -3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=5e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_swag["train"], -... eval_dataset=tokenized_swag["validation"], -... tokenizer=tokenizer, -... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. - -```py ->>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_swag["train"], -... shuffle=True, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_swag["validation"], -... shuffle=False, -... batch_size=batch_size, -... collate_fn=data_collator, -... ) -``` - - - -Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! 
- - - -Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_train_epochs = 2 ->>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs ->>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) -``` - -Carga el modelo BERT con [`TFAutoModelForMultipleChoice`]: - -```py ->>> from transformers import TFAutoModelForMultipleChoice - ->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") -``` - -Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> model.compile(optimizer=optimizer) -``` - -Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2) -``` - - diff --git a/docs/source/es/tasks/question_answering.md b/docs/source/es/tasks/question_answering.md new file mode 100644 index 000000000000..2aa896142e2e --- /dev/null +++ b/docs/source/es/tasks/question_answering.md @@ -0,0 +1,275 @@ + + +# Respuesta a preguntas + + + +La respuesta a preguntas devuelve una respuesta a partir de una pregunta dada. Existen dos formas comunes de responder preguntas: + +- Extractiva: extraer la respuesta a partir del contexto dado. +- Abstractiva: generar una respuesta que responda correctamente la pregunta a partir del contexto dado. + +Esta guía te mostrará como hacer fine-tuning de [DistilBERT](https://huggingface.co/distilbert-base-uncased) en el dataset [SQuAD](https://huggingface.co/datasets/squad) para responder preguntas de forma extractiva. + + + +Revisa la [página de la tarea](https://huggingface.co/tasks/question-answering) de responder preguntas para tener más información sobre otras formas de responder preguntas y los modelos, datasets y métricas asociadas. + + + +## Carga el dataset SQuAD + +Carga el dataset SQuAD con la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> squad = load_dataset("squad") +``` + +Ahora, échale un vistazo a una muestra: + +```py +>>> squad["train"][0] +{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, + 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', + 'id': '5733be284776f41900661182', + 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', + 'title': 'University_of_Notre_Dame' +} +``` + +El campo `answers` es un diccionario que contiene la posición inicial de la respuesta y el `texto` de la respuesta. 
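+
+Como referencia, en SQuAD el valor de `answer_start` es la posición (en caracteres) de la respuesta dentro de `context`, así que puedes recuperar la respuesta con un simple corte de texto. Un pequeño esbozo ilustrativo con el ejemplo anterior:
+
+```py
+>>> example = squad["train"][0]
+>>> start_char = example["answers"]["answer_start"][0]
+>>> end_char = start_char + len(example["answers"]["text"][0])
+>>> example["context"][start_char:end_char]  # debería devolver 'Saint Bernadette Soubirous'
+```
+
+La función de preprocesamiento de la siguiente sección usa exactamente esta relación para calcular las posiciones de inicio y fin a nivel de token.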
+ +## Preprocesamiento + + + +Carga el tokenizer de DistilBERT para procesar los campos `question` (pregunta) y `context` (contexto): + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Hay algunos pasos de preprocesamiento específicos para la tarea de respuesta a preguntas que debes tener en cuenta: + +1. Algunos ejemplos en un dataset pueden tener un contexto que supera la longitud máxima de entrada de un modelo. Trunca solamente el contexto asignándole el valor `"only_second"` al parámetro `truncation`. +2. A continuación, mapea las posiciones de inicio y fin de la respuesta al contexto original asignándole el valor `True` al parámetro `return_offsets_mapping`. +3. Una vez tengas el mapeo, puedes encontrar los tokens de inicio y fin de la respuesta. Usa el método [`sequence_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.sequence_ids) +para encontrar qué parte de la lista de tokens desplazados corresponde a la pregunta y cuál corresponde al contexto. + +A continuación puedes ver como se crea una función para truncar y mapear los tokens de inicio y fin de la respuesta al `context`: + +```py +>>> def preprocess_function(examples): +... questions = [q.strip() for q in examples["question"]] +... inputs = tokenizer( +... questions, +... examples["context"], +... max_length=384, +... truncation="only_second", +... return_offsets_mapping=True, +... padding="max_length", +... ) + +... offset_mapping = inputs.pop("offset_mapping") +... answers = examples["answers"] +... start_positions = [] +... end_positions = [] + +... for i, offset in enumerate(offset_mapping): +... answer = answers[i] +... start_char = answer["answer_start"][0] +... end_char = answer["answer_start"][0] + len(answer["text"][0]) +... sequence_ids = inputs.sequence_ids(i) + +... # Encuentra el inicio y el fin del contexto +... idx = 0 +... while sequence_ids[idx] != 1: +... idx += 1 +... context_start = idx +... while sequence_ids[idx] == 1: +... idx += 1 +... context_end = idx - 1 + +... # Si la respuesta entera no está dentro del contexto, etiquétala como (0, 0) +... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: +... start_positions.append(0) +... end_positions.append(0) +... else: +... # De lo contrario, esta es la posición de los tokens de inicio y fin +... idx = context_start +... while idx <= context_end and offset[idx][0] <= start_char: +... idx += 1 +... start_positions.append(idx - 1) + +... idx = context_end +... while idx >= context_start and offset[idx][1] >= end_char: +... idx -= 1 +... end_positions.append(idx + 1) + +... inputs["start_positions"] = start_positions +... inputs["end_positions"] = end_positions +... return inputs +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. +Quita las columnas que no necesites: + +```py +>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) +``` + +Usa el [`DefaultDataCollator`] para crear un lote de ejemplos. A diferencia de los otros collators de datos en 🤗 Transformers, el `DefaultDataCollator` no aplica ningún procesamiento adicional (como el rellenado). 
+ + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## Entrenamiento + + + +Carga el modelo DistilBERT con [`AutoModelForQuestionAnswering`]: + +```py +>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer + +>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo quedan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. +2. Pasarle los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer y el collator de datos. +3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. + +```py +>>> training_args = TrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_squad["train"], +... eval_dataset=tokenized_squad["validation"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_squad["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_squad["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! + + + +Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 2 +>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs +>>> optimizer, schedule = create_optimizer( +... init_lr=2e-5, +... num_warmup_steps=0, +... num_train_steps=total_train_steps, +... 
) +``` + +Carga el modelo DistilBERT con [`TFAutoModelForQuestionAnswering`]: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") +``` + +Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3) +``` + + + + + +Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para responder preguntas, échale un vistazo al +[cuaderno de PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) o al +[cuaderno de TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb) correspondiente. + + diff --git a/docs/source/es/tasks/question_answering.mdx b/docs/source/es/tasks/question_answering.mdx deleted file mode 100644 index d599fa8f1a37..000000000000 --- a/docs/source/es/tasks/question_answering.mdx +++ /dev/null @@ -1,271 +0,0 @@ - - -# Respuesta a preguntas - - - -La respuesta a preguntas devuelve una respuesta a partir de una pregunta dada. Existen dos formas comunes de responder preguntas: - -- Extractiva: extraer la respuesta a partir del contexto dado. -- Abstractiva: generar una respuesta que responda correctamente la pregunta a partir del contexto dado. - -Esta guía te mostrará como hacer fine-tuning de [DistilBERT](https://huggingface.co/distilbert-base-uncased) en el dataset [SQuAD](https://huggingface.co/datasets/squad) para responder preguntas de forma extractiva. - - - -Revisa la [página de la tarea](https://huggingface.co/tasks/question-answering) de responder preguntas para tener más información sobre otras formas de responder preguntas y los modelos, datasets y métricas asociadas. - - - -## Carga el dataset SQuAD - -Carga el dataset SQuAD con la biblioteca 🤗 Datasets: - -```py ->>> from datasets import load_dataset - ->>> squad = load_dataset("squad") -``` - -Ahora, échale un vistazo a una muestra: - -```py ->>> squad["train"][0] -{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, - 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', - 'id': '5733be284776f41900661182', - 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', - 'title': 'University_of_Notre_Dame' -} -``` - -El campo `answers` es un diccionario que contiene la posición inicial de la respuesta y el `texto` de la respuesta. 
- -## Preprocesamiento - - - -Carga el tokenizer de DistilBERT para procesar los campos `question` (pregunta) y `context` (contexto): - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Hay algunos pasos de preprocesamiento específicos para la tarea de respuesta a preguntas que debes tener en cuenta: - -1. Algunos ejemplos en un dataset pueden tener un contexto que supera la longitud máxima de entrada de un modelo. Trunca solamente el contexto asignándole el valor `"only_second"` al parámetro `truncation`. -2. A continuación, mapea las posiciones de inicio y fin de la respuesta al contexto original asignándole el valor `True` al parámetro `return_offsets_mapping`. -3. Una vez tengas el mapeo, puedes encontrar los tokens de inicio y fin de la respuesta. Usa el método [`sequence_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.sequence_ids) -para encontrar qué parte de la lista de tokens desplazados corresponde a la pregunta y cuál corresponde al contexto. - -A continuación puedes ver como se crea una función para truncar y mapear los tokens de inicio y fin de la respuesta al `context`: - -```py ->>> def preprocess_function(examples): -... questions = [q.strip() for q in examples["question"]] -... inputs = tokenizer( -... questions, -... examples["context"], -... max_length=384, -... truncation="only_second", -... return_offsets_mapping=True, -... padding="max_length", -... ) - -... offset_mapping = inputs.pop("offset_mapping") -... answers = examples["answers"] -... start_positions = [] -... end_positions = [] - -... for i, offset in enumerate(offset_mapping): -... answer = answers[i] -... start_char = answer["answer_start"][0] -... end_char = answer["answer_start"][0] + len(answer["text"][0]) -... sequence_ids = inputs.sequence_ids(i) - -... # Encuentra el inicio y el fin del contexto -... idx = 0 -... while sequence_ids[idx] != 1: -... idx += 1 -... context_start = idx -... while sequence_ids[idx] == 1: -... idx += 1 -... context_end = idx - 1 - -... # Si la respuesta entera no está dentro del contexto, etiquétala como (0, 0) -... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: -... start_positions.append(0) -... end_positions.append(0) -... else: -... # De lo contrario, esta es la posición de los tokens de inicio y fin -... idx = context_start -... while idx <= context_end and offset[idx][0] <= start_char: -... idx += 1 -... start_positions.append(idx - 1) - -... idx = context_end -... while idx >= context_start and offset[idx][1] >= end_char: -... idx -= 1 -... end_positions.append(idx + 1) - -... inputs["start_positions"] = start_positions -... inputs["end_positions"] = end_positions -... return inputs -``` - -Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la función de preprocesamiento al dataset entero. Puedes acelerar la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. -Quita las columnas que no necesites: - -```py ->>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) -``` - -Usa el [`DefaultDataCollator`] para crear un lote de ejemplos. A diferencia de los otros collators de datos en 🤗 Transformers, el `DefaultDataCollator` no aplica ningún procesamiento adicional (como el rellenado). 
- - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator() -``` - - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -## Entrenamiento - - - -Carga el modelo DistilBERT con [`AutoModelForQuestionAnswering`]: - -```py ->>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer - ->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -Para familiarizarte con el fine-tuning con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -En este punto, solo quedan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`TrainingArguments`]. -2. Pasarle los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer y el collator de datos. -3. Invocar el método [`~Trainer.train`] para realizar el fine-tuning del modelo. - -```py ->>> training_args = TrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... num_train_epochs=3, -... weight_decay=0.01, -... ) - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_squad["train"], -... eval_dataset=tokenized_squad["validation"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para realizar el fine-tuning de un modelo en TensorFlow, primero convierte tus datasets al formato `tf.data.Dataset` con el método [`~TFPreTrainedModel.prepare_tf_dataset`]. - -```py ->>> tf_train_set = model.prepare_tf_dataset( -... tokenized_squad["train"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_validation_set = model.prepare_tf_dataset( -... tokenized_squad["validation"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! - - - -Prepara una función de optimización, un programa para la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer - ->>> batch_size = 16 ->>> num_epochs = 2 ->>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs ->>> optimizer, schedule = create_optimizer( -... init_lr=2e-5, -... num_warmup_steps=0, -... num_train_steps=total_train_steps, -... 
) -``` - -Carga el modelo DistilBERT con [`TFAutoModelForQuestionAnswering`]: - -```py ->>> from transformers import TFAutoModelForQuestionAnswering - ->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased") -``` - -Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> import tensorflow as tf - ->>> model.compile(optimizer=optimizer) -``` - -Invoca el método [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3) -``` - - - - - -Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para responder preguntas, échale un vistazo al -[cuaderno de PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) o al -[cuaderno de TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb) correspondiente. - - diff --git a/docs/source/es/tasks/summarization.md b/docs/source/es/tasks/summarization.md new file mode 100644 index 000000000000..b545e4216e5d --- /dev/null +++ b/docs/source/es/tasks/summarization.md @@ -0,0 +1,226 @@ + + +# Generación de resúmenes + + + +La generación de resúmenes (summarization, en inglés) crea una versión más corta de un documento o un artículo que resume toda su información importante. Junto con la traducción, es un ejemplo de una tarea que puede ser formulada como una tarea secuencia a secuencia. La generación de resúmenes puede ser: + +- Extractiva: Extrae la información más relevante de un documento. +- Abstractiva: Genera un texto nuevo que captura la información más importante. + +Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva. + + + +Consulta la [página de la tarea](https://huggingface.co/tasks/summarization) de generación de resúmenes para obtener más información sobre sus modelos, datasets y métricas asociadas. + + + +## Carga el dataset BillSum + +Carga el dataset BillSum de la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> billsum = load_dataset("billsum", split="ca_test") +``` + +Divide el dataset en un set de train y un set de test: + +```py +>>> billsum = billsum.train_test_split(test_size=0.2) +``` + +A continuación, observa un ejemplo: + +```py +>>> billsum["train"][0] +{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. 
Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', + 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor 
is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', + 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} +``` + +El campo `text` es el input y el campo `summary` es el objetivo. 
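+
+Como referencia, puede ser útil comparar la longitud del texto con la del resumen; eso ayuda a entender por qué más adelante la entrada se trunca a 1024 tokens y el resumen a 128. Un esbozo ilustrativo (las longitudes exactas dependen del ejemplo):
+
+```py
+>>> example = billsum["train"][0]
+>>> len(example["text"]), len(example["summary"])  # longitudes en caracteres; el texto es mucho más largo que el resumen
+```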
+ +## Preprocesa + +Carga el tokenizador T5 para procesar `text` y `summary`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("t5-small") +``` + +La función de preprocesamiento necesita: + +1. Agregar un prefijo al input; una clave para que T5 sepa que se trata de una tarea de generación de resúmenes. Algunos modelos capaces de realizar múltiples tareas de NLP requieren una clave que indique la tarea específica. +2. Usar el argumento `text_target` para tokenizar etiquetas. +3. Truncar secuencias para que no sean más largas que la longitud máxima fijada por el parámetro `max_length`. + +```py +>>> prefix = "summarize: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + doc for doc in examples["text"]] +... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) + +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) + +... model_inputs["labels"] = labels["input_ids"] +... return model_inputs +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez: + +```py +>>> tokenized_billsum = billsum.map(preprocess_function, batched=True) +``` + +Usa [`DataCollatorForSeq2Seq`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto y etiquetas a la dimensión del elemento más largo del lote para que tengan un largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) +``` + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") +``` + + + +## Entrenamiento + + + +Carga T5 con [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") +``` + + + +Para familiarizarte con el proceso para realizar fine-tuning sobre un modelo con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`Seq2SeqTrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Seq2SeqTrainer`] junto con el modelo, dataset y data collator. +3. Llamar [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=1, +... fp16=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_billsum["train"], +... eval_dataset=tokenized_billsum["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para hacer fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`~datasets.Dataset.to_tf_dataset`]. 
Especifica los inputs y etiquetas en `columns`, el tamaño de lote, el data collator, y si es necesario mezclar el dataset: + +```py +>>> tf_train_set = tokenized_billsum["train"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = tokenized_billsum["test"].to_tf_dataset( +... columns=["attention_mask", "input_ids", "labels"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + + + +Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! + + + +Crea la función optimizadora, establece la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +Carga T5 con [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") +``` + +Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> model.compile(optimizer=optimizer) +``` + +Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) +``` + + + + + +Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para generación de resúmenes, revisa la +[notebook en PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) +o a la [notebook en TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). + + diff --git a/docs/source/es/tasks/summarization.mdx b/docs/source/es/tasks/summarization.mdx deleted file mode 100644 index c09c4b0b833a..000000000000 --- a/docs/source/es/tasks/summarization.mdx +++ /dev/null @@ -1,222 +0,0 @@ - - -# Generación de resúmenes - - - -La generación de resúmenes (summarization, en inglés) crea una versión más corta de un documento o un artículo que resume toda su información importante. Junto con la traducción, es un ejemplo de una tarea que puede ser formulada como una tarea secuencia a secuencia. La generación de resúmenes puede ser: - -- Extractiva: Extrae la información más relevante de un documento. -- Abstractiva: Genera un texto nuevo que captura la información más importante. - -Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva. - - - -Consulta la [página de la tarea](https://huggingface.co/tasks/summarization) de generación de resúmenes para obtener más información sobre sus modelos, datasets y métricas asociadas. 
- - - -## Carga el dataset BillSum - -Carga el dataset BillSum de la biblioteca 🤗 Datasets: - -```py ->>> from datasets import load_dataset - ->>> billsum = load_dataset("billsum", split="ca_test") -``` - -Divide el dataset en un set de train y un set de test: - -```py ->>> billsum = billsum.train_test_split(test_size=0.2) -``` - -A continuación, observa un ejemplo: - -```py ->>> billsum["train"][0] -{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. 
Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', - 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the 
benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', - 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} -``` - -El campo `text` es el input y el campo `summary` es el objetivo. - -## Preprocesa - -Carga el tokenizador T5 para procesar `text` y `summary`: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -La función de preprocesamiento necesita: - -1. Agregar un prefijo al input; una clave para que T5 sepa que se trata de una tarea de generación de resúmenes. Algunos modelos capaces de realizar múltiples tareas de NLP requieren una clave que indique la tarea específica. -2. Usar el argumento `text_target` para tokenizar etiquetas. -3. Truncar secuencias para que no sean más largas que la longitud máxima fijada por el parámetro `max_length`. - -```py ->>> prefix = "summarize: " - - ->>> def preprocess_function(examples): -... inputs = [prefix + doc for doc in examples["text"]] -... 
model_inputs = tokenizer(inputs, max_length=1024, truncation=True) - -... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) - -... model_inputs["labels"] = labels["input_ids"] -... return model_inputs -``` - -Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez: - -```py ->>> tokenized_billsum = billsum.map(preprocess_function, batched=True) -``` - -Usa [`DataCollatorForSeq2Seq`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto y etiquetas a la dimensión del elemento más largo del lote para que tengan un largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. - - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) -``` - - -```py ->>> from transformers import DataCollatorForSeq2Seq - ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") -``` - - - -## Entrenamiento - - - -Carga T5 con [`AutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer - ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - - - -Para familiarizarte con el proceso para realizar fine-tuning sobre un modelo con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! - - - -En este punto, solo faltan tres pasos: - -1. Definir tus hiperparámetros de entrenamiento en [`Seq2SeqTrainingArguments`]. -2. Pasarle los argumentos de entrenamiento a [`Seq2SeqTrainer`] junto con el modelo, dataset y data collator. -3. Llamar [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. - -```py ->>> training_args = Seq2SeqTrainingArguments( -... output_dir="./results", -... evaluation_strategy="epoch", -... learning_rate=2e-5, -... per_device_train_batch_size=16, -... per_device_eval_batch_size=16, -... weight_decay=0.01, -... save_total_limit=3, -... num_train_epochs=1, -... fp16=True, -... ) - ->>> trainer = Seq2SeqTrainer( -... model=model, -... args=training_args, -... train_dataset=tokenized_billsum["train"], -... eval_dataset=tokenized_billsum["test"], -... tokenizer=tokenizer, -... data_collator=data_collator, -... ) - ->>> trainer.train() -``` - - -Para hacer fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`~datasets.Dataset.to_tf_dataset`]. Especifica los inputs y etiquetas en `columns`, el tamaño de lote, el data collator, y si es necesario mezclar el dataset: - -```py ->>> tf_train_set = tokenized_billsum["train"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... shuffle=True, -... batch_size=16, -... collate_fn=data_collator, -... ) - ->>> tf_test_set = tokenized_billsum["test"].to_tf_dataset( -... columns=["attention_mask", "input_ids", "labels"], -... shuffle=False, -... batch_size=16, -... collate_fn=data_collator, -... ) -``` - - - -Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)! 
- - - -Crea la función optimizadora, establece la tasa de aprendizaje y algunos hiperparámetros de entrenamiento: - -```py ->>> from transformers import create_optimizer, AdamWeightDecay - ->>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) -``` - -Carga T5 con [`TFAutoModelForSeq2SeqLM`]: - -```py ->>> from transformers import TFAutoModelForSeq2SeqLM - ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") -``` - -Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): - -```py ->>> model.compile(optimizer=optimizer) -``` - -Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo: - -```py ->>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3) -``` - - - - - -Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para generación de resúmenes, revisa la -[notebook en PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) -o a la [notebook en TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb). - - \ No newline at end of file diff --git a/docs/source/es/training.md b/docs/source/es/training.md new file mode 100644 index 000000000000..7b7b0657bd8f --- /dev/null +++ b/docs/source/es/training.md @@ -0,0 +1,371 @@ + + +# Fine-tuning a un modelo pre-entrenado + +[[open-in-colab]] + +El uso de un modelo pre-entrenado tiene importantes ventajas. Reduce los costos de computación, la huella de carbono y te permite utilizar modelos de última generación sin tener que entrenar uno desde cero. + +* Fine-tuning a un modelo pre-entrenado con 🤗 Transformers [`Trainer`]. +* Fine-tuning a un modelo pre-entrenado en TensorFlow con Keras. +* Fine-tuning a un modelo pre-entrenado en PyTorch nativo. + + + +## Prepara un dataset + + + +Antes de aplicar fine-tuning a un modelo pre-entrenado, descarga un dataset y prepáralo para el entrenamiento. El tutorial anterior nos enseñó cómo procesar los datos para el entrenamiento, y ahora es la oportunidad de poner a prueba estas habilidades. + +Comienza cargando el dataset de [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset[100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. 
But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +Como ya sabes, necesitas un tokenizador para procesar el texto e incluir una estrategia para el padding y el truncamiento para manejar cualquier longitud de secuencia variable. Para procesar tu dataset en un solo paso, utiliza el método de 🤗 Datasets map para aplicar una función de preprocesamiento sobre todo el dataset: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +Si lo deseas, puedes crear un subconjunto más pequeño del dataset completo para aplicarle fine-tuning y así reducir el tiempo. + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Fine-tuning con `Trainer` + + + +🤗 Transformers proporciona una clase [`Trainer`] optimizada para el entrenamiento de modelos de 🤗 Transformers, haciendo más fácil el inicio del entrenamiento sin necesidad de escribir manualmente tu propio ciclo. La API del [`Trainer`] soporta una amplia gama de opciones de entrenamiento y características como el logging, el gradient accumulation y el mixed precision. + +Comienza cargando tu modelo y especifica el número de labels previstas. A partir del [Card Dataset](https://huggingface.co/datasets/yelp_review_full#data-fields) de Yelp Review, que como ya sabemos tiene 5 labels: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + + + +Verás una advertencia acerca de que algunos de los pesos pre-entrenados no están siendo utilizados y que algunos pesos están siendo inicializados al azar. No te preocupes, esto es completamente normal. +El head/cabezal pre-entrenado del modelo BERT se descarta y se sustituye por un head de clasificación inicializado aleatoriamente. Puedes aplicar fine-tuning a este nuevo head del modelo en tu tarea de clasificación de secuencias haciendo transfer learning del modelo pre-entrenado. + + + +### Hiperparámetros de entrenamiento + +A continuación, crea una clase [`TrainingArguments`] que contenga todos los hiperparámetros que puedes ajustar así como los indicadores para activar las diferentes opciones de entrenamiento. Para este tutorial puedes empezar con los [hiperparámetros](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) de entrenamiento por defecto, pero siéntete libre de experimentar con ellos para encontrar tu configuración óptima. + +Especifica dónde vas a guardar los checkpoints de tu entrenamiento: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer") +``` + +### Métricas + +El [`Trainer`] no evalúa automáticamente el rendimiento del modelo durante el entrenamiento. Tendrás que pasarle a [`Trainer`] una función para calcular y hacer un reporte de las métricas. 
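+A modo de referencia, un esquema mínimo e ilustrativo (no es necesario para seguir el tutorial) del contrato de esa función, basado en la clase `EvalPrediction` de 🤗 Transformers, que es el objeto que el [`Trainer`] construye internamente: la función recibe las predicciones del modelo (logits) junto con las etiquetas y debe devolver un diccionario con el nombre y el valor de cada métrica.
+
+```py
+>>> import numpy as np
+>>> from transformers import EvalPrediction
+
+>>> # El Trainer le pasa a tu función un objeto equivalente a este
+>>> eval_pred = EvalPrediction(predictions=np.array([[0.1, 2.3], [1.5, 0.2]]), label_ids=np.array([1, 0]))
+>>> logits, labels = eval_pred  # se puede desempaquetar como una tupla
+>>> # El valor devuelto debe ser un diccionario {nombre_de_métrica: valor}
+>>> {"accuracy": float((np.argmax(logits, axis=-1) == labels).mean())}
+{'accuracy': 1.0}
+```
+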
En la práctica, la biblioteca de 🤗 Datasets proporciona una función de [`accuracy`](https://huggingface.co/metrics/accuracy) simple que puedes cargar con la función `load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics.html) para más información): + +```py +>>> import numpy as np +>>> from datasets import load_metric + +>>> metric = load_metric("accuracy") +``` + +Llama al método `compute` de `metric` para calcular el accuracy de tus predicciones. Antes de pasar tus predicciones a `compute`, necesitas convertir los logits en predicciones (recuerda que todos los modelos de 🤗 Transformers devuelven logits). + +```py +>>> def compute_metrics(eval_pred): +... logits, labels = eval_pred +... predictions = np.argmax(logits, axis=-1) +... return metric.compute(predictions=predictions, references=labels) +``` + +Si quieres controlar tus métricas de evaluación durante el fine-tuning, especifica el parámetro `evaluation_strategy` en tus argumentos de entrenamiento para que se reporte la métrica de evaluación al final de cada época: + +```py +>>> from transformers import TrainingArguments + +>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") +``` + +### Trainer + +Crea un objeto [`Trainer`] con tu modelo, argumentos de entrenamiento, datasets de entrenamiento y de prueba, y tu función de evaluación: + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +A continuación, aplica fine-tuning a tu modelo llamando [`~transformers.Trainer.train`]: + +```py +>>> trainer.train() +``` + + + +## Fine-tuning con Keras + + + +Los modelos de 🤗 Transformers también permiten realizar el entrenamiento en TensorFlow con la API de Keras. Solo es necesario hacer algunos cambios antes de hacer fine-tuning. + +### Convierte el dataset al formato de TensorFlow + +El [`DefaultDataCollator`] junta los tensores en un batch para que el modelo se entrene en él. Asegúrate de especificar `return_tensors` para devolver los tensores de TensorFlow: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +[`Trainer`] utiliza [`DataCollatorWithPadding`] por defecto por lo que no es necesario especificar explícitamente un intercalador de datos (data collator, en inglés). + + + +A continuación, convierte los datasets tokenizados en datasets de TensorFlow con el método [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica tus entradas en `columns` y tu etiqueta en `label_cols`: + +```py +>>> tf_train_dataset = small_train_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols="labels", +... shuffle=True, +... collate_fn=data_collator, +... batch_size=8, +... ) + +>>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols="labels", +... shuffle=False, +... collate_fn=data_collator, +... batch_size=8, +...
) +``` + +### Compila y ajusta + +Carguemos un modelo TensorFlow con el número esperado de labels: + +```py +>>> import tensorflow as tf +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +A continuación, compila y aplica fine-tuning a tu modelo con [`fit`](https://keras.io/api/models/model_training_apis/) como lo harías con cualquier otro modelo de Keras: + +```py +>>> model.compile( +... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), +... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), +... metrics=tf.metrics.SparseCategoricalAccuracy(), +... ) + +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) +``` + + + +## Fine-tune en PyTorch nativo + + + +El [`Trainer`] se encarga del ciclo de entrenamiento y permite aplicar fine-tuning a un modelo en una sola línea de código. Para los que prefieran escribir su propio ciclo de entrenamiento, también pueden aplicar fine-tuning a un modelo de 🤗 Transformers en PyTorch nativo. + +En este punto, es posible que necesites reiniciar tu notebook o ejecutar el siguiente código para liberar algo de memoria: + +```py +import torch + +del model +del trainer +torch.cuda.empty_cache() +``` + +A continuación, haremos un post-procesamiento manual a `tokenized_datasets` y así prepararlo para el entrenamiento. + +1. Elimina la columna de `text` porque el modelo no acepta texto en crudo como entrada: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Cambia el nombre de la columna de `label` a `labels` porque el modelo espera que el argumento se llame `labels`: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. Establece el formato del dataset para devolver tensores PyTorch en lugar de listas: + + ```py + >>> tokenized_datasets.set_format("torch") + ``` + +A continuación, crea un subconjunto más pequeño del dataset como se ha mostrado anteriormente para acelerar el fine-tuning: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + +### DataLoader + +Crea un `DataLoader` para tus datasets de entrenamiento y de prueba para poder iterar sobre batches de datos: + +```py +>>> from torch.utils.data import DataLoader + +>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) +``` + +Carga tu modelo con el número de labels previstas: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +### Optimiza y programa el learning rate + +Crea un optimizador y un scheduler de learning rate para aplicar fine-tuning al modelo. Vamos a utilizar el optimizador [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) de PyTorch: + +```py +>>> from torch.optim import AdamW + +>>> optimizer = AdamW(model.parameters(), lr=5e-5) +``` + +Crea el scheduler de learning rate por defecto que utiliza el [`Trainer`]: + +```py +>>> from transformers import get_scheduler + +>>> num_epochs = 3 +>>> num_training_steps = num_epochs * len(train_dataloader) +>>> lr_scheduler = get_scheduler( +... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps +...
) +``` + +Por último, especifica el `device` o entorno de ejecución para utilizar una GPU si tienes acceso a una. De lo contrario, el entrenamiento en una CPU puede llevarte varias horas en lugar de un par de minutos. + +```py +>>> import torch + +>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> model.to(device) +``` + + + +Consigue acceso gratuito a una GPU en la nube si es que no tienes este recurso de forma local con un notebook alojado en [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/). + + + +Genial, ¡ahora podemos entrenar! 🥳 + +### Ciclo de entrenamiento + +Para hacer un seguimiento al progreso del entrenamiento, utiliza la biblioteca [tqdm](https://tqdm.github.io/) para añadir una barra de progreso sobre el número de pasos de entrenamiento: + +```py +>>> from tqdm.auto import tqdm + +>>> progress_bar = tqdm(range(num_training_steps)) + +>>> model.train() +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... outputs = model(**batch) +... loss = outputs.loss +... loss.backward() + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +### Métricas + +De la misma manera que necesitas añadir una función de evaluación al [`Trainer`], necesitas hacer lo mismo cuando escribas tu propio ciclo de entrenamiento. Pero en lugar de calcular y reportar la métrica al final de cada época, esta vez acumularás todos los batches con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) y calcularás la métrica al final. + +```py +>>> metric = load_metric("accuracy") +>>> model.eval() +>>> for batch in eval_dataloader: +... batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + +## Recursos adicionales + +Para más ejemplos de fine-tuning consulta: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) incluye scripts + para entrenar tareas comunes de NLP en PyTorch y TensorFlow. + +- [🤗 Transformers Notebooks](notebooks) contiene varios notebooks sobre cómo aplicar fine-tuning a un modelo para tareas específicas en PyTorch y TensorFlow. diff --git a/docs/source/es/training.mdx b/docs/source/es/training.mdx deleted file mode 100644 index 467df17d1380..000000000000 --- a/docs/source/es/training.mdx +++ /dev/null @@ -1,367 +0,0 @@ - - -# Fine-tuning a un modelo pre-entrenado - -[[open-in-colab]] - -El uso de un modelo pre-entrenado tiene importantes ventajas. Reduce los costos de computación, la huella de carbono y te permite utilizar modelos de última generación sin tener que entrenar uno desde cero. - -* Fine-tuning a un modelo pre-entrenado con 🤗 Transformers [`Trainer`]. -* Fine-tuning a un modelo pre-entrenado en TensorFlow con Keras. -* Fine-tuning a un modelo pre-entrenado en PyTorch nativo. - - - -## Prepara un dataset - - - -Antes de aplicar fine-tuning a un modelo pre-entrenado, descarga un dataset y prepáralo para el entrenamiento. El tutorial anterior nos enseñó cómo procesar los datos para el entrenamiento, y ahora es la oportunidad de poner a prueba estas habilidades. 
- -Comienza cargando el dataset de [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset[100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -Como ya sabes, necesitas un tokenizador para procesar el texto e incluir una estrategia para el padding y el truncamiento para manejar cualquier longitud de secuencia variable. Para procesar tu dataset en un solo paso, utiliza el método de 🤗 Datasets map para aplicar una función de preprocesamiento sobre todo el dataset: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -Si lo deseas, puedes crear un subconjunto más pequeño del dataset completo para aplicarle fine-tuning y así reducir el tiempo. - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Fine-tuning con `Trainer` - - - -🤗 Transformers proporciona una clase [`Trainer`] optimizada para el entrenamiento de modelos de 🤗 Transformers, haciendo más fácil el inicio del entrenamiento sin necesidad de escribir manualmente tu propio ciclo. La API del [`Trainer`] soporta una amplia gama de opciones de entrenamiento y características como el logging, el gradient accumulation y el mixed precision. - -Comienza cargando tu modelo y especifica el número de labels previstas. A partir del [Card Dataset](https://huggingface.co/datasets/yelp_review_full#data-fields) de Yelp Review, que como ya sabemos tiene 5 labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -Verás una advertencia acerca de que algunos de los pesos pre-entrenados no están siendo utilizados y que algunos pesos están siendo inicializados al azar. 
No te preocupes, esto es completamente normal. -El head/cabezal pre-entrenado del modelo BERT se descarta y se sustituye por un head de clasificación inicializado aleatoriamente. Puedes aplicar fine-tuning a este nuevo head del modelo en tu tarea de clasificación de secuencias haciendo transfer learning del modelo pre-entrenado. - - - -### Hiperparámetros de entrenamiento - -A continuación, crea una clase [`TrainingArguments`] que contenga todos los hiperparámetros que puedes ajustar así como los indicadores para activar las diferentes opciones de entrenamiento. Para este tutorial puedes empezar con los [hiperparámetros](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) de entrenamiento por defecto, pero siéntete libre de experimentar con ellos para encontrar tu configuración óptima. - -Especifica dónde vas a guardar los checkpoints de tu entrenamiento: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Métricas - -El [`Trainer`] no evalúa automáticamente el rendimiento del modelo durante el entrenamiento. Tendrás que pasarle a [`Trainer`] una función para calcular y hacer un reporte de las métricas. La biblioteca de 🤗 Datasets proporciona una función de [`accuracy`](https://huggingface.co/metrics/accuracy) simple que puedes cargar con la función `load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics.html) para más información): - -```py ->>> import numpy as np ->>> from datasets import load_metric - ->>> metric = load_metric("accuracy") -``` - -Define la función `compute` en `metric` para calcular el accuracy de tus predicciones. Antes de pasar tus predicciones a `compute`, necesitas convertir las predicciones a logits (recuerda que todos los modelos de 🤗 Transformers devuelven logits). - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -Si quieres controlar tus métricas de evaluación durante el fine-tuning, especifica el parámetro `evaluation_strategy` en tus argumentos de entrenamiento para que el modelo tenga en cuenta la métrica de evaluación al final de cada época: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Crea un objeto [`Trainer`] con tu modelo, argumentos de entrenamiento, datasets de entrenamiento y de prueba, y tu función de evaluación: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -A continuación, aplica fine-tuning a tu modelo llamando [`~transformers.Trainer.train`]: - -```py ->>> trainer.train() -``` - - - -## Fine-tuning con Keras - - - -Los modelos de 🤗 Transformers también permiten realizar el entrenamiento en TensorFlow con la API de Keras. Solo es necesario hacer algunos cambios antes de hacer fine-tuning. - -### Convierte el dataset al formato de TensorFlow - -El [`DefaultDataCollator`] junta los tensores en un batch para que el modelo se entrene en él. 
Asegúrate de especificar `return_tensors` para devolver los tensores de TensorFlow: - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -[`Trainer`] utiliza [`DataCollatorWithPadding`] por defecto por lo que no es necesario especificar explícitamente un intercalador de datos (data collator, en inglés). - - - -A continuación, convierte los datasets tokenizados en datasets de TensorFlow con el método [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica tus entradas en `columns` y tu etiqueta en `label_cols`: - -```py ->>> tf_train_dataset = small_train_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=True, -... collate_fn=data_collator, -... batch_size=8, -... ) - ->>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=False, -... collate_fn=data_collator, -... batch_size=8, -... ) -``` - -### Compila y ajusta - -Carguemos un modelo TensorFlow con el número esperado de labels: - -```py ->>> import tensorflow as tf ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -A continuación, compila y aplica fine-tuning a tu modelo con [`fit`](https://keras.io/api/models/model_training_apis/) como lo harías con cualquier otro modelo de Keras: - -```py ->>> model.compile( -... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), -... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), -... metrics=tf.metrics.SparseCategoricalAccuracy(), -... ) - ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) -``` - - - -## Fine-tune en PyTorch nativo - - - -El [`Trainer`] se encarga del ciclo de entrenamiento y permite aplicar fine-tuning a un modelo en una sola línea de código. Para los que prefieran escribir su propio ciclo de entrenamiento, también pueden aplicar fine-tuning a un modelo de 🤗 Transformers en PyTorch nativo. - -En este punto, es posible que necesites reiniciar tu notebook o ejecutar el siguiente código para liberar algo de memoria: - -```py -del model -del pytorch_model -del trainer -torch.cuda.empty_cache() -``` - -A continuación, haremos un post-procesamiento manual al `tokenized_dataset` y así prepararlo para el entrenamiento. - -1. Elimina la columna de `text` porque el modelo no acepta texto en crudo como entrada: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Cambia el nombre de la columna de `label` a `labels` porque el modelo espera que el argumento se llame `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. 
Establece el formato del dataset para devolver tensores PyTorch en lugar de listas: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -A continuación, crea un subconjunto más pequeño del dataset como se ha mostrado anteriormente para acelerar el fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Crea un `DataLoader` para tus datasets de entrenamiento y de prueba para poder iterar sobre batches de datos: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Carga tu modelo con el número de labels previstas: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Optimiza y programa el learning rate - -Crea un optimizador y el learning rate para aplicar fine-tuning al modelo. Vamos a utilizar el optimizador [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) de PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Crea el learning rate desde el [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Por último, especifica el `device` o entorno de ejecución para utilizar una GPU si tienes acceso a una. De lo contrario, el entrenamiento en una CPU puede llevarte varias horas en lugar de un par de minutos. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Consigue acceso gratuito a una GPU en la nube si es que no tienes este recurso de forma local con un notebook alojado en [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Genial, ¡ahora podemos entrenar! 🥳 - -### Ciclo de entrenamiento - -Para hacer un seguimiento al progreso del entrenamiento, utiliza la biblioteca [tqdm](https://tqdm.github.io/) para añadir una barra de progreso sobre el número de pasos de entrenamiento: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Métricas - -De la misma manera que necesitas añadir una función de evaluación al [`Trainer`], necesitas hacer lo mismo cuando escribas tu propio ciclo de entrenamiento. Pero en lugar de calcular y reportar la métrica al final de cada época, esta vez acumularás todos los batches con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) y calcularás la métrica al final. 
- -```py ->>> metric = load_metric("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - -## Recursos adicionales - -Para más ejemplos de fine-tuning consulta: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) incluye scripts - para entrenar tareas comunes de NLP en PyTorch y TensorFlow. - -- [🤗 Transformers Notebooks](notebooks) contiene varios notebooks sobre cómo aplicar fine-tuning a un modelo para tareas específicas en PyTorch y TensorFlow. diff --git a/docs/source/fr/_config.py b/docs/source/fr/_config.py new file mode 100644 index 000000000000..07f1de5f7db0 --- /dev/null +++ b/docs/source/fr/_config.py @@ -0,0 +1,14 @@ +# docstyle-ignore +INSTALL_CONTENT = """ +# Installation de Transformers +! pip install transformers datasets +# Pour installer à partir du code source au lieu de la dernière version, commentez la commande ci-dessus et décommentez la suivante. +# ! pip install git+https://github.com/huggingface/transformers.git +""" + +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] +black_avoid_patterns = { + "{processor_class}": "FakeProcessorClass", + "{model_class}": "FakeModelClass", + "{object_class}": "FakeObjectClass", +} diff --git a/docs/source/fr/_toctree.yml b/docs/source/fr/_toctree.yml new file mode 100755 index 000000000000..11632a423b6a --- /dev/null +++ b/docs/source/fr/_toctree.yml @@ -0,0 +1,156 @@ +- sections: + - local: index + title: 🤗 Transformers + - local: quicktour + title: Visite rapide + - local: in_translation + title: Installation + title: Démarrer +- sections: + - local: in_translation + title: Pipelines pour l'inférence + - local: in_translation + title: Chargement d'instances pré-entraînées avec une AutoClass + - local: in_translation + title: Préparation des données + - local: in_translation + title: Fine-tune un modèle pré-entraîné + - local: in_translation + title: Entraînement distribué avec 🤗 Accelerate + - local: in_translation + title: Partager un modèle + title: Tutoriels +- sections: + - sections: + - local: in_translation + title: Créer votre architecture + - local: in_translation + title: Partager vos modèles + - local: in_translation + title: Entraînement avec un script + - local: in_translation + title: Entraînement avec Amazon SageMaker + - local: in_translation + title: Convertir depuis des checkpoints Tensorflow + - local: in_translation + title: Exporter vers ONNX + - local: in_translation + title: Exporter vers TorchScript + - local: in_translation + title: Aide au dépannage + title: Usage général + - sections: + - local: in_translation + title: Utiliser les tokenizers de 🤗 Tokenizers + - local: in_translation + title: Inférence avec les modèles multilingues + - local: in_translation + title: Stratégies de génération de texte + - sections: + - isExpanded: false + local: in_translation + title: Classification de texte + - local: in_translation + title: Classification de token + - local: in_translation + title: Système de question-réponse + - local: in_translation + title: Modélisation causale du langage + - local: in_translation + title: Modélisation du langage avec masque + - local: in_translation + title: Traduction + - local: in_translation + title: 
Génération de résumé + - local: in_translation + title: Question à choix multiple + title: Guides des tâches + title: Traitement automatique des langues + - sections: + - local: in_translation + title: Classification audio + - local: in_translation + title: Reconnaissance automatique de la parole + title: Audio + - sections: + - local: in_translation + title: Classification d'images + - local: in_translation + title: Segmentation sémantique + - local: in_translation + title: Classification de vidéos + - local: in_translation + title: Détection d'objets + title: Vision par ordinateur + - sections: + - local: in_translation + title: Performance et extensibilité + - sections: + - local: in_translation + title: Comment contribuer à transformers? + - local: in_translation + title: Comment ajouter un modèle à 🤗 Transformers? + - local: in_translation + title: Comment convertir un modèle 🤗 Transformers vers TensorFlow? + - local: in_translation + title: Comment ajouter un pipeline à 🤗 Transformers? + - local: in_translation + title: Tester + - local: in_translation + title: Vérification pour une Pull Request + title: Contribuer + - local: in_translation + title: 🤗 Transformers Notebooks + - local: in_translation + title: Ressources communautaires + - local: in_translation + title: Benchmarks + - local: in_translation + title: Migration à partir de versions précédentes + title: Guides d'utilisation +- sections: + - local: in_translation + title: Philosophie + - local: in_translation + title: Glossaire + - local: in_translation + title: Qu'est ce 🤗 Transformers peut faire ? + - local: in_translation + title: Quelles tâches 🤗 Transformers peut résoudre ? + - local: in_translation + title: Résumé des modèles + - local: in_translation + title: Résumé des tokenizers + - local: in_translation + title: Remplissage et troncature + - local: in_translation + title: BERTology + - local: in_translation + title: Perplexité des modèles à longueur fixe + - local: in_translation + title: Pipelines pour inférence avec des serveurs web + title: Guides conceptuels +- sections: + - isExpanded: false + sections: + - local: in_translation + title: Classes principales + - local: in_translation + title: Modèles textuels + - local: in_translation + title: Modèles visuels + - local: in_translation + title: Modèles audio + - local: in_translation + title: Modèles multimodal + - local: in_translation + title: Modèles d'apprentissage par renforcement + - local: in_translation + title: Modèles de séries temporelles + - local: in_translation + title: Graph models + title: Modèles + - sections: + - local: in_translation + title: Utilitaires internes + title: API diff --git a/docs/source/fr/in_translation.md b/docs/source/fr/in_translation.md new file mode 100644 index 000000000000..910559ef6c9a --- /dev/null +++ b/docs/source/fr/in_translation.md @@ -0,0 +1,5 @@ + + +# Traduction en cours. \ No newline at end of file diff --git a/docs/source/fr/index.md b/docs/source/fr/index.md new file mode 100644 index 000000000000..9e3e6eb5c236 --- /dev/null +++ b/docs/source/fr/index.md @@ -0,0 +1,410 @@ + + +# 🤗 Transformers + +Apprentissage automatique de pointe pour [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), et [JAX](https://jax.readthedocs.io/en/latest/). + +🤗 Transformers fournit des API et des outils pour télécharger et entraîner facilement des modèles pré-entraînés de pointe. 
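+Par exemple, quelques lignes suffisent pour télécharger un modèle pré-entraîné et l'utiliser pour l'inférence (esquisse indicative avec l'API `pipeline` ; le modèle chargé par défaut est anglophone) :
+
+```py
+from transformers import pipeline
+
+# Télécharge un modèle pré-entraîné d'analyse de sentiment et l'applique à une phrase
+classifier = pipeline("sentiment-analysis")
+print(classifier("We are very happy to show you the 🤗 Transformers library."))
+```
+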
L'utilisation de modèles pré-entraînés peut réduire vos coûts de calcul, votre empreinte carbone, et vous faire économiser le temps et les ressources nécessaires pour entraîner un modèle à partir de zéro. Ces modèles prennent en charge des tâches courantes dans différentes modalités, telles que : + +📝 **Traitement automatique des langues**: classification de texte, reconnaissance d'entités, système de question-réponse, modèle de langage, génération de résumé, traduction, question à choix multiples et génération de texte.
+🖼️ **Vision par ordinateur**: classification d'image, détection d'objet et segmentation.
+🗣️ **Audio**: reconnaissance automatique de la parole et classification audio.
+🐙 **Multimodalité**: système de question-réponse avec des tableaux ou images, reconnaissance optique de caractères, extraction d'information depuis des documents scannés et classification de vidéo. + +🤗 Transformers prend en charge l'interopérabilité entre PyTorch, TensorFlow et JAX. Cela permet d'utiliser un framework différent à chaque étape de la vie d'un modèle, par exemple entraîner un modèle en trois lignes de code avec un framework, et le charger pour l'inférence avec un autre. Les modèles peuvent également être exportés dans un format comme ONNX et TorchScript pour être déployés dans des environnements de production. + +Rejoignez la communauté grandissante sur le [Hub](https://huggingface.co/models), le [forum](https://discuss.huggingface.co/) ou [Discord](https://discord.com/invite/JfAtkvEtRb) dès aujourd'hui ! + +## Si vous cherchez un support personnalisé de l'équipe Hugging Face + + + HuggingFace Expert Acceleration Program + + +## Contents + +La documentation est organisée en 5 parties: + +- **DEMARRER** propose une visite rapide de la bibliothèque et des instructions d'installation pour être opérationnel. +- **TUTORIELS** excellent point de départ pour les débutants. Cette section vous aidera à acquérir les compétences de base dont vous avez besoin pour commencer à utiliser la bibliothèque. +- **GUIDES D'UTILISATION** pour différentes tâches comme par exemple le finetuning d'un modèle pré-entraîné pour la classification de texte ou comment créer et partager votre propre modèle. +- **GUIDES CONCEPTUELS** pour plus de discussions et d'explications sur les concepts et les idées sous-jacentes aux modèles, aux tâches et à la philosophie de conception de 🤗 Transformers. +- **API** décrit toutes les classes et fonctions : + + - **CLASSES PRINCIPALES** détaille les classes les plus importantes comme la configuration, le modèle, le tokenizer et le pipeline.. + - **MODELES** détaille les classes et les fonctions propres à chaque modèle de la bibliothèque. + - **UTILITAIRES INTERNES** détaille les classes et fonctions utilitaires utilisées en interne. + +### Modèles supportés + + + +1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[AltCLIP](model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell. +1. **[Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. 
**[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BioGpt](model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. +1. **[BiT](model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. +1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. 
**[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLIP](model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. +1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[BridgeTower](model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. +1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. +1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. +1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. 
**[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. +1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[Deformable DETR](model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. +1. 
**[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETA](model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. +1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DiNAT](model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi. +1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. +1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientFormer](model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. +1. 
**[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. +1. **[ESM](model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[FLAN-T5](model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. 
**[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. +1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT NeoX Japanese](model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori. +1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GPT-Sw3](model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. +1. **[Graphormer](model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. 
**[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[Jukebox](model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[LiLT](model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. 
**[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[MarkupLM](model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei. +1. **[Mask2Former](model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. 
**[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileNetV1](model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. +1. **[MobileNetV2](model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. +1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[NAT](model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. +1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. 
**[OPT](model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. 
**[REALM](model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. +1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. +1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechT5](model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. +1. 
**[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[Swin2SR](model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. +1. **[SwitchTransformers](model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. +1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[Table Transformer](model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. 
**[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Time Series Transformer](model_doc/time_series_transformer)** (from HuggingFace). +1. **[TimeSformer](model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[UPerNet](model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. +1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. 
**[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViT Hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[ViTMSN](model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas. +1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Whisper](model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. +1. **[X-CLIP](model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. +1. 
**[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Frameworks compatibles + +Le tableau ci-dessous représente la prise en charge actuelle dans la bibliothèque pour chacun de ces modèles, qu'ils aient ou non un tokenizer Python (appelé "slow"). 
Un tokenizer rapide ("fast") soutenu par la bibliothèque 🤗 Tokenizers, qu'ils aient un support en Jax (via Flax), PyTorch, et/ou TensorFlow. + + + +| Modèle | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:-----------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| AltCLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| Audio Spectrogram Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| BioGpt | ✅ | ❌ | ✅ | ❌ | ❌ | +| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| BLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | +| BridgeTower | ❌ | ❌ | ✅ | ❌ | ❌ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | +| Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | +| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Deformable DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | +| DETA | ❌ | ❌ | ✅ | ❌ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DiNAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| EfficientFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ESM | ✅ | ❌ | ✅ | ✅ | ❌ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GIT | ❌ | ❌ | ✅ | ❌ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT NeoX Japanese | ✅ | ❌ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| GPT-Sw3 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Graphormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| GroupViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Jukebox | ✅ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LiLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MarkupLM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Mask2Former | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormerSwin | ❌ | ❌ | ❌ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MobileNetV1 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileNetV2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| NAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nezha | ❌ | ❌ | ✅ | ❌ | 
❌ | +| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OneFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | +| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| PEGASUS-X | ❌ | ❌ | ✅ | ❌ | ❌ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ❌ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ | +| RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| SpeechT5 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | +| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| Swin2SR | ❌ | ❌ | ✅ | ❌ | ❌ | +| SwitchTransformers | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Table Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Time Series Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TimeSformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| ViTMSN | ❌ | ❌ | ✅ | ❌ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| Whisper | ✅ | ❌ | ✅ | ✅ | ❌ | +| X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/fr/quicktour.md b/docs/source/fr/quicktour.md new file mode 100644 index 000000000000..666a931f825f --- /dev/null +++ b/docs/source/fr/quicktour.md @@ -0,0 +1,550 @@ + + +# Visite rapide + +[[open-in-colab]] + +Soyez opérationnel avec 🤗 Transformers ! Que vous soyez un développeur ou un utilisateur lambda, cette visite rapide vous aidera à démarrer et vous montrera comment utiliser le [`pipeline`] pour l'inférence, charger un modèle pré-entraîné et un préprocesseur avec une [AutoClass](./model_doc/auto), et entraîner rapidement un modèle avec PyTorch ou TensorFlow. Si vous êtes un débutant, nous vous recommandons de consulter nos tutoriels ou notre [cours](https://huggingface.co/course/chapter1/1) suivant pour des explications plus approfondies des concepts présentés ici. 
+ +Avant de commencer, assurez-vous que vous avez installé toutes les bibliothèques nécessaires : + +```bash +!pip install transformers datasets +``` + +Vous aurez aussi besoin d'installer votre bibliothèque d'apprentissage profond favorite : + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +## Pipeline + + + +Le [`pipeline`] est le moyen le plus simple d'utiliser un modèle pré-entraîné pour l'inférence. Vous pouvez utiliser le [`pipeline`] prêt à l'emploi pour de nombreuses tâches dans différentes modalités. Consultez le tableau ci-dessous pour connaître les tâches prises en charge : + +| **Tâche** | **Description** | **Modalité** | **Identifiant du pipeline** | +|------------------------------|--------------------------------------------------------------------------------------------------------------|----------------------|-----------------------------------------------| +| Classification de texte | Attribue une catégorie à une séquence de texte donnée | Texte | pipeline(task="sentiment-analysis") | +| Génération de texte | Génère du texte à partir d'une consigne donnée | Texte | pipeline(task="text-generation") | +| Reconnaissance de token nommé | Attribue une catégorie à chaque token dans une séquence (personnes, organisation, localisation, etc.) | Texte | pipeline(task="ner") | +| Question réponse | Extrait une réponse du texte en fonction du contexte et d'une question | Texte | pipeline(task="question-answering") | +| Prédiction de token masqué | Prédit correctement le token masqué dans une séquence | Texte | pipeline(task="fill-mask") | +| Génération de résumé | Génère un résumé d'une séquence de texte donnée ou d'un document | Texte | pipeline(task="summarization") | +| Traduction | Traduit du texte d'un langage à un autre | Texte | pipeline(task="translation") | +| Classification d'image | Attribue une catégorie à une image | Image | pipeline(task="image-classification") | +| Segmentation d'image | Attribue une catégorie à chaque pixel d'une image (supporte la segmentation sémantique, panoptique et d'instance) | Image | pipeline(task="image-segmentation") | +| Détection d'objets | Prédit les délimitations et catégories d'objets dans une image | Image | pipeline(task="object-detection") | +| Classification d'audio | Attribue une catégorie à un fichier audio | Audio | pipeline(task="audio-classification") | +| Reconnaissance automatique de la parole | Extrait le discours d'un fichier audio en texte | Audio | pipeline(task="automatic-speech-recognition") | +| Question réponse visuels | Etant données une image et une question, répond correctement à une question sur l'image | Modalités multiples | pipeline(task="vqa") | + +Commencez par créer une instance de [`pipeline`] et spécifiez la tâche pour laquelle vous souhaitez l'utiliser. Vous pouvez utiliser le [`pipeline`] pour n'importe laquelle des tâches mentionnées dans le tableau précédent. Pour obtenir une liste complète des tâches prises en charge, consultez la documentation de l'[API pipeline](./main_classes/pipelines). Dans ce guide, nous utiliserons le [`pipeline`] pour l'analyse des sentiments à titre d'exemple : + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis") +``` + +Le [`pipeline`] télécharge et stocke en cache un [modèle pré-entraîné](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) et un tokenizer par défaut pour l'analyse des sentiments. 
Vous pouvez maintenant utiliser le `classifier` sur le texte de votre choix : + +```py +>>> classifier("We are very happy to show you the 🤗 Transformers library.") +[{'label': 'POSITIVE', 'score': 0.9998}] +``` + +Si vous voulez classifier plus d'un texte, donnez une liste de textes au [`pipeline`] pour obtenir une liste de dictionnaires en retour : + +```py +>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) +>>> for result in results: +... print(f"label: {result['label']}, avec le score de: {round(result['score'], 4)}") +label: POSITIVE, avec le score de: 0.9998 +label: NEGATIVE, avec le score de: 0.5309 +``` + +Le [`pipeline`] peut aussi itérer sur un jeu de données entier pour n'importe quelle tâche. Prenons par exemple la reconnaissance automatique de la parole : + +```py +>>> import torch +>>> from transformers import pipeline + +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +``` + +Chargez un jeu de données audio (voir le 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) pour plus de détails) sur lequel vous souhaitez itérer. Pour cet exemple, nous chargeons le jeu de données [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) : + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +``` + +Vous devez vous assurer que le taux d'échantillonnage du jeu de données correspond au taux d'échantillonnage sur lequel [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) a été entraîné : + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) +``` + +Les fichiers audio sont automatiquement chargés et rééchantillonnés lors de l'appel de la colonne `"audio"`. +Extrayez les tableaux de formes d'ondes brutes des quatre premiers échantillons et passez-les comme une liste au pipeline : + +```py +>>> result = speech_recognizer(dataset[:4]["audio"]) +>>> print([d["text"] for d in result]) +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] +``` + +Pour les ensembles de données plus importants où les entrées sont volumineuses (comme dans les domaines de la parole ou de la vision), utilisez plutôt un générateur au lieu d'une liste afin d'éviter de charger toutes les entrées en mémoire. Pour plus d'informations, consultez la documentation de l'[API pipeline](./main_classes/pipelines). + +### Utiliser un autre modèle et tokenizer dans le pipeline + +Le [`pipeline`] peut être utilisé avec n'importe quel modèle du [Hub](https://huggingface.co/models), ce qui permet d'adapter facilement le [`pipeline`] à d'autres cas d'utilisation. Par exemple, si vous souhaitez un modèle capable de traiter du texte français, utilisez les filtres du Hub pour trouver un modèle approprié. 
Le premier résultat renvoie un [modèle BERT](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) multilingue finetuné pour l'analyse des sentiments que vous pouvez utiliser pour le texte français : + +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Utilisez [`AutoModelForSequenceClassification`] et [`AutoTokenizer`] pour charger le modèle pré-entraîné et le tokenizer adapté (plus de détails sur une `AutoClass` dans la section suivante) : + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + +Utilisez [`TFAutoModelForSequenceClassification`] et [`AutoTokenizer`] pour charger le modèle pré-entraîné et le tokenizer adapté (plus de détails sur une `TFAutoClass` dans la section suivante) : + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + +Spécifiez le modèle et le tokenizer dans le [`pipeline`], et utilisez le `classifier` sur le texte en français : + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Si vous ne parvenez pas à trouver un modèle adapté à votre cas d'utilisation, vous devrez finetuner un modèle pré-entraîné sur vos données. Jetez un coup d'œil à notre [tutoriel sur le finetuning](./training) pour apprendre comment faire. Enfin, après avoir finetuné votre modèle pré-entraîné, pensez à [partager](./model_sharing) le modèle avec la communauté sur le Hub afin de démocratiser l'apprentissage automatique pour tous ! 🤗 + +## AutoClass + + + +Les classes [`AutoModelForSequenceClassification`] et [`AutoTokenizer`] fonctionnent ensemble pour créer un [`pipeline`] comme celui que vous avez utilisé ci-dessus. Une [AutoClass](./model_doc/auto) est un raccourci qui récupère automatiquement l'architecture d'un modèle pré-entraîné à partir de son nom ou de son emplacement. Il vous suffit de sélectionner l'`AutoClass` appropriée à votre tâche et la classe de prétraitement qui lui est associée. + +Reprenons l'exemple de la section précédente et voyons comment vous pouvez utiliser l'`AutoClass` pour reproduire les résultats du [`pipeline`]. + +### AutoTokenizer + +Un tokenizer est chargé de prétraiter le texte pour en faire un tableau de chiffres qui servira d'entrée à un modèle. De nombreuses règles régissent le processus de tokenisation, notamment la manière de diviser un mot et le niveau auquel les mots doivent être divisés (pour en savoir plus sur la tokenisation, consultez le [résumé](./tokenizer_summary)). La chose la plus importante à retenir est que vous devez instancier un tokenizer avec le même nom de modèle pour vous assurer que vous utilisez les mêmes règles de tokenisation que celles avec lesquelles un modèle a été pré-entraîné. 
+ +Chargez un tokenizer avec [`AutoTokenizer`] : + +```py +>>> from transformers import AutoTokenizer + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + +Passez votre texte au tokenizer : + +```py +>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") +>>> print(encoding) +{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Le tokenizer retourne un dictionnaire contenant : + +* [input_ids](./glossary#input-ids): la représentation numérique des tokens. +* [attention_mask](.glossary#attention-mask): indique quels tokens doivent faire l'objet d'une attention particulière (plus particulièrement les tokens de remplissage). + +Un tokenizer peut également accepter une liste de textes, et remplir et tronquer le texte pour retourner un échantillon de longueur uniforme : + + + + +```py +>>> pt_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + + +```py +>>> tf_batch = tokenizer( +... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + + + +Consultez le tutoriel [prétraitement](./preprocessing) pour plus de détails sur la tokenisation, et sur la manière d'utiliser un [`AutoImageProcessor`], un [`AutoFeatureExtractor`] et un [`AutoProcessor`] pour prétraiter les images, l'audio et les contenus multimodaux. + + + +### AutoModel + + + +🤗 Transformers fournit un moyen simple et unifié de charger des instances pré-entraînées. Cela signifie que vous pouvez charger un [`AutoModel`] comme vous chargeriez un [`AutoTokenizer`]. La seule différence est de sélectionner l'[`AutoModel`] approprié pour la tâche. Pour une classification de texte (ou de séquence de textes), vous devez charger [`AutoModelForSequenceClassification`] : + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Voir le [résumé de la tâche](./task_summary) pour vérifier si elle est prise en charge par une classe [`AutoModel`]. + + + +Maintenant, passez votre échantillon d'entrées prétraitées directement au modèle. Il vous suffit de décompresser le dictionnaire en ajoutant `**` : + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +Le modèle produit les activations finales dans l'attribut `logits`. Appliquez la fonction softmax aux `logits` pour récupérer les probabilités : + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +``` + + +🤗 Transformers fournit un moyen simple et unifié de charger des instances pré-entraînés. Cela signifie que vous pouvez charger un [`TFAutoModel`] comme vous chargeriez un [`AutoTokenizer`]. La seule différence est de sélectionner le [`TFAutoModel`] approprié pour la tâche. 
Pour une classification de texte (ou de séquence de textes), vous devez charger [`TFAutoModelForSequenceClassification`] : + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Voir le [résumé de la tâche](./task_summary) pour vérifier si elle est prise en charge par une classe [`AutoModel`]. + + + +Passez maintenant votre échantillon d'entrées prétraitées directement au modèle en passant les clés du dictionnaire directement aux tensors : + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +Le modèle produit les activations finales dans l'attribut `logits`. Appliquez la fonction softmax aux `logits` pour récupérer les probabilités : + +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +Tous les modèles 🤗 Transformers (PyTorch ou TensorFlow) produisent les tensors *avant* la fonction d'activation finale (comme softmax) car la fonction d'activation finale est souvent fusionnée avec le calcul de la perte. Les structures produites par le modèle sont des classes de données spéciales, de sorte que leurs attributs sont autocomplétés dans un environnement de développement. Les structures produites par le modèle se comportent comme un tuple ou un dictionnaire (vous pouvez les indexer avec un entier, une tranche ou une chaîne), auquel cas les attributs qui sont None sont ignorés. + + + +### Sauvegarder un modèle + + + +Une fois que votre modèle est finetuné, vous pouvez le sauvegarder avec son tokenizer en utilisant [`PreTrainedModel.save_pretrained`] : + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Lorsque vous voulez réutiliser le modèle, rechargez-le avec [`PreTrainedModel.from_pretrained`] : + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Une fois que votre modèle est finetuné, vous pouvez le sauvegarder avec son tokenizer en utilisant [`TFPreTrainedModel.save_pretrained`] : + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Lorsque vous voulez réutiliser le modèle, rechargez-le avec [`TFPreTrainedModel.from_pretrained`] : + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Une fonctionnalité particulièrement cool de 🤗 Transformers est la possibilité d'enregistrer un modèle et de le recharger en tant que modèle PyTorch ou TensorFlow. 
Le paramètre `from_pt` ou `from_tf` permet de convertir le modèle d'un framework à l'autre : + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + + +## Constructions de modèles personnalisés + +Vous pouvez modifier la configuration du modèle pour changer la façon dont un modèle est construit. La configuration spécifie les attributs d'un modèle, tels que le nombre de couches ou de têtes d'attention. Vous partez de zéro lorsque vous initialisez un modèle à partir d'une configuration personnalisée. Les attributs du modèle sont initialisés de manière aléatoire et vous devrez entraîner le modèle avant de pouvoir l'utiliser pour obtenir des résultats significatifs. + +Commencez par importer [`AutoConfig`], puis chargez le modèle pré-entraîné que vous voulez modifier. Dans [`AutoConfig.from_pretrained`], vous pouvez spécifier l'attribut que vous souhaitez modifier, tel que le nombre de têtes d'attention : + +```py +>>> from transformers import AutoConfig + +>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +``` + + + +Créez un modèle personnalisé à partir de votre configuration avec [`AutoModel.from_config`] : + +```py +>>> from transformers import AutoModel + +>>> my_model = AutoModel.from_config(my_config) +``` + + +Créez un modèle personnalisé à partir de votre configuration avec [`TFAutoModel.from_config`] : + +```py +>>> from transformers import TFAutoModel + +>>> my_model = TFAutoModel.from_config(my_config) +``` + + + +Consultez le guide [Créer une architecture personnalisée](./create_a_model) pour plus d'informations sur la création de configurations personnalisées. + +## Trainer - une boucle d'entraînement optimisée par PyTorch + +Tous les modèles sont des [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) standard, vous pouvez donc les utiliser dans n'importe quelle boucle d'entraînement typique. Bien que vous puissiez écrire votre propre boucle d'entraînement, 🤗 Transformers fournit une classe [`Trainer`] pour PyTorch, qui contient la boucle d'entraînement de base et ajoute des fonctionnalités supplémentaires comme l'entraînement distribué, la précision mixte, et plus encore. + +En fonction de votre tâche, vous passerez généralement les paramètres suivants à [`Trainer`] : + +1. Un [`PreTrainedModel`] ou un [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): + + ```py + >>> from transformers import AutoModelForSequenceClassification + + >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. [`TrainingArguments`] contient les hyperparamètres du modèle que vous pouvez changer comme le taux d'apprentissage, la taille de l'échantillon, et le nombre d'époques pour s'entraîner. Les valeurs par défaut sont utilisées si vous ne spécifiez pas d'hyperparamètres d'apprentissage : + + ```py + >>> from transformers import TrainingArguments + + >>> training_args = TrainingArguments( + ... output_dir="path/to/save/folder/", + ... learning_rate=2e-5, + ... per_device_train_batch_size=8, + ... per_device_eval_batch_size=8, + ... num_train_epochs=2, + ... ) + ``` + +3. 
Une classe de prétraitement comme un tokenizer, un processeur d'images ou un extracteur de caractéristiques : + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +4. Chargez un jeu de données : + + ```py + >>> from datasets import load_dataset + + >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT + ``` + +5. Créez une fonction qui transforme le texte du jeu de données en token : + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) + ``` + + Puis appliquez-la à l'intégralité du jeu de données avec [`~datasets.Dataset.map`]: + + ```py + >>> dataset = dataset.map(tokenize_dataset, batched=True) + ``` + +6. Un [`DataCollatorWithPadding`] pour créer un échantillon d'exemples à partir de votre jeu de données : + + ```py + >>> from transformers import DataCollatorWithPadding + + >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + ``` + +Maintenant, rassemblez tous ces éléments dans un [`Trainer`] : + +```py +>>> from transformers import Trainer + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) # doctest: +SKIP +``` + +Une fois que vous êtes prêt, appelez la fonction [`~Trainer.train`] pour commencer l'entraînement : + +```py +>>> trainer.train() # doctest: +SKIP +``` + + + +Pour les tâches - comme la traduction ou la génération de résumé - qui utilisent un modèle séquence à séquence, utilisez plutôt les classes [`Seq2SeqTrainer`] et [`Seq2SeqTrainingArguments`]. + + + +Vous pouvez personnaliser le comportement de la boucle d'apprentissage en redéfinissant les méthodes à l'intérieur de [`Trainer`]. Cela vous permet de personnaliser des caractéristiques telles que la fonction de perte, l'optimiseur et le planificateur. Consultez la documentation de [`Trainer`] pour savoir quelles méthodes peuvent être redéfinies. + +L'autre moyen de personnaliser la boucle d'apprentissage est d'utiliser les [Callbacks](./main_classes/callbacks). Vous pouvez utiliser les callbacks pour intégrer d'autres bibliothèques et inspecter la boucle d'apprentissage afin de suivre la progression ou d'arrêter l'apprentissage plus tôt. Les callbacks ne modifient rien dans la boucle d'apprentissage elle-même. Pour personnaliser quelque chose comme la fonction de perte, vous devez redéfinir le [`Trainer`] à la place. + +## Entraînement avec TensorFlow + +Tous les modèles sont des modèles standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) afin qu'ils puissent être entraînés avec TensorFlow avec l'API [Keras](https://keras.io/). 🤗 Transformers fournit la fonction [`~TFPreTrainedModel.prepare_tf_dataset`] pour charger facilement votre jeu de données comme un `tf.data.Dataset` afin que vous puissiez commencer l'entraînement immédiatement avec les fonctions [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) et [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) de Keras. + +1. Vous commencez avec un modèle [`TFPreTrainedModel`] ou [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) : + + ```py + >>> from transformers import TFAutoModelForSequenceClassification + + >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") + ``` + +2. 
Une classe de prétraitement comme un tokenizer, un processeur d'images ou un extracteur de caractéristiques : + + ```py + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") + ``` + +3. Créez une fonction qui transforme le texte du jeu de données en token : + + ```py + >>> def tokenize_dataset(dataset): + ... return tokenizer(dataset["text"]) # doctest: +SKIP + ``` + +4. Appliquez le tokenizer à l'ensemble du jeu de données avec [`~datasets.Dataset.map`] et passez ensuite le jeu de données et le tokenizer à [`~TFPreTrainedModel.prepare_tf_dataset`]. Vous pouvez également modifier la taille de l'échantillon et mélanger le jeu de données ici si vous le souhaitez : + + ```py + >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP + >>> tf_dataset = model.prepare_tf_dataset( + ... dataset, batch_size=16, shuffle=True, tokenizer=tokenizer + ... ) # doctest: +SKIP + ``` + +5. Une fois que vous êtes prêt, appelez les fonctions `compile` et `fit` pour commencer l'entraînement : + + ```py + >>> from tensorflow.keras.optimizers import Adam + + >>> model.compile(optimizer=Adam(3e-5)) + >>> model.fit(dataset) # doctest: +SKIP + ``` + +## Et après ? + +Maintenant que vous avez terminé la visite rapide de 🤗 Transformers, consultez nos guides et apprenez à faire des choses plus spécifiques comme créer un modèle personnalisé, finetuner un modèle pour une tâche, et comment entraîner un modèle avec un script. Si vous souhaitez en savoir plus sur les concepts fondamentaux de 🤗 Transformers, jetez un œil à nos guides conceptuels ! diff --git a/docs/source/it/_toctree.yml b/docs/source/it/_toctree.yml index 9c18dcdf9b70..47d90f9a9a85 100644 --- a/docs/source/it/_toctree.yml +++ b/docs/source/it/_toctree.yml @@ -1,47 +1,71 @@ -- sections: - - local: index - title: 🤗 Transformers - - local: quicktour - title: Tour rapido - - local: installation - title: Installazione - title: Iniziare -- sections: - - local: pipeline_tutorial - title: Pipeline per l'inferenza - - local: autoclass_tutorial - title: Carica istanze pre-allenate con AutoClass - - local: preprocessing - title: Preprocess - - local: training - title: Fine-tuning di un modello pre-addestrato - - local: accelerate - title: Allenamento distribuito con 🤗 Accelerate - - local: model_sharing - title: Condividere un modello - title: Esercitazione -- sections: - - local: create_a_model - title: Crea un'architettura personalizzata - - local: custom_models - title: Condividere modelli personalizzati - - local: run_scripts - title: Addestramento con script - - local: multilingual - title: Modelli multilingua per l'inferenza - - local: converting_tensorflow_models - title: Convertire modelli tensorflow - - local: serialization - title: Esporta modelli Transformers - - local: debugging - title: Debugging - title: Guide pratiche -- sections: - - local: add_new_pipeline - title: Come aggiungere una pipeline a 🤗 Transformers? - - local: add_new_model - title: Come aggiungere un modello a 🤗 Transformers? 
- - local: perf_hardware - title: Hardware ottimizzato per l'addestramento - title: Guide How-to - +- sections: + - local: index + title: 🤗 Transformers + - local: quicktour + title: Tour rapido + - local: installation + title: Installazione + title: Iniziare +- sections: + - local: pipeline_tutorial + title: Pipeline per l'inferenza + - local: autoclass_tutorial + title: Carica istanze pre-allenate con AutoClass + - local: preprocessing + title: Preprocess + - local: training + title: Fine-tuning di un modello pre-addestrato + - local: accelerate + title: Allenamento distribuito con 🤗 Accelerate + - local: model_sharing + title: Condividere un modello + title: Esercitazione +- sections: + - local: create_a_model + title: Crea un'architettura personalizzata + - local: custom_models + title: Condividere modelli personalizzati + - local: run_scripts + title: Addestramento con script + - local: multilingual + title: Modelli multilingua per l'inferenza + - local: converting_tensorflow_models + title: Convertire modelli tensorflow + - local: serialization + title: Esporta modelli Transformers + - local: perf_train_cpu + title: Addestramento efficiente su CPU + - local: perf_train_cpu_many + title: Addestramento efficiente su multiple CPU + - local: perf_train_tpu + title: Addestramento su TPU + - local: perf_train_special + title: Addestramento su Hardware Specializzato + - local: perf_infer_cpu + title: Inferenza Efficiente su CPU + - local: perf_infer_gpu_one + title: Inferenza su una GPU + - local: perf_infer_gpu_many + title: Inferenza Efficiente su GPU Multiple + - local: perf_infer_special + title: Inferenza su Hardware Specializzato + - local: big_models + title: Istanziare un big model + - local: migration + title: Passaggio da pacchetti precedenti + - local: debugging + title: Debugging + title: Guide pratiche +- sections: + - local: add_new_pipeline + title: Come aggiungere una pipeline a 🤗 Transformers? + - local: add_new_model + title: Come aggiungere un modello a 🤗 Transformers? + - local: perf_hardware + title: Hardware ottimizzato per l'addestramento + - local: community + title: Risorse della comunità + - local: pr_checks + title: Controlli su una Pull Request + title: Guide How-to + diff --git a/docs/source/it/accelerate.md b/docs/source/it/accelerate.md new file mode 100644 index 000000000000..3114613a9a79 --- /dev/null +++ b/docs/source/it/accelerate.md @@ -0,0 +1,136 @@ + + +# Allenamento distribuito con 🤗 Accelerate + +La parallelizzazione è emersa come strategia per allenare modelli sempre più grandi su hardware limitato e accelerarne la velocità di allenamento di diversi ordini di magnitudine. In Hugging Face, abbiamo creato la libreria [🤗 Accelerate](https://huggingface.co/docs/accelerate) per aiutarti ad allenare in modo semplice un modello 🤗 Transformers su qualsiasi tipo di configurazione distribuita, sia che si tratti di più GPU su una sola macchina o di più GPU su più macchine. In questo tutorial, imparerai come personalizzare il training loop nativo di PyTorch per consentire l'addestramento in un ambiente distribuito. + +## Configurazione + +Inizia installando 🤗 Accelerate: + +```bash +pip install accelerate +``` + +Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` rileverà automaticamente il tuo setup distribuito e inizializzerà tutte le componenti necessarie per l'allenamento. Non dovrai allocare esplicitamente il tuo modello su un device. 
+ +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Preparati ad accelerare + +Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenamento al metodo [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Questo include i tuoi DataLoaders per l'allenamento e per la valutazione, un modello e un ottimizzatore: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +Infine, sostituisci il tipico metodo `loss.backward()` nel tuo loop di allenamento con il metodo [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) di 🤗 Accelerate: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +Come puoi vedere nel seguente codice, hai solo bisogno di aggiungere quattro righe in più di codice al tuo training loop per abilitare l'allenamento distribuito! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## Allenamento + +Una volta che hai aggiunto le righe di codice rilevanti, lancia il tuo allenamento in uno script o in un notebook come Colaboratory. + +### Allenamento con uno script + +Se stai eseguendo il tuo allenamento da uno script, esegui il comando seguente per creare e salvare un file di configurazione: + +```bash +accelerate config +``` + +Poi lancia il tuo allenamento con: + +```bash +accelerate launch train.py +``` + +### Allenamento con un notebook + +La libreria 🤗 Accelerate può anche essere utilizzata in un notebook se stai pianificando di utilizzare le TPU di Colaboratory. Inserisci tutto il codice legato all'allenamento in una funzione, e passala al `notebook_launcher`: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +Per maggiori informazioni relative a 🤗 Accelerate e le sue numerose funzionalità, fai riferimento alla [documentazione](https://huggingface.co/docs/accelerate). 
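+
+As a small sketch of the same call when your training function takes arguments (the argument names here are hypothetical; `notebook_launcher` forwards the `args` tuple to your function, and `num_processes` controls how many processes are launched, e.g. 8 for the 8 cores of a Colab TPU):
+
+```py
+>>> from accelerate import notebook_launcher
+
+>>> args = ("bert-base-cased", 3e-5)  # e.g. a checkpoint name and a learning rate your function expects
+>>> notebook_launcher(training_function, args, num_processes=8)
+```
+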
\ No newline at end of file diff --git a/docs/source/it/accelerate.mdx b/docs/source/it/accelerate.mdx deleted file mode 100644 index 20dc1a7ff90b..000000000000 --- a/docs/source/it/accelerate.mdx +++ /dev/null @@ -1,132 +0,0 @@ - - -# Allenamento distribuito con 🤗 Accelerate - -La parallelizzazione è emersa come strategia per allenare modelli sempre più grandi su hardware limitato e accelerarne la velocità di allenamento di diversi ordini di magnitudine. In Hugging Face, abbiamo creato la libreria [🤗 Accelerate](https://huggingface.co/docs/accelerate) per aiutarti ad allenare in modo semplice un modello 🤗 Transformers su qualsiasi tipo di configurazione distribuita, sia che si tratti di più GPU su una sola macchina o di più GPU su più macchine. In questo tutorial, imparerai come personalizzare il training loop nativo di PyTorch per consentire l'addestramento in un ambiente distribuito. - -## Configurazione - -Inizia installando 🤗 Accelerate: - -```bash -pip install accelerate -``` - -Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` rileverà automaticamente il tuo setup distribuito e inizializzerà tutte le componenti necessarie per l'allenamento. Non dovrai allocare esplicitamente il tuo modello su un device. - -```py ->>> from accelerate import Accelerator - ->>> accelerator = Accelerator() -``` - -## Preparati ad accelerare - -Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenamento al metodo [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Questo include i tuoi DataLoaders per l'allenamento e per la valutazione, un modello e un ottimizzatore: - -```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) -``` - -## Backward - -Infine, sostituisci il tipico metodo `loss.backward()` nel tuo loop di allenamento con il metodo [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) di 🤗 Accelerate: - -```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -Come puoi vedere nel seguente codice, hai solo bisogno di aggiungere quattro righe in più di codice al tuo training loop per abilitare l'allenamento distribuito! 
- -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler - -+ accelerator = Accelerator() - - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) - -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) - -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) - - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps - ) - - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) -``` - -## Allenamento - -Una volta che hai aggiunto le righe di codice rilevanti, lancia il tuo allenamento in uno script o in un notebook come Colaboratory. - -### Allenamento con uno script - -Se stai eseguendo il tuo allenamento da uno script, esegui il comando seguente per creare e salvare un file di configurazione: - -```bash -accelerate config -``` - -Poi lancia il tuo allenamento con: - -```bash -accelerate launch train.py -``` - -### Allenamento con un notebook - -La libreria 🤗 Accelerate può anche essere utilizzata in un notebook se stai pianificando di utilizzare le TPU di Colaboratory. Inserisci tutto il codice legato all'allenamento in una funzione, e passala al `notebook_launcher`: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) -``` - -Per maggiori informazioni relative a 🤗 Accelerate e le sue numerose funzionalità, fai riferimento alla [documentazione](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/it/add_new_model.md b/docs/source/it/add_new_model.md new file mode 100644 index 000000000000..3ee22e804aaa --- /dev/null +++ b/docs/source/it/add_new_model.md @@ -0,0 +1,779 @@ + + +# Come aggiungere un modello a 🤗 Transformers? + +Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche +della repository originale del modello. A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere +modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo +creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con +questo *call-for-model-addition* vogliamo insegnare a volenterosi e esperti collaboratori della community come implementare +un modello in 🤗 Transformers. + +Se questo é qualcosa che può interessarvi, siete liberi di controllare l'attuale “calls-for-model-addition” [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md) +e contattarci. + +Se il modello sarà selezionato, allora potrete lavorare insieme a un membro di Hugging Face per integrare il modello in 🤗 +Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. 
Inoltre, +sarai l'artefice di un importante contributo open-source a 🤗 Transformers. Durante l'implementazione avrai l'opportunità di: + +- ottenere più comprensione delle best practices in open-source +- capire i principi di design di una della librerie NLP più popolari +- capire come efficientemente testare complessi modelli NLP +- capire come integrare utilit Python come `black`, `ruff`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito + +Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”. +Le seguenti sezioni spiegano in dettaglio come aggiungere un nuovo modello. Può anche essere molto utile controllare modelli +già aggiunti [qui](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed), +per capire se richiamano il modello che vorreste aggiungere. + +Per cominciare, vediamo una panoramica general della libreria Transformers. + +## Panoramica generale su 🤗 Transformers + +Prima di tutto, vediamo in generale 🤗 Transformers. 🤗 Transformers é una libreria molto strutturata, quindi +puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza, +tuttavia, abbiamo trovato che le scelte fondamentali di design della libreria sono cruciali per usare 🤗 Transformers efficacemente +su larga scala, mantenendo i costi a un livello accettabile. + +Un buon primo punto di partenza per capire al meglio la libreria é leggere la [documentazione sulla nostra filosofia](filosofia) +Da qui, ci sono alcune scelte sul modo di lavorare che cerchiamo di applicare a tutti i modelli: + +- La composizione é generalmente favorita sulla sovra-astrazione +- Duplicare il codice non é sempre male, soprattutto se migliora notevolmente la leggibilità e accessibilità del modello +- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice +di uno specifico modello, potrá vedere solo il corrispettivo file `modeling_....py` senza avere multiple dipendenze. + + +La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità +di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi +un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno, +cercheranno di capire e modificare il tuo modello. + +Tenendo questi principi in mente, immergiamoci nel design generale della libreria. + +### Panoramica sui modelli + +Per aggiungere con successo un modello, é importante capire l'interazione tra il tuo modello e la sua configurazione, +[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers +`BrandNewBert`. + +Diamo un'occhiata: + + + +Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo +assoluto. Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita +da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] - semplice no? +Come regola generale, vogliamo essere sicuri che un nuovo modello dipenda solo da [`PreTrainedModel`]. 
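+
+As a schematic sketch of the hierarchy described above (the class bodies are placeholders rather than a working implementation, and `BrandNewBert` is of course a stand-in name):
+
+```python
+from transformers import PretrainedConfig, PreTrainedModel
+
+
+class BrandNewBertConfig(PretrainedConfig):
+    model_type = "brand_new_bert"
+
+
+class BrandNewBertPreTrainedModel(PreTrainedModel):
+    # the only intermediate level of abstraction: inherits directly from PreTrainedModel
+    config_class = BrandNewBertConfig
+    base_model_prefix = "brand_new_bert"
+
+
+class BrandNewBertModel(BrandNewBertPreTrainedModel):
+    # the actual architecture lives here, in modeling_brand_new_bert.py
+    def __init__(self, config):
+        super().__init__(config)
+        # ... layers are defined here ...
+
+
+class BrandNewBertForMaskedLM(BrandNewBertPreTrainedModel):
+    # a model with a head does not inherit from BrandNewBertModel,
+    # it uses BrandNewBertModel as a component in its forward pass
+    def __init__(self, config):
+        super().__init__(config)
+        self.brand_new_bert = BrandNewBertModel(config)
+        # ... the masked-LM head is defined here ...
+```
+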
Le funzionalità +importanti che sono automaticamente conferite a ogni nuovo modello sono [`~PreTrainedModel.from_pretrained`] +e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti +funzionalità, come ad esempio `BrandNewBertModel.forward` devono essere definite completamente nel nuovo script +`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come +`BrandNewBertForMaskedLM` non erediti da `BrandNewBertModel`, ma piuttosto usi `BrandNewBertModel` +come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni +nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre +mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config` +per tutte le classi che ereditano da `BrandNewBertPreTrainedModel`: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # il modello ha accesso al suo config +``` + +Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da +[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti - +il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando +[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il +modello che la configurazione siano salvati. + + +### Stile per il codice + +Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò +ci sono alcuni fatti da considerare su come scrivere un codice :-) + +1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente + da altri modelli nella libreria. Se vuoi riutilizzare un blocco di codice da un altro modello, copia e incolla il codice con un commento `# Copied from` in cima al codice (guarda [qui](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) + per un ottimo esempio). +2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le + variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio + che `act`. Le variabili con una lettera sono da evitare fortemente, almeno che non sia per un indce in un for loop. +3. Generamente é meglio avere un codice esplicito e piú lungo che un codice corto e magico. +4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché + chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points. +5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile + piuttosto che annotazioni per aumentare la comprensione e leggibilità del codice. + +### Panoramica sui tokenizers + +Questa sezione sarà creata al piu presto :-( + +## Aggiungere un modello a 🤗 Transformers passo dopo passo + +Ci sono differenti modi per aggiungere un modello a Hugging Face. Qui trovi una lista di blog posts da parte della community su come aggiungere un modello: + +1. 
[Aggiungere GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) scritto da [Thomas](https://huggingface.co/thomwolf) +2. [Aggiungere WMT19 MT](https://huggingface.co/blog/porting-fsmt) scritto da [Stas](https://huggingface.co/stas) + +Per esperienza, possiamo dirti che quando si aggiunge un modello é meglio tenere a mente le seguenti considerazioni: + +- Non sfondare una porta giá aperta! La maggior parte del codice che aggiungerai per un nuovo modello 🤗 Transformers + esiste già da qualche parte in 🤗 Transformers. Prendi un po' di tempo per trovare codici simili in modelli e tokenizers esistenti e fare un copia-incolla. Ricorda che [grep](https://www.gnu.org/software/grep/) e [rg](https://github.com/BurntSushi/ripgrep) sono tuoi buoni amici. Inoltre, ricorda che puó essere molto probabile che il tokenizer per il tuo modello sia basato sull'implementazione di un altro modello, e il codice del tuo modello stesso su un altro ancora. *Per esempio* il modello FSMT é basato su BART, mentre il tokenizer di FSMT é basato su XLM. +- Ricorda che qui é piu una sfida ingegneristica che scientifica. Spendi piú tempo per create un efficiente ambiente di debugging piuttosto che cercare di capire tutti gli aspetti teorici dell'articolo del modello. +- Chiedi aiuto se sei in panne! I modelli sono la parte principale di 🤗 Transformers, perciò qui a Hugging Face siamo più che contenti di aiutarti in ogni passo per aggiungere il tuo modello. Non esitare a chiedere se vedi che non riesci a progredire. + +Di seguito, diamo una ricetta generale per aiutare a portare un modello in 🤗 Transformers. + +La lista seguente é un sommario di tutto quello che é stato fatto per aggiungere un modello, e può essere usata come To-Do List: + +- 1. ☐ (Opzionale) Capire gli aspetti teorici del modello +- 2. ☐ Preparare l'ambiente dev per transformers +- 3. ☐ Preparare l'ambiente debugging della repository originale +- 4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint +- 5. ☐ Aggiungere con successo lo scheletro del modello a Transformers +- 6. ☐ Convertire i checkpoint original a Transformers checkpoint +- 7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale +- 8. ☐ Finire i tests per il modello in Transformers +- 9. ☐ Aggiungere con successo Tokenizer in Transformers +- 10. ☐ Testare e provare gli integration tests da capo a fine +- 11. ☐ Completare i docs +- 12. ☐ Caricare i moedl weights all'hub +- 13. ☐ Sottomettere una pull request +- 14. ☐ (Opzionale) Aggiungere un notebook con una demo + +Per cominciare di solito consigliamo `BrandNewBert`, partendo dalla teoria, di modo da avere una buona comprensione della teoria generale. TUttavia, se preferisci imparare l'aspetto teorico del modello mentre *lavori* sul modello é ok immergersi direttamente nel codice di `BrandNewBert`. Questa opzione puó essere buona se le tue skills ingegneristiche sono meglio che quelle teoriche, o se il paper `BrandNewBert` ti dá problemi, o se semplicemente ti piace programmare piú che leggere articoli scientifici. + +### 1. (Opzionale) Aspetti teorici di BrandNewBert + +Allora con calma, prendi un po' di tempo per leggere l'articolo su *BrandNewBert* . Sicuramente, alcune sezioni dell'articolo sono molto complesse, ma non preoccuparti! 
L'obiettivo non é avere una compresione immensa della teoria alla base, ma estrarre le informazioni necessarie per re-implementare con successo il modello in 🤗 Transformers. Quindi, non impazzire sugli aspetti teorici, ma piuttosto focalizzati su quelli pratici, ossia: + +- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli +- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq? +- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART? +- Quali modelli estistenti in [🤗 Transformers models](https://huggingface.co/transformers/#contents) sono molto simili a *brand_new_bert*? +- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART? + +Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :) + + +### 2. Prepare il tuo ambiente + +1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub + +2. Clona il tuo fork `transfomers` sul tuo dico locale, e aggiungi la repository base come remota: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + + +3. Crea un ambiente di sviluppo, per esempio tramite questo comando: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +quindi torna alla directory principale: + +```bash +cd .. +``` + + +4. Attenzione, raccomandiamo di aggiungere la versione di PyTorch di *brand_new_bert* a Transfomers. Per installare PyTorch, basta seguire queste istruzioni https://pytorch.org/get-started/locally/. + +**Nota bene:** Non c'é bisogno di installare o avere installato CUDA. Il nuovo modello può funzionare senza problemi su una CPU. + + +5. Per trasferire *brand_new_bert* To port *brand_new_bert* avrai bisogno anche accesso alla sua repository originale: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers. + + +### 3.-4. Provare un pretrained checkpoint usando la repo originale + +Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. 
Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**. + +Riuscire a far girare il modello pretrained originale dalla repository ufficiale é spesso il passo **piu arduo**. Dalla nostra esperienza, é molto importante spendere un p' di tempo per diventare familiari con il codice base originale. Come test, prova a capire i seguenti punti: + +- Dove si trovano i pretrained weights? +- Come caricare i pretrained weights nel modello corrispondente? +- Come girare un tokenizer independentemente dal modello? +- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta +- Prova a localizzare i componenti importanti del modello: Dove si trova la classe del modello? Ci sono sotto classi nel modello *per esempio* EngoderModel, DecoderMOdel? Dove si trova il self-attention layer? Ci sono molteplici differenti layer di attention, *per esempio * *self-attention*, *cross-attention*...? +- Come puoi fare debug sul modello nell'ambiente originale della repo? Devi aggiungere dei *print* o puoi usare *ipdb* come debugger interattivo, o vabene anche un IDE efficiente per debug come PyCharm? + +É molto importante che prima di cominciare a trasferire il modello nuovo tu spenda tempo a fare debug del codice originale in maniera **efficiente**! Inoltre, ricorda che tutta la library é open-soruce, quindi non temere di aprire issue o fare una pull request nella repo originale. Tutti coloro che mantengono la repository saranno piú che felici di avere qualcuno che guarda e gioca con i loro codici! + +A questo punto, sta a te decidere quale ambiente per debug vuoi usare. Noi consilgiamo di evitare setup con GPU, che potrebbero costare assai, lavorare su una CPU puó essere un ottimo punto di partenza per indagare la repository originale e per cominciare a scrivere il codice per 🤗 Transformers. Solo alla fine, quando il modello é stato portato con successo in 🤗 Transformers, allora si potrá verificare il suo funzionamento su GPU. + +In generale ci sono due possibili ambienti di debug per il testare il modello originale: + +- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) +- Scripts locali in Python + +Il vantaggio dei Jupyter notebooks é la possibilità di eseguire cella per cella, il che può essere utile per decomporre tutte le componenti logiche, cosi da a vere un ciclo di debug più rapido, siccome si possono salvare i risultati da steps intermedi. Inoltre, i notebooks spesso sono molto facili da condividere con altri contributors, il che può essere molto utile se vuoi chiedere aiuto al team di Hugging Face. Se sei famigliare con Jupyter notebooks allora racommandiamo di lavorare in questa maniera. + +Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`. + +Per ogni pratica code-base, é sempre meglio come primo step caricare un **piccolo** checkpoint pretrained e cercare di riprodurre un singolo forward pass usando un vettore fittizio di IDs fatti da numeri interi. 
Un esempio per uno script simile, in pseudocodice é: + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +Per quanto riguarda la strategia di debugging, si può scegliere tra: + +- Decomporre il modello originario in piccole componenenti e testare ognuna di esse +- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi, +e usare dei print statement o breakpoints intermedi per verificare + +Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu +avvantaggiosa di un'altra, ma tutto dipende dall'code-base originario. + +Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base +originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere. +Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito: + +- negli stage piu finali, quando bisognerà comparare il modello originario all'implementazione in Hugging Face, potrete verificare +automaticamente ogni componente, individualmente, di modo che ci sia una corrispondenza 1:1 +- avrete l'opportunità di decomporre un problema molto grande in piccoli passi, così da strutturare meglio il vostro lavoro +- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore +comprensione del modello stesso +- verso gli stage finali i test fatti componente per componente vi aiuterà ad essere sicuri di non andare avanti e indietro +nell'implementazione, così da continuare la modifica del codice senza interruzione + +Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) +per il modello ELECTRA + +Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite +compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti. +Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria +é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare +affidamento ai print statements. + +In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal +primo layer al layer finale. +É consigliato recuperare gli output dai layers, tramite print o sotto-componenti, nel seguente ordine: + +1. Recuperare gli IDs di input dati al modello +2. Recuperare i word embeddings +3. Recuperare l'input del primo Transformer layer +4. Recuperare l'output del primo Transformer layer +5. Recuperare l'output dei seguenti `n - 1` Transformer layers +6. 
Recuperare l'output dell'intero BrandNewBert Model + +Gli IDs in input dovrebbero essere un arrary di interi, *per esempio* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` + +Gli output dei seguenti layer di solito dovrebbero essere degli array di float multi-dimensionali come questo: + +``` +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], +``` + +Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo +significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione +di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente +diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque, +é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del +modello originale di *brand_new_bert*. Di seguito vi diamo alcuni consigli per avere un ambiente di debug il piu efficiente +possibile: + +- Trovate la migliore strategia per fare debug dei risultati intermedi. Per esempio, é la repository originale scritta in PyTorch? +Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il +modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale +é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) +per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit** +quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196). +- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro +ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi. +Nel caso in cui i checkpoints siano molto grandi, e non si possa trovare di meglio, allora é buona consuetudine ricorrere +a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers +con il vostro modello +- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare +la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata +`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici +volte, *per esempio* per generare testo, come `autoregressive_sample`, `generate`. +- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare +come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il +debug da questo punto. 
Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare +gli input al modello, anziche delle stringhe in input. +- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per +via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche +i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione +sono nello stesso framework. + +La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare tutto questo per *brand_new_bert*. + + +### 5.-14. Trasferire BrandNewBert in 🤗 Transformers + +Allora cominciamo ad aggiungere un nuovo codice in 🤗 Transformers. Andate nel vostro fork clone di 🤗 Transformers: + + +```bash +cd transformers +``` + +Nel caso speciale in cui stiate aggiungendo un modello, la cui architettura sia identica a una di un modello già esistente, +dovrete solo aggiugnere uno script di conversione, come descritto [qui](#write-a-conversion-script). +In questo caso, potete riutilizzare l'intera architettura del modello gia esistente. + +Se questo non é il caso, cominciamo con il generare un nuovo modello. Avrete due opzioni: + +- `transformers-cli add-new-model-like` per aggiungere un nuovo modello come uno che gia esiste +- `transformers-cli add-new-model` per aggiungere un nuovo modello da un nostro template (questo assomigliera a BERT o Bart, in base al modello che selezionerete) + +In entrambi i casi, l'output vi darà un questionario da riempire con informazioni basi sul modello. Il secondo comando richiede di installare +un `cookiecutter` - maggiori informazioni [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). + +**Aprire una Pull Request in main huggingface/transformers repo** + +Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)", +*per esempio* "[WIP] Aggiungere *brand_new_bert*", cosicché il team di Hugging Face possa lavorare al vostro fianco nell' +integrare il modello in 🤗 Transformers. + +Questi sarebbero gli step generali da seguire: + +1. Creare un branch dal main branch con un nome descrittivo + +```bash +git checkout -b add_brand_new_bert +``` + +2. Commit del codice automaticamente generato + +```bash +git add . +git commit +``` + +3. Fare fetch e rebase del main esistente + +```bash +git fetch upstream +git rebase upstream/main +``` + +4. Push dei cambiamenti al proprio account: + +```bash +git push -u origin a-descriptive-name-for-my-changes +``` + +5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request". +Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team +Hugging Face verrà notificato anche per i futuri cambiamenti. + +6. Cambiare la PR a draft, cliccando su "Convert to draft" alla destra della pagina della PR + +Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre, +ricordatevi di tenere aggiornato il vostro lavoro con il main esistente: + +```bash +git fetch upstream +git merge upstream/main +``` + +In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR +e discusse/risolte nella PR stessa. 
In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit +di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento +nella domanda, cosicche il team potra facilmente capire il problema o la domanda. + +Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea +dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema +é stato risolto, cliccate sul bottone "Resolve". + +In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più +domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi +di chiedere al team Hugging Face direttamente su slack o email. + + +**5. Adattare i codici per brand_new_bert** + +Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in +`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e +`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. + +Ora potete finalmente cominciare il codice :). Il codice generato in +`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un +modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo +agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare +questi cambi spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via... +Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere +un'idea migliore su come implementare il modello. + +**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un +codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` +fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza +del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente +instanza: + +```python +from transformers import BrandNewBertModel, BrandNewBertConfig + +model = BrandNewBertModel(BrandNewBertConfig()) +``` + +Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce +che `init()` di tutte le componenti funzioni correttamente. + + +**6. Scrivere uno script di conversione** + +Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella +repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere +lo script di conversione da zero, ma piuttosto cercate e guardate script gia esistenti in 🤗 Transformers, così da trovarne +uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso. +Non esistate a chiedre al team di Hugging Face a riguardo. 
+ +- Se state convertendo un modello da TensorFlow a PyTorch, un ottimo inizio é vedere [questo script di conversione per BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91) +- Se state convertendo un modello da PyTorch a PyTorch, [lo script di conversione di BART può esservi utile](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py) + +Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch, +il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch, +chiamato `SimpleModel`: + +```python +from torch import nn + + +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.dense = nn.Linear(10, 10) + self.intermediate = nn.Linear(10, 10) + self.layer_norm = nn.LayerNorm(10) +``` +Ora possiamo creare un'instanza di questa definizione di modo da inizializzare a random weights: `dense`, `intermediate`, `layer_norm`. +Possiamo usare print per vedere l'architettura del modello: + +```python +model = SimpleModel() + +print(model) +``` + +Da cui si ottiene: + +``` +SimpleModel( + (dense): Linear(in_features=10, out_features=10, bias=True) + (intermediate): Linear(in_features=10, out_features=10, bias=True) + (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) +) +``` + +Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno +specifico layer possono essere visualizzati: + + +```python +print(model.dense.weight.data) +``` + +ad esempio: + +``` +tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, + -0.2077, 0.2157], + [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, + 0.2166, -0.0212], + [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, + -0.1023, -0.0447], + [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, + -0.1876, -0.2467], + [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, + 0.2577, 0.0402], + [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, + 0.2132, 0.1680], + [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, + 0.2707, -0.2509], + [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, + 0.1829, -0.1568], + [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, + 0.0333, -0.0536], + [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, + 0.2220, 0.2358]]). +``` + +Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente +layer nel checkpoint. *Per esempio* + +```python +# retrieve matching layer weights, e.g. by +# recursive algorithm +layer_name = "dense" +pretrained_weight = array_of_dense_layer + +model_pointer = getattr(model, "dense") + +model_pointer.weight.data = torch.from_numpy(pretrained_weight) +``` + +Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint +siano esattamente gli stessi e uguali in **dimensione/shape e nome**. 
Per fare questo, è **necessario** aggiungere un `assert` sulla dimensione/shape e sul nome:

```python
assert (
    model_pointer.weight.shape == pretrained_weight.shape
), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
```

Inoltre, dovreste stampare sia i nomi che i weights per essere sicuri che corrispondano:

```python
logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
```

Se la dimensione o il nome non coincidono, probabilmente avete assegnato il peso del checkpoint al layer sbagliato nel costruttore di 🤗 Transformers.

Una dimensione sbagliata può essere dovuta a un errore nei parametri di `BrandNewBertConfig()`. Tuttavia, può anche darsi che l'implementazione del layer in PyTorch richieda una trasposizione della matrice dei weights.

Infine, controllate che **tutti** i weights siano stati inizializzati e stampate tutti i weights del checkpoint che non sono stati usati per l'inizializzazione, in modo da essere sicuri che il modello sia stato convertito correttamente. È del tutto normale che i primi tentativi di conversione falliscano, sia per un errore in `BrandNewBertConfig()`, sia per un errore nell'architettura in 🤗 Transformers, sia per un bug in `init()`.

Questo step va ripetuto iterativamente fino a che tutti i weights del checkpoint non vengono caricati con gli stessi valori. Una volta che il checkpoint è stato caricato correttamente in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta `/path/to/converted/checkpoint/folder`, che conterrà sia `pytorch_model.bin` che `config.json`:

```python
model.save_pretrained("/path/to/converted/checkpoint/folder")
```


**7. Implementare il forward pass**

Una volta che i weights pretrained sono stati caricati correttamente in 🤗 Transformers, dovrete assicurarvi che il forward pass sia implementato correttamente. [Qui](#provare-un-pretrained-checkpoint-usando-la-repo-originale) avete già creato e provato uno script che testa il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo che usi l'implementazione in 🤗 Transformers anziché l'originale. Più o meno lo script dovrebbe essere:

```python
model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
output = model(input_ids).last_hidden_state
```

Di solito l'output di 🤗 Transformers non è identico all'output originario, soprattutto la prima volta. Non vi abbattete: è normale! Prima di tutto assicuratevi che il forward pass non sollevi errori. Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziché `torch.float32`. Non esitate a chiedere al team Hugging Face!

Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente, verificando che gli output siano equivalenti con una precisione di `1e-3`. Controllate prima che le `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione originaria. Poi controllate che i valori in output siano identici.
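Ad esempio, un piccolo sketch di verifica, ipotizzando che `original_output` e `output` siano i tensori finali prodotti rispettivamente dalla repo originale e da 🤗 Transformers, potrebbe essere:

```python
import torch

# Ipotesi: original_output e output sono due torch.Tensor con la stessa semantica.
assert original_output.shape == output.shape, f"Shape diverse: {original_output.shape} vs {output.shape}"

# La differenza massima assoluta aiuta a capire quanto siete lontani dalla tolleranza.
max_diff = (original_output - output).abs().max().item()
print(f"Differenza massima assoluta: {max_diff}")

assert torch.allclose(original_output, output, atol=1e-3), "Gli output differiscono oltre la tolleranza di 1e-3"
```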
Questa è sicuramente la parte più difficile; ecco una serie di errori comuni quando gli output non coincidono:

- Alcuni layer non sono stati aggiunti, *ad esempio* manca un layer di *activation*, oppure ci si è scordati di una connessione residua
- La matrice dei word embedding non è stata legata (*tied*) correttamente
- Gli embedding posizionali sono sbagliati perché l'implementazione originaria usa un offset
- Il dropout è attivo durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che il dropout non venga attivato nel forward pass, *per esempio* passando *self.training* a [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)

La maniera migliore per sistemare il problema è mettere fianco a fianco il forward pass dell'implementazione originaria e quello in 🤗 Transformers e controllare se ci sono differenze. Idealmente, fare debug e stampare gli output intermedi di entrambe le implementazioni nello stesso punto esatto del network vi aiuterà a individuare dove le due implementazioni divergono. Come prima mossa controllate che gli `input_ids` siano identici in entrambi gli script. Da lì procedete fino all'ultimo layer: a un certo punto noterete una differenza tra le due implementazioni.

Una volta ottenuto lo stesso output, verificatelo con `torch.allclose(original_output, output, atol=1e-3)`. Se a questo punto è tutto a posto: complimenti! Le parti seguenti saranno una passeggiata 😊.


**8. Aggiungere i test necessari per il modello**

A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, è molto probabile che il modello non rispetti ancora del tutto il design richiesto. Per essere sicuri che l'implementazione sia pienamente compatibile con 🤗 Transformers è necessario aggiungere dei test. Il Cookiecutter dovrebbe aver fornito automaticamente un file di test per il vostro modello, di solito in `tests/test_modeling_brand_new_bert.py`. Eseguitelo per verificare che i test più comuni passino:

```bash
pytest tests/test_modeling_brand_new_bert.py
```

Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia testato correttamente, cosicché:

- a) la community possa capire facilmente il vostro lavoro guardando i test specifici di *brand_new_bert*,
- b) modifiche future al vostro modello non rompano alcuna feature importante del modello.

Per prima cosa aggiungete dei test d'integrazione. Questi sono essenziali perché svolgono la stessa funzione degli script di debug usati in precedenza. Un template per questi test esiste già nel Cookiecutter, sotto il nome di `BrandNewBertModelIntegrationTests`: dovrete solo completarlo. Una volta che questi test passano, provate:

```bash
RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
```

Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1`

Di seguito, tutte le feature utili e particolari di *brand_new_bert* devono essere testate in test separati, contenuti in `BrandNewBertModelTester`/`BrandNewBertModelTest`.
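Per esempio, uno sketch puramente indicativo di un test dedicato a una feature specifica, dove i nomi delle classi sono i segnaposto usati in questa guida e la feature testata è solo un'ipotesi, potrebbe essere:

```python
import unittest

import torch

from transformers import BrandNewBertConfig, BrandNewBertModel


class BrandNewBertFeatureTest(unittest.TestCase):
    def test_attention_outputs(self):
        # Configurazione volutamente piccola per mantenere il test veloce.
        config = BrandNewBertConfig(num_hidden_layers=2, hidden_size=32, num_attention_heads=4)
        model = BrandNewBertModel(config)
        model.eval()

        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
        with torch.no_grad():
            outputs = model(input_ids, output_attentions=True)

        # Ci si aspetta un tensore di attention per ogni layer.
        self.assertEqual(len(outputs.attentions), config.num_hidden_layers)


if __name__ == "__main__":
    unittest.main()
```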
Spesso la gente si scorda di questi test, ma ricordate che sono utili perché:

- Aiutano gli utenti a capire meglio il vostro codice, richiamando l'attenzione sulle nuove feature
- Developer e contributor futuri potranno testare velocemente eventuali modifiche al modello eseguendo questi casi speciali.


**9. Implementare il tokenizer**

A questo punto avremo bisogno di un tokenizer per *brand_new_bert*. Di solito il tokenizer è equivalente o molto simile a quello di un altro modello già presente in 🤗 Transformers.

È importante che troviate il file del tokenizer originale e che riusciate a caricarlo in 🤗 Transformers.

Per controllare che il tokenizer funzioni in modo corretto, create prima uno script nella repo originaria che riceva come input una stringa e ritorni gli `input_ids`. Più o meno questo potrebbe essere il codice:

```python
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = model.tokenize(input_str)
```

Potrebbe essere necessario scavare un po' nella repo originaria per trovare la funzione corretta del tokenizer, e a volte capita persino di doverne riscrivere una parte per ottenere in output gli `input_ids`. A quel punto serve uno script analogo che usi 🤗 Transformers:

```python
from transformers import BrandNewBertTokenizer

input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."

tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")

input_ids = tokenizer(input_str).input_ids
```

Quando i due script producono gli stessi `input_ids`, bisogna aggiungere anche un test per il tokenizer.

Il file di test del tokenizer di *brand_new_bert* dovrebbe contenere un paio di test d'integrazione hard-coded.


**10. Test end-to-end**

Ora che avete il tokenizer, dovrete aggiungere dei test d'integrazione per l'intero workflow in `tests/test_modeling_brand_new_bert.py` in 🤗 Transformers. Questi test devono mostrare, su un campione significativo di esempi text-to-text, che l'implementazione di 🤗 Transformers funziona come ci si aspetta. *Per esempio* potreste usare una traduzione source-to-target, il riassunto di un articolo, un esempio di domanda-risposta e così via. Se nessuno dei checkpoint è stato fine-tuned su task di questo tipo, sono più che sufficienti i test del modello. Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo di testarlo anche su GPU: può succedere, ad esempio, che ci si scordi un `.to(self.device)`. Se non avete accesso a una GPU, il team Hugging Face può occuparsi di testare questo aspetto per voi.

**11. Aggiungere una Docstring**

Siete quasi alla fine! L'ultima cosa rimasta è avere una bella docstring e una pagina di documentazione. Il Cookiecutter dovrebbe aver già creato un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà per usare il vostro modello sarà leggere la documentazione: proponete quindi una documentazione chiara e concisa. È molto utile per la community aggiungere anche qualche *Tip* che mostri come usare al meglio il modello. Non esitate a chiedere al team Hugging Face riguardo alle docstring.

Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e che sia corretta, includendo tutti gli input e output necessari.
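Per esempio, il blocco di esempio d'uso che tipicamente si include nella docstring del modello potrebbe assomigliare a questo sketch, dove il nome del checkpoint e le classi sono segnaposto ipotetici:

```python
from transformers import BrandNewBertModel, BrandNewBertTokenizer

# Checkpoint ipotetico, usato solo a scopo illustrativo.
tokenizer = BrandNewBertTokenizer.from_pretrained("brand-new-bert-base-cased")
model = BrandNewBertModel.from_pretrained("brand-new-bert-base-cased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
```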
Abbiamo una guida dettagliata su come scrivere la documentazione e le docstring.


**Rifattorizzare il codice**

Perfetto! Ora che avete tutto il necessario per *brand_new_bert*, controllate che lo stile del codice sia corretto:

```bash
make style
```

E che il codice passi i quality check:

```bash
make quality
```

A volte capita che manchino delle informazioni nella docstring o che alcuni nomi siano sbagliati: questo farà fallire i controlli qui sopra. Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi.

Per ultimo, è sempre una buona idea rifattorizzare il proprio codice una volta che funziona.

Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎

**12. Caricare il modello sul model hub**

In quest'ultima parte dovrete convertire e caricare sul model hub il modello, con tutti i checkpoint, e aggiungere una model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per familiarizzare con l'hub. Di solito in questa parte lavorerete a fianco di Hugging Face per decidere un nome appropriato per ogni checkpoint e per ottenere i permessi necessari a caricare il modello nell'organizzazione dell'autore di *brand_new_bert*. Il metodo `push_to_hub`, presente in tutti i modelli `transformers`, è una maniera rapida e indolore per caricare il vostro checkpoint sull'hub:

```python
brand_new_bert.push_to_hub(
    repo_path_or_name="brand_new_bert",
    # Uncomment the following line to push to an organization
    # organization="",
    commit_message="Add model",
    use_temp_dir=True,
)
```

Vale la pena spendere un po' di tempo per creare una model card ad hoc per ogni checkpoint. Le model card dovrebbero evidenziare le caratteristiche specifiche del checkpoint, *per esempio* su che dataset è stato fatto il pretraining o il fine-tuning e per che genere di task è pensato. È anche buona pratica includere del codice che mostri come usare il modello correttamente.


**13. (Opzionale) Aggiungere un notebook**

È molto utile aggiungere un notebook che mostri in dettaglio come usare *brand_new_bert* per l'inferenza e/o per il fine-tuning su un task specifico. Non è obbligatorio per la vostra PR, ma è molto utile per la community.

**14. Sottomettere la PR**

L'ultimissimo step: il merge della PR nel main. Di solito il team Hugging Face a questo punto vi avrà già aiutato, ma vale la pena prendersi un po' di tempo per ripulire la descrizione e i commenti nel codice.


### Condividete il vostro lavoro!!

È ora di prendervi un po' di merito dalla community per il vostro lavoro! Caricare e implementare un nuovo modello è un grandissimo contributo per Transformers e per l'intera community NLP. Il vostro codice e i modelli pre-trained convertiti saranno sicuramente utilizzati da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro traguardo con l'intera community :)

**Avete creato un altro modello che è super facile da usare per tutti quanti nella community! 🤯**

diff --git a/docs/source/it/add_new_model.mdx b/docs/source/it/add_new_model.mdx deleted file mode 100644 index 464ba5830609..000000000000 --- a/docs/source/it/add_new_model.mdx +++ /dev/null @@ -1,775 +0,0 @@ - - -# Come aggiungere un modello a 🤗 Transformers? - -Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche -della repository originale del modello.
A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere -modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo -creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con -questo *call-for-model-addition* vogliamo insegnare a volenterosi e esperti collaboratori della community come implementare -un modello in 🤗 Transformers. - -Se questo é qualcosa che può interessarvi, siete liberi di controllare l'attuale “calls-for-model-addition” [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md) -e contattarci. - -Se il modello sarà selezionato, allora potrete lavorare insieme a un membro di Hugging Face per integrare il modello in 🤗 -Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. Inoltre, -sarai l'artefice di un importante contributo open-source a 🤗 Transformers. Durante l'implementazione avrai l'opportunità di: - -- ottenere più comprensione delle best practices in open-source -- capire i principi di design di una della librerie NLP più popolari -- capire come efficientemente testare complessi modelli NLP -- capire come integrare utilit Python come `black`, `isort`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito - -Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”. -Le seguenti sezioni spiegano in dettaglio come aggiungere un nuovo modello. Può anche essere molto utile controllare modelli -già aggiunti [qui](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed), -per capire se richiamano il modello che vorreste aggiungere. - -Per cominciare, vediamo una panoramica general della libreria Transformers. - -## Panoramica generale su 🤗 Transformers - -Prima di tutto, vediamo in generale 🤗 Transformers. 🤗 Transformers é una libreria molto strutturata, quindi -puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza, -tuttavia, abbiamo trovato che le scelte fondamentali di design della libreria sono cruciali per usare 🤗 Transformers efficacemente -su larga scala, mantenendo i costi a un livello accettabile. - -Un buon primo punto di partenza per capire al meglio la libreria é leggere la [documentazione sulla nostra filosofia](filosofia) -Da qui, ci sono alcune scelte sul modo di lavorare che cerchiamo di applicare a tutti i modelli: - -- La composizione é generalmente favorita sulla sovra-astrazione -- Duplicare il codice non é sempre male, soprattutto se migliora notevolmente la leggibilità e accessibilità del modello -- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice -di uno specifico modello, potrá vedere solo il corrispettivo file `modeling_....py` senza avere multiple dipendenze. - - -La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità -di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi -un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno, -cercheranno di capire e modificare il tuo modello. 
- -Tenendo questi principi in mente, immergiamoci nel design generale della libreria. - -### Panoramica sui modelli - -Per aggiungere con successo un modello, é importante capire l'interazione tra il tuo modello e la sua configurazione, -[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers -`BrandNewBert`. - -Diamo un'occhiata: - - - -Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo -assoluto. Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita -da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] - semplice no? -Come regola generale, vogliamo essere sicuri che un nuovo modello dipenda solo da [`PreTrainedModel`]. Le funzionalità -importanti che sono automaticamente conferite a ogni nuovo modello sono [`~PreTrainedModel.from_pretrained`] -e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti -funzionalità, come ad esempio `BrandNewBertModel.forward` devono essere definite completamente nel nuovo script -`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come -`BrandNewBertForMaskedLM` non erediti da `BrandNewBertModel`, ma piuttosto usi `BrandNewBertModel` -come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni -nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre -mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config` -per tutte le classi che ereditano da `BrandNewBertPreTrainedModel`: - -```python -model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") -model.config # il modello ha accesso al suo config -``` - -Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da -[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti - -il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando -[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il -modello che la configurazione siano salvati. - - -### Stile per il codice - -Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò -ci sono alcuni fatti da considerare su come scrivere un codice :-) - -1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente - da altri modelli nella libreria. Se vuoi riutilizzare un blocco di codice da un altro modello, copia e incolla il codice con un commento `# Copied from` in cima al codice (guarda [qui](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) - per un ottimo esempio). -2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le - variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio - che `act`. Le variabili con una lettera sono da evitare fortemente, almeno che non sia per un indce in un for loop. -3. 
Generamente é meglio avere un codice esplicito e piú lungo che un codice corto e magico. -4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché - chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points. -5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile - piuttosto che annotazioni per aumentare la comprensione e leggibilità del codice. - -### Panoramica sui tokenizers - -Questa sezione sarà creata al piu presto :-( - -## Aggiungere un modello a 🤗 Transformers passo dopo passo - -Ci sono differenti modi per aggiungere un modello a Hugging Face. Qui trovi una lista di blog posts da parte della community su come aggiungere un modello: - -1. [Aggiungere GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) scritto da [Thomas](https://huggingface.co/thomwolf) -2. [Aggiungere WMT19 MT](https://huggingface.co/blog/porting-fsmt) scritto da [Stas](https://huggingface.co/stas) - -Per esperienza, possiamo dirti che quando si aggiunge un modello é meglio tenere a mente le seguenti considerazioni: - -- Non sfondare una porta giá aperta! La maggior parte del codice che aggiungerai per un nuovo modello 🤗 Transformers - esiste già da qualche parte in 🤗 Transformers. Prendi un po' di tempo per trovare codici simili in modelli e tokenizers esistenti e fare un copia-incolla. Ricorda che [grep](https://www.gnu.org/software/grep/) e [rg](https://github.com/BurntSushi/ripgrep) sono tuoi buoni amici. Inoltre, ricorda che puó essere molto probabile che il tokenizer per il tuo modello sia basato sull'implementazione di un altro modello, e il codice del tuo modello stesso su un altro ancora. *Per esempio* il modello FSMT é basato su BART, mentre il tokenizer di FSMT é basato su XLM. -- Ricorda che qui é piu una sfida ingegneristica che scientifica. Spendi piú tempo per create un efficiente ambiente di debugging piuttosto che cercare di capire tutti gli aspetti teorici dell'articolo del modello. -- Chiedi aiuto se sei in panne! I modelli sono la parte principale di 🤗 Transformers, perciò qui a Hugging Face siamo più che contenti di aiutarti in ogni passo per aggiungere il tuo modello. Non esitare a chiedere se vedi che non riesci a progredire. - -Di seguito, diamo una ricetta generale per aiutare a portare un modello in 🤗 Transformers. - -La lista seguente é un sommario di tutto quello che é stato fatto per aggiungere un modello, e può essere usata come To-Do List: - -- 1. ☐ (Opzionale) Capire gli aspetti teorici del modello -- 2. ☐ Preparare l'ambiente dev per transformers -- 3. ☐ Preparare l'ambiente debugging della repository originale -- 4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint -- 5. ☐ Aggiungere con successo lo scheletro del modello a Transformers -- 6. ☐ Convertire i checkpoint original a Transformers checkpoint -- 7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale -- 8. ☐ Finire i tests per il modello in Transformers -- 9. ☐ Aggiungere con successo Tokenizer in Transformers -- 10. ☐ Testare e provare gli integration tests da capo a fine -- 11. ☐ Completare i docs -- 12. ☐ Caricare i moedl weights all'hub -- 13. ☐ Sottomettere una pull request -- 14. 
☐ (Opzionale) Aggiungere un notebook con una demo - -Per cominciare di solito consigliamo `BrandNewBert`, partendo dalla teoria, di modo da avere una buona comprensione della teoria generale. TUttavia, se preferisci imparare l'aspetto teorico del modello mentre *lavori* sul modello é ok immergersi direttamente nel codice di `BrandNewBert`. Questa opzione puó essere buona se le tue skills ingegneristiche sono meglio che quelle teoriche, o se il paper `BrandNewBert` ti dá problemi, o se semplicemente ti piace programmare piú che leggere articoli scientifici. - -### 1. (Opzionale) Aspetti teorici di BrandNewBert - -Allora con calma, prendi un po' di tempo per leggere l'articolo su *BrandNewBert* . Sicuramente, alcune sezioni dell'articolo sono molto complesse, ma non preoccuparti! L'obiettivo non é avere una compresione immensa della teoria alla base, ma estrarre le informazioni necessarie per re-implementare con successo il modello in 🤗 Transformers. Quindi, non impazzire sugli aspetti teorici, ma piuttosto focalizzati su quelli pratici, ossia: - -- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli -- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq? -- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART? -- Quali modelli estistenti in [🤗 Transformers models](https://huggingface.co/transformers/#contents) sono molto simili a *brand_new_bert*? -- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART? - -Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :) - - -### 2. Prepare il tuo ambiente - -1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub - -2. Clona il tuo fork `transfomers` sul tuo dico locale, e aggiungi la repository base come remota: - -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` - - -3. Crea un ambiente di sviluppo, per esempio tramite questo comando: - -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` - -quindi torna alla directory principale: - -```bash -cd .. -``` - - -4. Attenzione, raccomandiamo di aggiungere la versione di PyTorch di *brand_new_bert* a Transfomers. Per installare PyTorch, basta seguire queste istruzioni https://pytorch.org/get-started/locally/. - -**Nota bene:** Non c'é bisogno di installare o avere installato CUDA. Il nuovo modello può funzionare senza problemi su una CPU. - - -5. Per trasferire *brand_new_bert* To port *brand_new_bert* avrai bisogno anche accesso alla sua repository originale: - -```bash -git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git -cd brand_new_bert -pip install -e . -``` - -Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers. - - -### 3.-4. 
Provare un pretrained checkpoint usando la repo originale - -Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**. - -Riuscire a far girare il modello pretrained originale dalla repository ufficiale é spesso il passo **piu arduo**. Dalla nostra esperienza, é molto importante spendere un p' di tempo per diventare familiari con il codice base originale. Come test, prova a capire i seguenti punti: - -- Dove si trovano i pretrained weights? -- Come caricare i pretrained weights nel modello corrispondente? -- Come girare un tokenizer independentemente dal modello? -- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta -- Prova a localizzare i componenti importanti del modello: Dove si trova la classe del modello? Ci sono sotto classi nel modello *per esempio* EngoderModel, DecoderMOdel? Dove si trova il self-attention layer? Ci sono molteplici differenti layer di attention, *per esempio * *self-attention*, *cross-attention*...? -- Come puoi fare debug sul modello nell'ambiente originale della repo? Devi aggiungere dei *print* o puoi usare *ipdb* come debugger interattivo, o vabene anche un IDE efficiente per debug come PyCharm? - -É molto importante che prima di cominciare a trasferire il modello nuovo tu spenda tempo a fare debug del codice originale in maniera **efficiente**! Inoltre, ricorda che tutta la library é open-soruce, quindi non temere di aprire issue o fare una pull request nella repo originale. Tutti coloro che mantengono la repository saranno piú che felici di avere qualcuno che guarda e gioca con i loro codici! - -A questo punto, sta a te decidere quale ambiente per debug vuoi usare. Noi consilgiamo di evitare setup con GPU, che potrebbero costare assai, lavorare su una CPU puó essere un ottimo punto di partenza per indagare la repository originale e per cominciare a scrivere il codice per 🤗 Transformers. Solo alla fine, quando il modello é stato portato con successo in 🤗 Transformers, allora si potrá verificare il suo funzionamento su GPU. - -In generale ci sono due possibili ambienti di debug per il testare il modello originale: - -- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) -- Scripts locali in Python - -Il vantaggio dei Jupyter notebooks é la possibilità di eseguire cella per cella, il che può essere utile per decomporre tutte le componenti logiche, cosi da a vere un ciclo di debug più rapido, siccome si possono salvare i risultati da steps intermedi. Inoltre, i notebooks spesso sono molto facili da condividere con altri contributors, il che può essere molto utile se vuoi chiedere aiuto al team di Hugging Face. 
Se sei famigliare con Jupyter notebooks allora racommandiamo di lavorare in questa maniera. - -Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`. - -Per ogni pratica code-base, é sempre meglio come primo step caricare un **piccolo** checkpoint pretrained e cercare di riprodurre un singolo forward pass usando un vettore fittizio di IDs fatti da numeri interi. Un esempio per uno script simile, in pseudocodice é: - -```python -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids -original_output = model.predict(input_ids) -``` - -Per quanto riguarda la strategia di debugging, si può scegliere tra: - -- Decomporre il modello originario in piccole componenenti e testare ognuna di esse -- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi, -e usare dei print statement o breakpoints intermedi per verificare - -Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu -avvantaggiosa di un'altra, ma tutto dipende dall'code-base originario. - -Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base -originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere. -Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito: - -- negli stage piu finali, quando bisognerà comparare il modello originario all'implementazione in Hugging Face, potrete verificare -automaticamente ogni componente, individualmente, di modo che ci sia una corrispondenza 1:1 -- avrete l'opportunità di decomporre un problema molto grande in piccoli passi, così da strutturare meglio il vostro lavoro -- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore -comprensione del modello stesso -- verso gli stage finali i test fatti componente per componente vi aiuterà ad essere sicuri di non andare avanti e indietro -nell'implementazione, così da continuare la modifica del codice senza interruzione - -Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) -per il modello ELECTRA - -Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite -compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti. -Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria -é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare -affidamento ai print statements. - -In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal -primo layer al layer finale. -É consigliato recuperare gli output dai layers, tramite print o sotto-componenti, nel seguente ordine: - -1. Recuperare gli IDs di input dati al modello -2. Recuperare i word embeddings -3. Recuperare l'input del primo Transformer layer -4. 
Recuperare l'output del primo Transformer layer -5. Recuperare l'output dei seguenti `n - 1` Transformer layers -6. Recuperare l'output dell'intero BrandNewBert Model - -Gli IDs in input dovrebbero essere un arrary di interi, *per esempio* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` - -Gli output dei seguenti layer di solito dovrebbero essere degli array di float multi-dimensionali come questo: - -``` -[[ - [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], - [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], - [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], - ..., - [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], - [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], - [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], -``` - -Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo -significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione -di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente -diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque, -é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del -modello originale di *brand_new_bert*. Di seguito vi diamo alcuni consigli per avere un ambiente di debug il piu efficiente -possibile: - -- Trovate la migliore strategia per fare debug dei risultati intermedi. Per esempio, é la repository originale scritta in PyTorch? -Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il -modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale -é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) -per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit** -quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196). -- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro -ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi. -Nel caso in cui i checkpoints siano molto grandi, e non si possa trovare di meglio, allora é buona consuetudine ricorrere -a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers -con il vostro modello -- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare -la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata -`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici -volte, *per esempio* per generare testo, come `autoregressive_sample`, `generate`. -- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare -come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il -debug da questo punto. 
Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare -gli input al modello, anziche delle stringhe in input. -- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per -via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche -i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione -sono nello stesso framework. - -La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare tutto questo per *brand_new_bert*. - - -### 5.-14. Trasferire BrandNewBert in 🤗 Transformers - -Allora cominciamo ad aggiungere un nuovo codice in 🤗 Transformers. Andate nel vostro fork clone di 🤗 Transformers: - - -```bash -cd transformers -``` - -Nel caso speciale in cui stiate aggiungendo un modello, la cui architettura sia identica a una di un modello già esistente, -dovrete solo aggiugnere uno script di conversione, come descritto [qui](#write-a-conversion-script). -In questo caso, potete riutilizzare l'intera architettura del modello gia esistente. - -Se questo non é il caso, cominciamo con il generare un nuovo modello. Avrete due opzioni: - -- `transformers-cli add-new-model-like` per aggiungere un nuovo modello come uno che gia esiste -- `transformers-cli add-new-model` per aggiungere un nuovo modello da un nostro template (questo assomigliera a BERT o Bart, in base al modello che selezionerete) - -In entrambi i casi, l'output vi darà un questionario da riempire con informazioni basi sul modello. Il secondo comando richiede di installare -un `cookiecutter` - maggiori informazioni [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model). - -**Aprire una Pull Request in main huggingface/transformers repo** - -Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)", -*per esempio* "[WIP] Aggiungere *brand_new_bert*", cosicché il team di Hugging Face possa lavorare al vostro fianco nell' -integrare il modello in 🤗 Transformers. - -Questi sarebbero gli step generali da seguire: - -1. Creare un branch dal main branch con un nome descrittivo - -```bash -git checkout -b add_brand_new_bert -``` - -2. Commit del codice automaticamente generato - -```bash -git add . -git commit -``` - -3. Fare fetch e rebase del main esistente - -```bash -git fetch upstream -git rebase upstream/main -``` - -4. Push dei cambiamenti al proprio account: - -```bash -git push -u origin a-descriptive-name-for-my-changes -``` - -5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request". -Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team -Hugging Face verrà notificato anche per i futuri cambiamenti. - -6. Cambiare la PR a draft, cliccando su "Convert to draft" alla destra della pagina della PR - -Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre, -ricordatevi di tenere aggiornato il vostro lavoro con il main esistente: - -```bash -git fetch upstream -git merge upstream/main -``` - -In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR -e discusse/risolte nella PR stessa. 
In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit -di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento -nella domanda, cosicche il team potra facilmente capire il problema o la domanda. - -Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea -dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema -é stato risolto, cliccate sul bottone "Resolve". - -In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più -domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi -di chiedere al team Hugging Face direttamente su slack o email. - - -**5. Adattare i codici per brand_new_bert** - -Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e -`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. - -Ora potete finalmente cominciare il codice :). Il codice generato in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un -modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo -agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare -questi cambi spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via... -Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere -un'idea migliore su come implementare il modello. - -**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un -codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` -fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza -del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente -instanza: - -```python -from transformers import BrandNewBertModel, BrandNewBertConfig - -model = BrandNewBertModel(BrandNewBertConfig()) -``` - -Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce -che `init()` di tutte le componenti funzioni correttamente. - - -**6. Scrivere uno script di conversione** - -Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella -repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere -lo script di conversione da zero, ma piuttosto cercate e guardate script gia esistenti in 🤗 Transformers, così da trovarne -uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso. -Non esistate a chiedre al team di Hugging Face a riguardo. 
- -- Se state convertendo un modello da TensorFlow a PyTorch, un ottimo inizio é vedere [questo script di conversione per BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91) -- Se state convertendo un modello da PyTorch a PyTorch, [lo script di conversione di BART può esservi utile](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py) - -Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch, -il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch, -chiamato `SimpleModel`: - -```python -from torch import nn - - -class SimpleModel(nn.Module): - def __init__(self): - super().__init__() - self.dense = nn.Linear(10, 10) - self.intermediate = nn.Linear(10, 10) - self.layer_norm = nn.LayerNorm(10) -``` -Ora possiamo creare un'instanza di questa definizione di modo da inizializzare a random weights: `dense`, `intermediate`, `layer_norm`. -Possiamo usare print per vedere l'architettura del modello: - -```python -model = SimpleModel() - -print(model) -``` - -Da cui si ottiene: - -``` -SimpleModel( - (dense): Linear(in_features=10, out_features=10, bias=True) - (intermediate): Linear(in_features=10, out_features=10, bias=True) - (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) -) -``` - -Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno -specifico layer possono essere visualizzati: - - -```python -print(model.dense.weight.data) -``` - -ad esempio: - -``` -tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, - -0.2077, 0.2157], - [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, - 0.2166, -0.0212], - [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, - -0.1023, -0.0447], - [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, - -0.1876, -0.2467], - [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, - 0.2577, 0.0402], - [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, - 0.2132, 0.1680], - [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, - 0.2707, -0.2509], - [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, - 0.1829, -0.1568], - [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, - 0.0333, -0.0536], - [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, - 0.2220, 0.2358]]). -``` - -Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente -layer nel checkpoint. *Per esempio* - -```python -# retrieve matching layer weights, e.g. by -# recursive algorithm -layer_name = "dense" -pretrained_weight = array_of_dense_layer - -model_pointer = getattr(model, "dense") - -model_pointer.weight.data = torch.from_numpy(pretrained_weight) -``` - -Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint -siano esattamente gli stessi e uguali in **dimensione/shape e nome**. 
Per fare questo, é **necessario** aggiungere un `assert` -per la dimensione/shape e nome: - -```python -assert ( - model_pointer.weight.shape == pretrained_weight.shape -), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" -``` - -Inoltre, dovrete fare il print sia dei nomi che dei weights per essere sicuri che siano gli stessi: - -```python -logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") -``` - -Se la dimensione o il nome non sono uguali, probabilmente avete sbagliato ad assegnare il peso nel checkpoint o nel layer costrutture di - 🤗 Transformers. - -Una dimensione sbagliata può essere dovuta ad un errore nei parameteri in `BrandNewBertConfig()`. Tuttavia, può essere anche -che l'implementazione del layer in PyTorch richieda di fare una transposizione della matrice dei weights. - -Infine, controllate **tutti** che tutti i weights inizializzati e fate print di tutti i weights del checkpoint che non sono stati -usati per l'inizializzazione, di modo da essere sicuri che il modello sia correttamente convertito. É normale che ci siano -errori nel test di conversione, fai per un errore in `BrandNewBertConfig()`, o un errore nell'architettura in 🤗 Transformers, -o un bug in `init()`. - -Questo step dev'essere fatto tramite iterazioni fino a che non si raggiungano gli stessi valori per i weights. Una volta che -il checkpoint é stato correttamente caricato in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta -`/path/to/converted/checkpoint/folder` che contenga sia -`pytorch_model.bin` che `config.json`: - -```python -model.save_pretrained("/path/to/converted/checkpoint/folder") -``` - - -**7. Implementare il forward pass** - -Una volta che i weights pretrained sono stati correttamente caricati in 🤗 Transformers, dovrete assicurarvi che il forward pass -sia correttamente implementato. [Qui](#provare-un-pretrained-checkpoint-usando-la-repo-originale), avete give creato e provato -uno script che testi il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo -usando l'implementazione in 🤗 Transformers anziché l'originale. Piu o meno lo script dovrebbe essere: - -```python -model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") -input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] -output = model(input_ids).last_hidden_states -``` - -Di solito l'output da 🤗 Transformers non é uguale uguale all'output originario, sopratto la prima volta. Non vi abbattete - -é normale! Prima di tutto assicuratevi che non ci siano errori o che non vengano segnalati degli errori nella forward pass. -Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziche `torch.float32`. -Non esistate a chiedere al team Hugging Face! - -Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente cosi da testare che gli output -siano equivalenti a una precisione di `1e-3`. Controllate che `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione -originaria. Poi, controllate che i valori in output siano identici. 
Questa é sicuramente la parte più difficile, qui una serie -di errori comuni quando gli output non sono uguali: - -- Alcuni layers non sono stati aggiunti, *ad esempio* un *activation* layer non é stato aggiunto, o ci si é scordati di una connessione -- La matrice del word embedding non é stata ripareggiata -- Ci sono degli embeddings posizionali sbagliati perché l'implementazione originaria ha un offset -- Il dropout é in azione durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che -il dropout non sia stato attivato nel forward pass, * per esempio * passate *self.training* a [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout) - -La miglior maniera per sistemare il problema é di vedere all'implementazione originaria del forward pass e in 🤗 Transformers -fianco a fianco e vedere se ci sono delle differenze. In teoria, con debug e print degli output intermedie di entrambe le -implementazioni nel forward pass nell'esatta posizione del network dovrebbe aiutarvi a vedere dove ci sono differenze tra -i due frameworks. Come prima mossa controllate che `input_ids` siano identici in entrambi gli scripts. Da lì andate fino -all'ultimo layer. Potrete notare una differenza tra le due implementazioni a quel punto. - -Una volta che lo stesso output é stato ragguingi, verificate gli output con `torch.allclose(original_output, output, atol=1e-3)`. -A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una passeggiata 😊. - - -**8. Aggiungere i test necessari per il modello** - -A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, é molto probabile che il modello non sia -del tutto ok con il design richiesto. Per essere sicuri che l'implementazione sia consona e compatibile con 🤗 Transformers é -necessario implementare dei tests. Il Cookiecutter dovrebbe fornire automaticamente dei file per test per il vostro modello, -di solito nella folder `tests/test_modeling_brand_new_bert.py`. Provate questo per verificare l'ok nei test piu comuni: - -```bash -pytest tests/test_modeling_brand_new_bert.py -``` - -Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia correttamente testato cosicchè: - -- a) La community puo capire in maniera semplice il vostro lavoro controllando tests specifici del modello *brand_new_bert*, -- b) Implementazioni future del vostro modello non rompano alcune feature importante del modello. - -Per prima cosa agguingete dei test d'integrazione. Questi sono essenziali perche fanno la stessa funzione degli scripts di -debug usati precedentemente. Un template per questi tests esiste gia nel Cookiecutter ed é sotto il nome di `BrandNewBertModelIntegrationTests`, -voi dovrete solo completarlo. Una volta che questi tests sono OK, provate: - -```bash -RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests -``` - - - -Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1` - - - -Di seguito, tutte le features che sono utili e necessarire per *brand_new_bert* devono essere testate in test separati, -contenuti in `BrandNewBertModelTester`/ `BrandNewBertModelTest`. 
spesso la gente si scorda questi test, ma ricordate che sono utili per: - - -- Aiuta gli utenti a capire il vostro codice meglio, richiamando l'attenzione su queste nuove features -- Developers e contributors futuri potranno velocemente testare nuove implementazioni del modello testanto questi casi speciali. - - -**9. Implementare il tokenizer** - -A questo punto avremo bisogno un tokenizer per *brand_new_bert*. Di solito il tokenizer é uguale ad altri modelli in 🤗 Transformers. - -É importante che troviate il file con il tokenizer originale e che lo carichiate in 🤗 Transformers. - -Per controllare che il tokenizer funzioni in modo corretto, create uno script nella repo originaria che riceva come input -una stringa e ritorni gli `input_ids`. Piu o meno questo potrebbe essere il codice: - -```python -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = model.tokenize(input_str) -``` - -Potrebbe richiedere un po' di tempo, ma guardate ancora alla repo originaria per trovare la funzione corretta del tokenizer. -A volte capita di dover riscrivere il tokenizer nella repo originaria, di modo da avere come output gli `input_ids`. -A quel punto uno script analogo é necessario in 🤗 Transformers: - -```python -from transformers import BrandNewBertTokenizer - -input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." - -tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") - -input_ids = tokenizer(input_str).input_ids -``` - -Una volta che `input_ids` sono uguali, bisogna aggiungere un test per il tokenizer. - -Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-coded test d'integrazione. - - -**10. Test end-to-end** - -Ora che avete il tokenizer, dovrete aggiungere dei test d'integrazione per l'intero workflow in `tests/test_modeling_brand_new_bert.py` in 🤗 Transformer. -Questi test devono mostrare che un significante campione text-to-text funzioni come ci si aspetta nell'implementazione di 🤗 Transformers. -*Per esempio* potreste usare dei source-to-target-translation, o un sommario di un articolo, o un domanda-risposta e cosi via. -Se nessuno dei checkpoints é stato ultra parametrizzato per task simili, allora i tests per il modello sono piu che sufficienti. -Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo anche di provare a testare su GPU. -Puo succedere che ci si scordi un `.to(self.device)` ad esempio. Se non avete accesso a GPU, il team Hugging Face puo provvedere -a testare questo aspetto per voi. - -**11. Aggiungere una Docstring** - -Siete quasi alla fine! L'ultima cosa rimasta é avere una bella docstring e una pagina doc. Il Cookiecutter dovrebbe provvedere già -un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà -per usare il vostro modello sarà dare una bella lettura al doc. Quindi proponete una documentazione chiara e concisa. É molto -utile per la community avere anche delle *Tips* per mostrare come il modello puo' essere usato. Non esitate a chiedere a Hugging Face -riguardo alle docstirng. - -Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`. -Assicuratevi che la docstring sia corretta e che includa tutti i necessari input e output. 
Abbiamo una guida dettagliata per -scrivere la documentazione e docstring. - - -**Rifattorizzare il codice** - -Perfetto! Ora che abbiamo tutto per *brand_new_bert* controllate che lo stile del codice sia ok: - -```bash -make style -``` - -E che il codice passi i quality check: - -```bash -make quality -``` - -A volte capita che manchino delle informazioninella docstring o alcuni nomi sbagliati, questo farà fallire i tests sopra. -Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi. - -Per ultimo, fare del refactoring del codice una volta che é stato creato. - -Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎 - -**12. Caricare il modello sul model hub** - -In questa ultima parte dovrete convertire e caricare il modello, con tutti i checkpoints, nel model hub e aggiungere una -model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per -avere familiarità con l'hub. Di solito in questa parte lavorate a fianco di Hugging face per decidere un nome che sia ok -per ogni checkpoint, per ottenere i permessi necessari per caricare il modello nell'organizzazione dell'autore di *brand_new_bert*. -Il metodo `push_to_hub`, presente in tutti i modelli `transformers`, é una maniera rapida e indolore per caricare il vostro checkpoint sull'hub: - -```python -brand_new_bert.push_to_hub( - repo_path_or_name="brand_new_bert", - # Uncomment the following line to push to an organization - # organization="", - commit_message="Add model", - use_temp_dir=True, -) -``` - -Vale la pena spendere un po' di tempo per creare una model card ad-hoc per ogni checkpoint. Le model cards dovrebbero -suggerire le caratteristiche specifiche del checkpoint, *per esempio* su che dataset il checkpoint é stato pretrained o fine-tuned. -O che su che genere di task il modello lavoro? E anche buona pratica includere del codice su come usare il modello correttamente. - - -**13. (Opzionale) Aggiungere un notebook** - -É molto utile aggiungere un notebook, che dimostri in dettaglio come *brand_new_bert* si utilizzi per fare inferenza e/o -fine-tuned su specifiche task. Non é una cosa obbligatoria da avere nella vostra PR, ma é molto utile per la community. - -**14. Sottomettere la PR** - -L'ultimissimo step! Ovvero il merge della PR nel main. Di solito il team Hugging face a questo punto vi avrà gia aiutato, -ma é ok prendere un po' di tempo per pulire la descirzione e commenti nel codice. - - -### Condividete il vostro lavoro!! - -É ora tempo di prendere un po' di credito dalla communità per il vostro lavoro! Caricare e implementare un nuovo modello -é un grandissimo contributo per Transformers e l'intera community NLP. Il codice e la conversione dei modelli pre-trained sara -sicuramente utilizzato da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro -traguardo con l'intera community :) - -** Avete create un altro modello che é super facile da usare per tutti quanti nella community! 🤯** diff --git a/docs/source/it/add_new_pipeline.md b/docs/source/it/add_new_pipeline.md new file mode 100644 index 000000000000..adc1c3651a2c --- /dev/null +++ b/docs/source/it/add_new_pipeline.md @@ -0,0 +1,250 @@ + + +# Come creare una pipeline personalizzata? + +In questa guida, scopriremo come creare una pipeline personalizzata e condividerla sull' [Hub](hf.co/models) o aggiungerla nella libreria +Transformers. 
+ +Innanzitutto, è necessario decidere gli input grezzi che la pipeline sarà in grado di accettare. Possono essere strings, raw bytes, +dictionaries o qualsiasi cosa sembri essere l'input desiderato più probabile. Cerca di mantenere questi input il più possibile in Python puro, +in quanto ciò facilita la compatibilità (anche con altri linguaggi tramite JSON). Questi saranno gli `inputs` della +pipeline (`preprocess`). + +Poi definire gli `outputs`. Stessa strategia degli `inputs`. Più è semplice e meglio è. Questi saranno gli output del metodo +`postprocess`. + +Si parte ereditando la classe base `Pipeline`, con i 4 metodi che bisogna implementare: `preprocess`, +`_forward`, `postprocess` e `_sanitize_parameters`. + + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +La struttura di questa suddivisione serve a supportare in modo relativamente trasparente CPU/GPU, consentendo allo stesso tempo l'esecuzione della +pre/postelaborazione sulla CPU su thread diversi. + +`preprocess` prenderà gli input originariamente definiti e li trasformerà in qualcosa che possa essere dato in pasto al modello. Potrebbe +contenere più informazioni e di solito è un `Dict`. + +`_forward` è il dettaglio dell'implementazione e non è destinato a essere chiamato direttamente. `forward` è il metodo preferito per assicurarsi che tutto funzioni correttamente perché contiene delle salvaguardie. Se qualcosa +è collegato a un modello reale, appartiene al metodo `_forward`; tutto il resto va nel preprocess/postprocess. + +`postprocess` prende l'output di `_forward` e lo trasforma nell'output finale che era stato deciso in precedenza. + +`_sanitize_parameters` esiste per consentire agli utenti di passare i parametri ogni volta che desiderano, sia a initialization time `pipeline(...., maybe_arg=4)` che al call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. + +`_sanitize_parameters` ritorna 3 dicts di kwargs che vengono passati direttamente a `preprocess`, +`_forward` e `postprocess`. Non riempire nulla se il chiamante non ha passato alcun parametro aggiuntivo. Questo +consente di mantenere gli argomenti predefiniti nella definizione della funzione, che è sempre più "naturale". + +Un esempio classico potrebbe essere l'argomento `top_k` nel post processing dei classification tasks. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +Per ottenere questo risultato, aggiorneremo il nostro metodo `postprocess` con un parametro di default pari a `5` e modificheremo +`_sanitize_parameters` per accettare questo nuovo parametro.
+ + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +Cercare di mantenere gli input/output molto semplici e idealmente serializzabili in JSON, in quanto ciò rende l'uso della pipeline molto facile +senza richiedere agli utenti di comprendere nuovi tipi di oggetti. È anche relativamente comune supportare molti tipi di argomenti +per facilitarne l'uso (ad esempio file audio, possono essere nomi di file, URL o byte puri). + +## Aggiungilo alla lista dei tasks supportati + +Per registrar il tuo `new-task` alla lista dei tasks supportati, devi aggiungerlo al `PIPELINE_REGISTRY`: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +Puoi specificare il modello di default che desideri, in questo caso dovrebbe essere accompagnato da una revisione specifica (che può essere il nome di un branch o l'hash di un commit, in questo caso abbiamo preso `"abcdef"`) e anche dal type: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +``` + +## Condividi la tua pipeline sull'Hub + +Per condividere la tua pipeline personalizzata sull'Hub, devi solo salvare il codice della tua sottoclasse `Pipeline` in un file +python. Per esempio, supponiamo di voler utilizzare una pipeline personalizzata per la classificazione delle coppie di frasi come la seguente: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +L'implementazione è agnostica al framework, e lavorerà sia con modelli PyTorch che con TensorFlow. 
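Prima di registrarla o condividerla, può essere utile una prova rapida istanziando direttamente la classe con un modello e un tokenizer già caricati. Quello che segue è solo uno schizzo indicativo: riusa il checkpoint `sgugger/finetuned-bert-mrpc` mostrato più avanti in questa guida, e l'etichetta restituita dipende dall'`id2label` del checkpoint.

```py
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint fine-tuned su MRPC, lo stesso usato più avanti in questa guida
model = AutoModelForSequenceClassification.from_pretrained("sgugger/finetuned-bert-mrpc")
tokenizer = AutoTokenizer.from_pretrained("sgugger/finetuned-bert-mrpc")

# Istanzia direttamente la pipeline personalizzata definita sopra
pair_classifier = PairClassificationPipeline(model=model, tokenizer=tokenizer)

# `second_text` viene instradato a `preprocess` tramite `_sanitize_parameters`
print(pair_classifier("I like you", second_text="I love you"))
# es. {'label': '...', 'score': 0.9, 'logits': [...]} (i valori esatti dipendono dal checkpoint)
```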
Se l'abbiamo salvato in un file chiamato `pair_classification.py`, può essere successivamente importato e registrato in questo modo: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +Una volta fatto, possiamo usarla con un modello pretrained. L'istanza `sgugger/finetuned-bert-mrpc` è stata +fine-tuned sul dataset MRPC, che classifica le coppie di frasi come parafrasi o no. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +Successivamente possiamo condividerla sull'Hub usando il metodo `save_pretrained` in un `Repository`: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +Questo codice copierà il file dove è stato definito `PairClassificationPipeline` all'interno della cartella `"test-dynamic-pipeline"`, +insieme al salvataggio del modello e del tokenizer della pipeline, prima di pushare il tutto nel repository +`{your_username}/test-dynamic-pipeline`. Dopodiché chiunque potrà utilizzarla, purché fornisca l'opzione +`trust_remote_code=True`: + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## Aggiungere la pipeline a Transformers + +Se vuoi contribuire con la tua pipeline a Transformers, dovrai aggiungere un modulo nel sottomodulo `pipelines` +con il codice della tua pipeline, quindi aggiungilo all'elenco dei tasks definiti in `pipelines/__init__.py`. + +Poi hai bisogno di aggiungere i test. Crea un nuovo file `tests/test_pipelines_MY_PIPELINE.py` con esempi ed altri test. + +La funzione `run_pipeline_test` sarà molto generica e verrà eseguita su piccoli modelli casuali per ogni possibile +architettura, come definito da `model_mapping` e `tf_model_mapping`. + +Questo è molto importante per testare la compatibilità futura, nel senso che se qualcuno aggiunge un nuovo modello di tipo +`XXXForQuestionAnswering` allora il test della pipeline tenterà di essere eseguito su di esso. Poiché i modelli sono casuali, +è impossibile controllare i valori effettivi; per questo esiste un helper `ANY`, che si limita a verificare che l'output della pipeline corrisponda al TYPE atteso. + +Hai anche *bisogno* di implementare 2 (idealmente 4) test. + +- `test_small_model_pt` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) + e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_tf`. +- `test_small_model_tf` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) + e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_pt`. +- `test_large_model_pt` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere + senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali.
In questo caso l'obiettivo è mostrare la pipeline e assicurarsi che non ci siano derive nelle versioni future +- `test_large_model_tf` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere + senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali. In questo caso l'obiettivo è mostrare la pipeline e assicurarsi + che non ci siano derive nelle versioni future \ No newline at end of file diff --git a/docs/source/it/add_new_pipeline.mdx b/docs/source/it/add_new_pipeline.mdx deleted file mode 100644 index cf9acd2902fc..000000000000 --- a/docs/source/it/add_new_pipeline.mdx +++ /dev/null @@ -1,246 +0,0 @@ - - -# Come creare una pipeline personalizzata? - -In questa guida, scopriremo come creare una pipeline personalizzata e condividerla sull' [Hub](hf.co/models) o aggiungerla nella libreria -Transformers. - -Innanzitutto, è necessario decidere gli input grezzi che la pipeline sarà in grado di accettare. Possono essere strings, raw bytes, -dictionaries o qualsiasi cosa sia l'input desiderato più probabile. Cerca di mantenere questi input il più possibile in Python -in quanto facilita la compatibilità (anche con altri linguaggi tramite JSON). Questi saranno gli `inputs` della -pipeline (`preprocess`). - -Poi definire gli `outputs`. Stessa strategia degli `inputs`. Più è seplice e meglio è. Questi saranno gli output del metodo -`postprocess`. - -Si parte ereditando la classe base `Pipeline`. con i 4 metodi che bisogna implementare `preprocess`, -`_forward`, `postprocess` e `_sanitize_parameters`. - - -```python -from transformers import Pipeline - - -class MyPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} - - def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Maybe {"logits": Tensor(...)} - return outputs - - def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class -``` - -La struttura di questa suddivisione consiste nel supportare in modo relativamente continuo CPU/GPU, supportando allo stesso tempo l'esecuzione di -pre/postelaborazione sulla CPU su thread diversi. - -`preprocess` prenderà gli input originariamente definiti e li trasformerà in qualcosa di alimentabile dal modello. Potrebbe -contenere più informazioni e di solito è un `Dict`. - -`_forward` è il dettaglio dell'implementazione e non è destinato a essere chiamato direttamente. `forward` è il metodo preferito per assicurarsi che tutto funzioni correttamente perchè contiene delle slavaguardie. Se qualcosa è -è collegato a un modello reale, appartiene al metodo `_forward`, tutto il resto è nel preprocess/postprocess. - -`postprocess` prende l'otput di `_forward` e lo trasforma nell'output finale che era stato deciso in precedenza. - -`_sanitize_parameters` esiste per consentire agli utenti di passare i parametri ogni volta che desiderano sia a inizialization time `pipeline(...., maybe_arg=4)` che al call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - -`_sanitize_parameters` ritorna 3 dicts di kwargs che vengono passati direttamente a `preprocess`, -`_forward` e `postprocess`. Non riempire nulla se il chiamante non ha chiamato con alcun parametro aggiuntivo. 
Questo -consente di mantenere gli argomenti predefiniti nella definizione della funzione, che è sempre più "naturale". - -Un esempio classico potrebbe essere l'argomento `top_k` nel post processing dei classification tasks. - -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] - ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] -``` - -In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit -`_sanitize_parameters` to allow this new parameter. - - -```python -def postprocess(self, model_outputs, top_k=5): - best_class = model_outputs["logits"].softmax(-1) - # Add logic to handle top_k - return best_class - - -def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - - postprocess_kwargs = {} - if "top_k" in kwargs: - postprocess_kwargs["top_k"] = kwargs["top_k"] - return preprocess_kwargs, {}, postprocess_kwargs -``` - -Cercare di mantenere gli input/output molto semplici e idealmente serializzabili in JSON, in quanto ciò rende l'uso della pipeline molto facile -senza richiedere agli utenti di comprendere nuovi tipi di oggetti. È anche relativamente comune supportare molti tipi di argomenti -per facilitarne l'uso (ad esempio file audio, possono essere nomi di file, URL o byte puri). - -## Aggiungilo alla lista dei tasks supportati - -Per registrar il tuo `new-task` alla lista dei tasks supportati, devi aggiungerlo al `PIPELINE_REGISTRY`: - -```python -from transformers.pipelines import PIPELINE_REGISTRY - -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, -) -``` - -Puoi specificare il modello di default che desideri, in questo caso dovrebbe essere accompagnato da una revisione specifica (che può essere il nome di un branch o l'hash di un commit, in questo caso abbiamo preso `"abcdef"`) e anche dal type: - -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # current support type: text, audio, image, multimodal -) -``` - -## Condividi la tua pipeline sull'Hub - -Per condividere la tua pipeline personalizzata sull'Hub, devi solo salvare il codice della tua sottoclasse `Pipeline` in un file -python. 
Per esempio, supponiamo di voler utilizzare una pipeline personalizzata per la classificazione delle coppie di frasi come la seguente: - -```py -import numpy as np - -from transformers import Pipeline - - -def softmax(outputs): - maxes = np.max(outputs, axis=-1, keepdims=True) - shifted_exp = np.exp(outputs - maxes) - return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - - -class PairClassificationPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "second_text" in kwargs: - preprocess_kwargs["second_text"] = kwargs["second_text"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, second_text=None): - return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) - - def _forward(self, model_inputs): - return self.model(**model_inputs) - - def postprocess(self, model_outputs): - logits = model_outputs.logits[0].numpy() - probabilities = softmax(logits) - - best_class = np.argmax(probabilities) - label = self.model.config.id2label[best_class] - score = probabilities[best_class].item() - logits = logits.tolist() - return {"label": label, "score": score, "logits": logits} -``` - -L'implementazione è agnostica al framework, e lavorerà sia con modelli PyTorch che con TensorFlow. Se l'abbiamo salvato in un file chiamato `pair_classification.py`, può essere successivamente importato e registrato in questo modo: - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -Una volta fatto, possiamo usarla con un modello pretrained. L'istanza `sgugger/finetuned-bert-mrpc` è stata -fine-tuned sul dataset MRPC, che classifica le coppie di frasi come parafrasi o no. - -```py -from transformers import pipeline - -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Successivamente possiamo condividerlo sull'Hub usando il metodo `save_pretrained` in un `Repository`: - -```py -from huggingface_hub import Repository - -repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") -classifier.save_pretrained("test-dynamic-pipeline") -repo.push_to_hub() -``` - -Questo codice copierà il file dove è stato definitp `PairClassificationPipeline` all'interno della cartella `"test-dynamic-pipeline"`, -insieme al salvataggio del modello e del tokenizer della pipeline, prima di pushare il tutto nel repository -`{your_username}/test-dynamic-pipeline`. Dopodiché chiunque potrà utilizzarlo, purché fornisca l'opzione -`trust_remote_code=True`: - -```py -from transformers import pipeline - -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) -``` - -## Aggiungere la pipeline a Transformers - -Se vuoi contribuire con la tua pipeline a Transformers, dovrai aggiungere un modulo nel sottomodulo `pipelines` -con il codice della tua pipeline, quindi aggiungilo all'elenco dei tasks definiti in `pipelines/__init__.py`. - -Poi hai bisogno di aggiungere i test. Crea un nuovo file `tests/test_pipelines_MY_PIPELINE.py` con esempi ed altri test. 
- -La funzione `run_pipeline_test` sarà molto generica e su piccoli modelli casuali su ogni possibile -architettura, come definito da `model_mapping` e `tf_model_mapping`. - -Questo è molto importante per testare la compatibilità futura, nel senso che se qualcuno aggiunge un nuovo modello di -`XXXForQuestionAnswering` allora il test della pipeline tenterà di essere eseguito su di esso. Poiché i modelli sono casuali, è -è impossibile controllare i valori effettivi, per questo esiste un aiuto `ANY` che tenterà solamente di far corrispondere l'output della pipeline TYPE. - -Hai anche *bisogno* di implementare 2 (idealmente 4) test. - -- `test_small_model_pt` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) - e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_tf`. -- `test_small_model_tf` : Definire 1 piccolo modello per questa pipeline (non importa se i risultati non hanno senso) - e testare i risultati della pipeline. I risultati dovrebbero essere gli stessi di `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere - senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali. In questo caso l'obiettivo è mostrare la pipeline e assicurarsi che non ci siano derive nelle versioni future -- `test_large_model_tf` (`optional`): Testare la pipeline su una pipeline reale in cui i risultati dovrebbero avere - senso. Questi test sono lenti e dovrebbero essere contrassegnati come tali. In questo caso l'obiettivo è mostrare la pipeline e assicurarsi - che non ci siano derive nelle versioni future \ No newline at end of file diff --git a/docs/source/it/autoclass_tutorial.md b/docs/source/it/autoclass_tutorial.md new file mode 100644 index 000000000000..51621d098302 --- /dev/null +++ b/docs/source/it/autoclass_tutorial.md @@ -0,0 +1,123 @@ + + +# Carica istanze pre-allenate con AutoClass + +Con così tante architetture Transformer differenti, può essere sfidante crearne una per il tuo checkpoint. Come parte della filosofia centrale di 🤗 Transformers per rendere la libreria facile, semplice e flessibile da utilizzare, una `AutoClass` inferisce e carica automaticamente l'architettura corretta da un dato checkpoint. Il metodo `from_pretrained` ti permette di caricare velocemente un modello pre-allenato per qualsiasi architettura, così non devi utilizzare tempo e risorse per allenare un modello da zero. Produrre questo codice agnostico ai checkpoint significa che se il tuo codice funziona per un checkpoint, funzionerà anche per un altro checkpoint, purché sia stato allenato per un compito simile, anche se l'architettura è differente. + + + +Ricorda, con architettura ci si riferisce allo scheletro del modello e con checkpoint ai pesi di una determinata architettura. Per esempio, [BERT](https://huggingface.co/bert-base-uncased) è un'architettura, mentre `bert-base-uncased` è un checkpoint. Modello è un termine generale che può significare sia architettura che checkpoint. + + + +In questo tutorial, imparerai a: + +* Caricare un tokenizer pre-allenato. +* Caricare un estrattore di caratteristiche (feature extractor, in inglese) pre-allenato. +* Caricare un processore pre-allenato. +* Caricare un modello pre-allenato. + +## AutoTokenizer + +Quasi tutti i compiti di NLP iniziano con un tokenizer. Un tokenizer converte il tuo input in un formato che possa essere elaborato dal modello. 
+ +Carica un tokenizer con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") +``` + +Poi tokenizza il tuo input come mostrato in seguito: + +```py +>>> sequenza = "In un buco nel terreno viveva uno Hobbit." +>>> print(tokenizer(sequenza)) +{'input_ids': [0, 360, 51, 373, 587, 1718, 54644, 22597, 330, 3269, 2291, 22155, 18, 5, 2], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoFeatureExtractor + +Per compiti inerenti a audio e video, un feature extractor processa il segnale audio o l'immagine nel formato di input corretto. + +Carica un feature extractor con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor + +Compiti multimodali richiedono un processore che combini i due tipi di strumenti di elaborazione. Per esempio, il modello [LayoutLMV2](model_doc/layoutlmv2) richiede un feature extractor per gestire le immagine e un tokenizer per gestire il testo; un processore li combina entrambi. + +Carica un processore con [`AutoProcessor.from_pretrained`]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel + + + +Infine, le classi `AutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`AutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `AutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. + + + +Infine, le classi `TFAutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`TFAutoModelForSequenceClassification.from_pretrained`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `TFAutoModelFor` per caricare istanze pre-allenate dei modelli. 
Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. + + diff --git a/docs/source/it/autoclass_tutorial.mdx b/docs/source/it/autoclass_tutorial.mdx deleted file mode 100644 index 88dd6cad6c42..000000000000 --- a/docs/source/it/autoclass_tutorial.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - -# Carica istanze pre-allenate con AutoClass - -Con così tante architetture Transformer differenti, può essere sfidante crearne una per il tuo checkpoint. Come parte della filosofia centrale di 🤗 Transformers per rendere la libreria facile, semplice e flessibile da utilizzare, una `AutoClass` inferisce e carica automaticamente l'architettura corretta da un dato checkpoint. Il metodo `from_pretrained` ti permette di caricare velocemente un modello pre-allenato per qualsiasi architettura, così non devi utilizzare tempo e risorse per allenare un modello da zero. Produrre questo codice agnostico ai checkpoint significa che se il tuo codice funziona per un checkpoint, funzionerà anche per un altro checkpoint, purché sia stato allenato per un compito simile, anche se l'architettura è differente. - - - -Ricorda, con architettura ci si riferisce allo scheletro del modello e con checkpoint ai pesi di una determinata architettura. Per esempio, [BERT](https://huggingface.co/bert-base-uncased) è un'architettura, mentre `bert-base-uncased` è un checkpoint. Modello è un termine generale che può significare sia architettura che checkpoint. - - - -In questo tutorial, imparerai a: - -* Caricare un tokenizer pre-allenato. -* Caricare un estrattore di caratteristiche (feature extractor, in inglese) pre-allenato. -* Caricare un processore pre-allenato. -* Caricare un modello pre-allenato. - -## AutoTokenizer - -Quasi tutti i compiti di NLP iniziano con un tokenizer. Un tokenizer converte il tuo input in un formato che possa essere elaborato dal modello. - -Carica un tokenizer con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") -``` - -Poi tokenizza il tuo input come mostrato in seguito: - -```py ->>> sequenza = "In un buco nel terreno viveva uno Hobbit." ->>> print(tokenizer(sequenza)) -{'input_ids': [0, 360, 51, 373, 587, 1718, 54644, 22597, 330, 3269, 2291, 22155, 18, 5, 2], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoFeatureExtractor - -Per compiti inerenti a audio e video, un feature extractor processa il segnale audio o l'immagine nel formato di input corretto. - -Carica un feature extractor con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Compiti multimodali richiedono un processore che combini i due tipi di strumenti di elaborazione. Per esempio, il modello [LayoutLMV2](model_doc/layoutlmv2) richiede un feature extractor per gestire le immagine e un tokenizer per gestire il testo; un processore li combina entrambi. 
- -Carica un processore con [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -Infine, le classi `AutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`AutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `AutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. - - - -Infine, le classi `TFAutoModelFor` ti permettono di caricare un modello pre-allenato per un determinato compito (guarda [qui](model_doc/auto) per una lista completa di compiti presenti). Per esempio, carica un modello per la classificazione di sequenze con [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") -``` - -Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `TFAutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning. - - diff --git a/docs/source/it/big_models.md b/docs/source/it/big_models.md new file mode 100644 index 000000000000..cd0fd9017d9d --- /dev/null +++ b/docs/source/it/big_models.md @@ -0,0 +1,123 @@ + + +# Istanziare un big model + +Quando vuoi utilizzare un modello preaddestrato (pretrained) molto grande, una sfida è minimizzare l'uso della RAM. Il workflow classico +in PyTorch è: + +1. Crea il tuo modello con pesi casuali (random weights). +2. Carica i tuoi pesi preaddestrati. +3. Inserisci i pesi preaddestrati nel tuo modello casuale. + +I passi 1 e 2 una versione completa del modello in memoria, in molti casi non è un problema, ma se il modello inizia a pesare diversi GigaBytes, queste due copie possono sturare la nostra RAM. Ancora peggio, se stai usando `torch.distributed` per seguire l'addestramento (training) in distribuito, ogni processo caricherà il modello preaddestrato e memorizzerà queste due copie nella RAM. 
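Per fissare le idee, ecco uno schizzo minimale di questo workflow classico in puro PyTorch (non è il codice interno di 🤗 Transformers, solo un esempio indicativo): si assume di avere già scaricato in locale un file di pesi `pytorch_model.bin` compatibile con l'architettura scelta. Tra il passo 2 e il passo 3 restano in memoria, allo stesso tempo, sia i pesi casuali del modello sia lo `state_dict` appena caricato.

```py
import torch
from transformers import AutoConfig, AutoModel

# 1. Crea il modello con pesi casuali (prima copia completa dei pesi in RAM)
config = AutoConfig.from_pretrained("bert-base-cased")
model = AutoModel.from_config(config)

# 2. Carica i pesi preaddestrati da disco (seconda copia completa in RAM)
#    Si assume che "pytorch_model.bin" sia già stato scaricato in locale.
state_dict = torch.load("pytorch_model.bin", map_location="cpu")

# 3. Inserisci i pesi preaddestrati nel modello inizializzato casualmente
model.load_state_dict(state_dict)
```

Con un modello da svariati GigaBytes è proprio questa doppia copia, moltiplicata per il numero di processi quando si usa `torch.distributed`, a saturare la RAM.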
+ + + +Nota che il modello creato casualmente è inizializzato con tensori "vuoti", che occupano spazio in memoria senza riempirlo (quindi i valori casuali sono quelli che si trovavano in questa porzione di memoria in un determinato momento). L'inizializzazione casuale che segue la distribuzione appropriata per il tipo di modello/parametri istanziato (come ad esempio la distribuzione normale) è eseguita solo dopo il passaggio 3, sui pesi non inizializzati, in modo da essere il più veloce possibile! + + + +In questa guida, esploreremo le soluzioni che Transformers offre per affrontare questo problema. Tieni presente che questa è un'area in attivo sviluppo, quindi le API spiegate qui possono variare velocemente in futuro. + +## Checkpoints condivisi + +Dalla versione 4.18.0, i checkpoints dei modelli che occupano più di 10GB di spazio vengono automaticamente frammentati in più parti. Quindi, invece di avere un unico checkpoint quando si utilizza `model.save_pretrained(save_dir)`, si ottengono diversi checkpoint parziali (ognuno con dimensione < 10GB) e un indice che mappa i nomi dei parametri ai file in cui sono memorizzati. + +Puoi controllare la dimensione massima di ogni frammento con il parametro `max_shard_size`; nel prossimo esempio useremo un modello di dimensioni normali con frammenti di piccole dimensioni: prendiamo un classico modello BERT. + +```py +from transformers import AutoModel + +model = AutoModel.from_pretrained("bert-base-cased") +``` + +Se lo salvi usando [`~PreTrainedModel.save_pretrained`], avrai una nuova cartella con due file: il config del modello e i suoi pesi: + +```py +>>> import os +>>> import tempfile + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir) +... print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model.bin'] +``` + +Adesso usiamo una dimensione massima di frammentazione di 200MB: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] +``` + +In aggiunta alla configurazione del modello, vediamo tre differenti file dei pesi e un file `index.json` che è il nostro indice. Un checkpoint può essere ricaricato totalmente usando il metodo [`~PreTrainedModel.from_pretrained`]: + +```py +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... new_model = AutoModel.from_pretrained(tmp_dir) +``` + +Il vantaggio principale di questo approccio per i modelli grandi è che, durante il passo 2 del workflow illustrato in precedenza, ogni frammento del checkpoint viene caricato dopo il precedente, limitando l'utilizzo della RAM alla dimensione del modello più la dimensione del frammento più grande. + +Dietro le quinte, il file indice è utilizzato per determinare quali chiavi sono nel checkpoint e dove i corrispondenti pesi sono memorizzati. Possiamo caricare l'indice come un qualsiasi json e ottenere un dizionario: + +```py +>>> import json + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: +...
index = json.load(f) + +>>> print(index.keys()) +dict_keys(['metadata', 'weight_map']) +``` + +I metadati consistono solo nella dimensione totale del modello per ora. Abbiamo in programma di aggiungere altre informazioni in futuro: + +```py +>>> index["metadata"] +{'total_size': 433245184} +``` + +La mappa dei pesi è la parte principale di questo indice, che mappa ogni nome dei parametri (si trova solitamente nei modelli PyTorch come `state_dict`) al file in cui è memorizzato: + +```py +>>> index["weight_map"] +{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', + 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', + ... +``` + +Se vuoi caricare direttamente un checkpoint frammentato in un modello senza usare [`~PreTrainedModel.from_pretrained`] (come si farebbe con `model.load_state_dict()` per un checkpoint completo) devi usare [`~modeling_utils.load_sharded_checkpoint`]: + +```py +>>> from transformers.modeling_utils import load_sharded_checkpoint + +>>> with tempfile.TemporaryDirectory() as tmp_dir: +... model.save_pretrained(tmp_dir, max_shard_size="200MB") +... load_sharded_checkpoint(model, tmp_dir) +``` + +## Caricamento low memory + +Frammentare i checkpoint l'utilizzo di memoria al passo 2 del workflow citato in precedenza, ma per utilizzare questo modello in un ambiente con poca memoria, consigliamo di utilizzare i nostri strumenti basati sulla libreria Accelerate. + +Per ulteriori informazioni, leggere la seguente guida: [Large model loading using Accelerate](./main_classes/model#large-model-loading) \ No newline at end of file diff --git a/docs/source/it/community.md b/docs/source/it/community.md new file mode 100644 index 000000000000..2f3c0c8a82b4 --- /dev/null +++ b/docs/source/it/community.md @@ -0,0 +1,68 @@ + + +# Comunità + +Questa pagina raggruppa le risorse sviluppate dalla comunità riguardo 🤗 Transformers. + +## Risorse della comunità: + +| Risorsa | Descrizione | Autore | +|:----------|:-------------|------:| +| [Glossario delle Flashcards di Transformers](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | Un insieme di flashcards basate sul [glossario della documentazione di Transformers](glossary), creato in un formato tale da permettere un facile apprendimento e revisione usando [Anki](https://apps.ankiweb.net/), un'applicazione open-source e multi-piattaforma, specificatamente progettata per ricordare informazioni nel lungo termine. Guarda questo [video introduttivo su come usare le flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## Notebook della comunità: + +| Notebook | Descrizione | Autore | | +|:----------|:-------------|:-------------|------:| +| [Fine-tuning di un Transformer pre-addestrato, al fine di generare testi di canzoni](https://github.com/AlekseyKorshuk/huggingartists) | Come generare testi di canzoni nello stile del vostro artista preferito attraverso il fine-tuning di un modello GPT-2. | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Addestramento di T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | Come addestrare T5 per qualsiasi attività usando Tensorflow 2. Questo notebook mostra come risolvere l'attività di "Question Answering" usando Tensorflow 2 e SQUAD. 
| [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Addestramento di T5 con TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | Come addestrare T5 su SQUAD con Transformers e NLP. | [Suraj Patil](https://github.com/patil-suraj) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tuning di T5 per la classificazione e scelta multipla](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | Come effettuare il fine-tuning di T5 per le attività di classificazione a scelta multipla - usando un formato testo-a-testo - con PyTorch Lightning. | [Suraj Patil](https://github.com/patil-suraj) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tuning di DialoGPT su nuovi dataset e lingue](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | Come effettuare il fine-tuning di un modello DialoGPT su un nuovo dataset per chatbots conversazionali open-dialog. | [Nathan Cooper](https://github.com/ncoop57) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Modellamento di una lunga sequenza con Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | Come addestrare su sequenze di lunghezza fino a 500 mila token con Reformer. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tuning di BART per riassumere testi](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | Come effettuare il fine-tuning di BART per riassumere testi con fastai usando blurr. | [Wayde Gilliam](https://ohmeow.com/) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | +| [Fine-tuning di un Transformer pre-addestrato su tweet](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | Come generare tweet nello stile del tuo account Twitter preferito attraverso il fine-tuning di un modello GPT-2. | [Boris Dayma](https://github.com/borisdayma) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Ottimizzazione di modelli 🤗 Hugging Face con Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | Un tutorial completo che mostra l'integrazione di W&B con Hugging Face. 
| [Boris Dayma](https://github.com/borisdayma) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Longformer pre-addestrato](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | Come costruire una versione "long" degli esistenti modelli pre-addestrati. | [Iz Beltagy](https://beltagy.net) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tuning di Longformer per QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | Come effettuare il fine-tuning di un modello longformer per un task di QA.| [Suraj Patil](https://github.com/patil-suraj) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Valutazione di modelli con 🤗NLP](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | Come valutare longformer su TriviaQA con `NLP`. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tuning di T5 per Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | Come effettuare il fine-tuning di T5 per la sentiment span extraction - usando un formato testo-a-testo - con PyTorch Lightning. | [Lorenzo Ampil](https://github.com/enzoampil) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tuning di DistilBert per la classificazione multi-classe](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | Come effettuare il fine-tuning di DistilBert per la classificazione multi-classe con PyTorch. | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tuning di BERT per la classificazione multi-etichetta](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|Come effettuare il fine-tuning di BERT per la classificazione multi-etichetta con PyTorch. |[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Accelerazione del fine-tuning con il Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)| Come velocizzare il fine-tuning di un fattore 2X usando il dynamic padding / bucketing. 
|[Michael Benesty](https://github.com/pommedeterresautee) |[![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pre-addestramento di Reformer per Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| Come addestrare un modello Reformer usando livelli di self-attention bi-direzionali.| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Espansione e fine-tuning di Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| Come incrementare il vocabolario di un modello SciBERT - pre-addestrato da AllenAI sul dataset CORD - e crearne una pipeline. | [Tanmay Thakur](https://github.com/lordtt13) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine-tuning di BlenderBotSmall per riassumere testi usando Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| Come effettuare il fine-tuning di BlenderBotSmall per riassumere testi su un dataset personalizzato, usando Trainer API. | [Tanmay Thakur](https://github.com/lordtt13) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tuning di Electra e interpretazione con Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | Come effettuare il fine-tuning di Electra per l'analisi dei sentimenti e intepretare le predizioni con Captum Integrated Gradients. | [Eliza Szczechla](https://elsanns.github.io) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[Fine-tuning di un modello GPT-2 non inglese con la classe Trainer](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | Come effettuare il fine-tuning di un modello GPT-2 non inglese con la classe Trainer. | [Philipp Schmid](https://www.philschmid.de) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tuning di un modello DistilBERT per la classficazione multi-etichetta](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | Come effettuare il fine-tuning di un modello DistilBERT per l'attività di classificazione multi-etichetta. 
| [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tuning di ALBERT per la classifcazione di coppie di frasi](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | Come effettuare il fine-tuning di un modello ALBERT - o un altro modello BERT-based - per l'attività di classificazione di coppie di frasi. | [Nadir El Manouzi](https://github.com/NadirEM) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tuning di Roberta per l'analisi di sentimenti](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | Come effettuare il fine-tuning di un modello Roberta per l'analisi di sentimenti. | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Valutazione di modelli che generano domande](https://github.com/flexudy-pipe/qugeev) | Quanto sono accurante le risposte alle domande generate dal tuo modello transformer seq2seq? | [Pascal Zoleko](https://github.com/zolekode) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classificazione di testo con DistilBERT e Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | Come effettuare il fine-tuning di DistilBERT per la classificazione di testo in TensorFlow. | [Peter Bayerle](https://github.com/peterbayerle) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Utilizzo di BERT per riassumere testi con un modello Encoder-Decoder su CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* attraverso l'utilizzo di un checkpoint *bert-base-uncased* per riassumere testi su CNN/Dailymail. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Utilizzo di RoBERTa per riassumere testi con un modello Encoder-Decoder su BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* (condiviso) attraverso l'utilizzo di un checkpoint *roberta-base* per riassumere testi su BBC/XSum. 
| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tuning di TAPAS su Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | Come effettuare il fine-tuning di un modello *TapasForQuestionAnswering* attraverso l'utilizzo di un checkpoint *tapas-base* sul dataset Sequential Question Answering (SQA). | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Valutazione di TAPAS su Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | Come valutare un modello *TapasForSequenceClassification* - fine-tuned con un checkpoint *tapas-base-finetuned-tabfact* - usando una combinazione delle librerie 🤗 datasets e 🤗 transformers. | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning di mBART per la traduzione](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Come effettuare il fine-tuning di mBART usando Seq2SeqTrainer per la traduzione da hindi a inglese.| [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tuning di LayoutLM su FUNSD (un dataset per la comprensione della forma)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | Come effettuare il fine-tuning di un modello *LayoutLMForTokenClassification* sul dataset FUNSD per l'estrazione di informazioni da documenti scannerizzati.| [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-tuning di DistilGPT2 e generazione di testo](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | Come effettuare il fine-tuning di DistilGPT2 e generare testo. | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-tuning di LED fino a 8 mila token](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | Come effettuare il fine-tuning di LED su PubMed per riassumere "lunghi" testi. 
| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Valutazione di LED su Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | Come valutare efficacemente LED sull'attività di riassumere "lunghi" testi. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tuning di LayoutLM su RVL-CDIP, un dataset per la classificazione di documenti (immagini)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | Come effettuare il fine-tuning di un modello *LayoutLMForSequenceClassification* sul dataset RVL-CDIP per la classificazione di documenti scannerizzati. | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Decodifica Wav2Vec2 CTC con variazioni di GPT2](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | Come decodificare sequenze CTC, variate da modelli di linguaggio. | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing) +|[Fine-tuning di BART per riassumere testi in due lingue con la classe Trainer](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | Come effettuare il fine-tuning di BART per riassumere testi in due lingue usando la classe Trainer. | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Valutazione di Big Bird su Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | Come valutare BigBird su question answering di "lunghi" documenti attraverso Trivia QA. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Creazione di sottotitoli per video usando Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | Come creare sottotitoli per qualsiasi video di YouTube trascrivendo l'audio con Wav2Vec. 
| [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tuning di Vision Transformer su CIFAR-10 usando PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | Come effettuare il fine-tuning di Vision Transformer (ViT) su CIFAR-10 usando HuggingFace Transformers, Datasets e PyTorch Lightning.| [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tuning di Vision Transformer su CIFAR-10 usando 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | Come effettuare il fine-tuning di Vision Transformer (ViT) su CIFAR-10 usando HuggingFace Transformers, Datasets e 🤗 Trainer. | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Valutazione di LUKE su Open Entity, un dataset di entity typing](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | Come valutare un modello *LukeForEntityClassification* sul dataset Open Entity. | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Valutazione di LUKE su TACRED, un dataset per l'estrazione di relazioni](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | Come valutare un modello *LukeForEntityPairClassification* sul dataset TACRED. | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Valutazione di LUKE su CoNLL-2003, un importante benchmark NER](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | Come valutare un modello *LukeForEntitySpanClassification* sul dataset CoNLL-2003. | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Valutazione di BigBird-Pegasus su dataset PubMed](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | Come valutare un modello *BigBirdPegasusForConditionalGeneration* su dataset PubMed. 
| [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Classificazione di emozioni dal discorso con Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | Come utilizzare un modello pre-addestrato Wav2Vec2 per la classificazione di emozioni sul dataset MEGA. | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Rilevamento oggetti in un'immagine con DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | Come usare un modello addestrato *DetrForObjectDetection* per rilevare oggetti in un'immagine e visualizzare l'attention. | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tuning di DETR su un dataset personalizzato per rilevare oggetti](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | Come effettuare fine-tuning di un modello *DetrForObjectDetection* su un dataset personalizzato per rilevare oggetti. | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Fine-tuning di T5 per Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | Come effettuare fine-tuning di *T5* per un'attività di Named Entity Recognition. | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/it/converting_tensorflow_models.md b/docs/source/it/converting_tensorflow_models.md new file mode 100644 index 000000000000..04398636359c --- /dev/null +++ b/docs/source/it/converting_tensorflow_models.md @@ -0,0 +1,159 @@ + + +# Convertire checkpoint di Tensorflow + +È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM +in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria. + + + +A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione +di transformers >=2.3.0. + +La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.
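+
+A puro titolo indicativo (questo schema non sostituisce gli esempi per modello riportati sotto), la forma generale del comando è la seguente; i percorsi sono segnaposto da adattare ai tuoi file e, per alcune architetture, `--config` e `--finetuning_task_name` sono opzionali:
+
+```bash
+# Schema generale con percorsi segnaposto: vedi le sezioni seguenti per i valori esatti di ogni architettura
+transformers-cli convert --model_type <tipo_modello> \
+  --tf_checkpoint /percorso/al/checkpoint/tensorflow \
+  --config /percorso/alla/config.json \
+  --pytorch_dump_output /percorso/di/output/pytorch_model.bin
+```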
+ + + +## BERT + +Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare +[i modelli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models)) +in un file di salvataggio Pytorch utilizzando lo script +[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). + +Questo CLI prende come input un checkpoint di Tensorflow (tre file che iniziano con `bert_model.ckpt`) ed il relativo +file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal +checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che +può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel +[quicktour](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). + +Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodiché, potrai tralasciare +il checkpoint di Tensorflow (i tre file che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione +(`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch. + +Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch +(`pip install tensorflow`). Il resto della repository richiede soltanto Pytorch. + +Questo è un esempio del processo di conversione per un modello `BERT-Base Uncased` pre-allenato: + +```bash +export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 +transformers-cli convert --model_type bert \ + --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ + --config $BERT_BASE_DIR/bert_config.json \ + --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin +``` + +Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https://github.com/google-research/bert#pre-trained-models). + +## ALBERT + +Per il modello ALBERT, converti checkpoint di Tensorflow in Pytorch utilizzando lo script +[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). + +Il CLI prende come input un checkpoint di Tensorflow (tre file che iniziano con `model.ckpt-best`) e i relativi file di +configurazione (`albert_config.json`), dopodiché crea e salva un modello Pytorch. Per lanciare questa conversione +avrai bisogno di un'installazione di Tensorflow e di Pytorch. + +Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato: + +```bash +export ALBERT_BASE_DIR=/path/to/albert/albert_base +transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ + --config $ALBERT_BASE_DIR/albert_config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin +``` + +Puoi scaricare i modelli pre-allenati di Google per la conversione [qui](https://github.com/google-research/albert#pre-trained-models).
+ +## OpenAI GPT + +Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenato, assumendo che il tuo checkpoint di NumPy +sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)): +```bash +export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights +transformers-cli convert --model_type gpt \ + --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT_CONFIG] \ + [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] +``` + +## OpenAI GPT-2 + +Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)): + +```bash +export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights +transformers-cli convert --model_type gpt2 \ + --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config OPENAI_GPT2_CONFIG] \ + [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] +``` + +## Transformer-XL + + +Ecco un esempio del processo di conversione di un modello Transformer-XL pre-allenato +(vedi [qui](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): + +```bash +export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint +transformers-cli convert --model_type transfo_xl \ + --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config TRANSFO_XL_CONFIG] \ + [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] +``` + +## XLNet + +Ecco un esempio del processo di conversione di un modello XLNet pre-allenato: + +```bash +export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint +export XLNET_CONFIG_PATH=/path/to/xlnet/config +transformers-cli convert --model_type xlnet \ + --tf_checkpoint $XLNET_CHECKPOINT_PATH \ + --config $XLNET_CONFIG_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--finetuning_task_name XLNET_FINETUNED_TASK] +``` + +## XLM + +Ecco un esempio del processo di conversione di un modello XLM pre-allenato: + +```bash +export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint +transformers-cli convert --model_type xlm \ + --tf_checkpoint $XLM_CHECKPOINT_PATH \ + --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ + [--config XLM_CONFIG] \ + [--finetuning_task_name XLM_FINETUNED_TASK] +``` + +## T5 + +Ecco un esempio del processo di conversione di un modello T5 pre-allenato: + +```bash +export T5=/path/to/t5/uncased_L-12_H-768_A-12 +transformers-cli convert --model_type t5 \ + --tf_checkpoint $T5/t5_model.ckpt \ + --config $T5/t5_config.json \ + --pytorch_dump_output $T5/pytorch_model.bin +``` diff --git a/docs/source/it/converting_tensorflow_models.mdx b/docs/source/it/converting_tensorflow_models.mdx deleted file mode 100644 index b9b30a315c6a..000000000000 --- a/docs/source/it/converting_tensorflow_models.mdx +++ /dev/null @@ -1,155 +0,0 @@ - - -# Convertire checkpoint di Tensorflow - -È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM -in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria. - - - -A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione -di transformers >=2.3.0. - -La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.
- - - -## BERT - -Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare -[i modeli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models)) -in un file di salvataggio Pytorch utilizzando lo script -[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py). - -Questo CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `bert_model.ckpt`) ed il relativo -file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal -checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che -può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel -[quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ). - -Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodichè, potrai tralasciare -il checkpoint di Tensorflow (i tre files che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione -(`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch. - -Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch -(`pip install tensorflow`). Il resto della repository richiede soltanto Pytorch. - -Questo è un esempio del processo di conversione per un modello `BERT-Base Uncased` pre-allenato: - -```bash -export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 -transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin -``` - -Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https://github.com/google-research/bert#pre-trained-models). - -## ALBERT - -Per il modello ALBERT, converti checkpoint di Tensoflow in Pytorch utilizzando lo script -[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). - -Il CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `model.ckpt-best`) e i relativi file di -configurazione (`albert_config.json`), dopodichè crea e salva un modello Pytorch. Per lanciare questa conversione -avrai bisogno di un'installazione di Tensorflow e di Pytorch. - -Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato: - -```bash -export ALBERT_BASE_DIR=/path/to/albert/albert_base -transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin -``` - -Puoi scaricare i modelli pre-allenati di Google per la conversione [qui](https://github.com/google-research/albert#pre-trained-models). 
- -## OpenAI GPT - -Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenato, assumendo che il tuo checkpoint di NumPy -sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)): -```bash -export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights -transformers-cli convert --model_type gpt \ - --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT_CONFIG] \ - [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \ -``` - -## OpenAI GPT-2 - -Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)): - -```bash -export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights -transformers-cli convert --model_type gpt2 \ - --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config OPENAI_GPT2_CONFIG] \ - [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK] -``` - -## Transformer-XL - - -Ecco un esempio del processo di conversione di un modello Transformer-XL pre-allenato -(vedi [qui](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)): - -```bash -export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint -transformers-cli convert --model_type transfo_xl \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--config TRANSFO_XL_CONFIG] \ - [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK] -``` - -## XLNet - -Ecco un esempio del processo di conversione di un modello XLNet pre-allenato: - -```bash -export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint -export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config -transformers-cli convert --model_type xlnet \ - --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \ - --config $TRANSFO_XL_CONFIG_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \ - [--finetuning_task_name XLNET_FINETUNED_TASK] \ -``` - -## XLM - -Ecco un esempio del processo di conversione di un modello XLM pre-allenato: - -```bash -export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint -transformers-cli convert --model_type xlm \ - --tf_checkpoint $XLM_CHECKPOINT_PATH \ - --pytorch_dump_output $PYTORCH_DUMP_OUTPUT - [--config XML_CONFIG] \ - [--finetuning_task_name XML_FINETUNED_TASK] -``` - -## T5 - -Ecco un esempio del processo di conversione di un modello T5 pre-allenato: - -```bash -export T5=/path/to/t5/uncased_L-12_H-768_A-12 -transformers-cli convert --model_type t5 \ - --tf_checkpoint $T5/t5_model.ckpt \ - --config $T5/t5_config.json \ - --pytorch_dump_output $T5/pytorch_model.bin -``` diff --git a/docs/source/it/create_a_model.md b/docs/source/it/create_a_model.md new file mode 100644 index 000000000000..c32040d7d389 --- /dev/null +++ b/docs/source/it/create_a_model.md @@ -0,0 +1,361 @@ + + +# Crea un'architettura personalizzata + +Una [`AutoClass`](model_doc/auto) deduce automaticamente il modello dell'architettura e scarica la configurazione e i pesi pre-allenati. Generalmente, noi consigliamo di usare un `AutoClass` per produrre un codice indipendente dal checkpoint. Ma gli utenti che desiderano un controllo maggiore su parametri specifici del modello possono creare un modello 🤗 Transformers personalizzato da poche classi base. 
Questo potrebbe essere particolarmente utile per qualunque persona sia interessata a studiare, allenare o sperimentare con un modello 🤗 Transformers. In questa guida, approfondisci la creazione di un modello personalizzato senza `AutoClass`. Impara come: + +- Caricare e personalizzare una configurazione del modello. +- Creare un'architettura modello. +- Creare un tokenizer lento e veloce per il testo. +- Creare un estrattore di caratteristiche per attività riguardanti audio o immagini. +- Creare un processore per attività multimodali. + +## Configurazione + +Una [configurazione](main_classes/configuration) si riferisce agli attributi specifici di un modello. Ogni configurazione del modello ha attributi diversi; per esempio, tutti i modelli NLP hanno in comune questi attributi: `hidden_size`, `num_attention_heads`, `num_hidden_layers` e `vocab_size`. Questi attributi specificano il numero di attention heads o strati nascosti con cui costruire un modello. + +Dai un'occhiata più da vicino a [DistilBERT](model_doc/distilbert) accedendo a [`DistilBertConfig`] per ispezionare i suoi attributi: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`] mostra tutti gli attributi predefiniti usati per costruire un [`DistilBertModel`] di base. Tutti gli attributi sono personalizzabili, creando uno spazio per sperimentare. Per esempio, puoi configurare un modello predefinito per: + +- Provare una funzione di attivazione diversa con il parametro `activation`. +- Utilizzare un tasso di dropout più elevato per le probabilità di attention con il parametro `attention_dropout`. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +Nella funzione [`~PretrainedConfig.from_pretrained`] possono essere modificati gli attributi del modello pre-allenato: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +Quando la configurazione del modello ti soddisfa, la puoi salvare con [`~PretrainedConfig.save_pretrained`].
Il file della tua configurazione è memorizzato come file JSON nella directory di salvataggio specificata: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +Per riutilizzare il file di configurazione, caricalo con [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +``` + + + +Puoi anche salvare il file di configurazione come dizionario oppure come la differenza tra gli attributi della tua configurazione personalizzata e gli attributi della configurazione predefinita! Guarda la documentazione [configuration](main_classes/configuration) per più dettagli. + + + +## Modello + +Il prossimo passo è creare un [modello](main_classes/models). Il modello - spesso indicato anche come architettura - definisce cosa ogni strato deve fare e quali operazioni vengono eseguite. Attributi come `num_hidden_layers` provenienti dalla configurazione sono usati per definire l'architettura. Ogni modello condivide la classe base [`PreTrainedModel`] e alcuni metodi comuni come il ridimensionamento degli input embeddings e la soppressione delle self-attention heads. Inoltre, tutti i modelli sono sottoclassi di [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module). Ciò significa che i modelli sono compatibili con ciascuno di questi framework. + + + +Carica gli attributi della tua configurazione personalizzata nel modello: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> model = DistilBertModel(my_config) +``` + +Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo. Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse necessarie per l'allenamento. + +Crea un modello pre-allenato con [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + +Carica gli attributi di configurazione personalizzati nel modello: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + + +Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo. Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse necessarie per l'allenamento.
+ +Crea un modello pre-allenato con [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + + +### Model head + +A questo punto, hai un modello DistilBERT base i cui output sono gli *hidden states* (in italiano stati nascosti). Gli stati nascosti sono passati come input a un model head per produrre l'output finale. 🤗 Transformers fornisce un model head diverso per ogni attività fintanto che il modello supporta l'attività (i.e., non puoi usare DistilBERT per un'attività sequence-to-sequence come la traduzione). + + + +Per esempio, [`DistilBertForSequenceClassification`] è un modello DistilBERT base con una testa di classificazione per sequenze. La head di classificazione di sequenze è uno strato lineare sopra gli output raggruppati. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head differente. Per un'attività di risposta alle domande, utilizzerai il model head [`DistilBertForQuestionAnswering`]. La head per compiti di question answering è simile alla head di classificazione di sequenze, tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese). + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + +Per esempio, [`TFDistilBertForSequenceClassification`] è un modello DistilBERT base con una head di classificazione di sequenze. La head di classificazione di sequenze è uno strato lineare sopra gli output raggruppati. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head diverso. Per un'attività di risposta alle domande, utilizzerai il model head [`TFDistilBertForQuestionAnswering`]. La head di risposta alle domande è simile alla head di classificazione di sequenze, tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese). + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## Tokenizer + +L'ultima classe base di cui hai bisogno prima di utilizzare un modello per i dati testuali è un [tokenizer](main_classes/tokenizer) per convertire il testo grezzo in tensori. Ci sono due tipi di tokenizer che puoi usare con 🤗 Transformers: + +- [`PreTrainedTokenizer`]: un'implementazione Python di un tokenizer. +- [`PreTrainedTokenizerFast`]: un tokenizer dalla nostra libreria [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) basata su Rust. Questo tipo di tokenizer è significativamente più veloce, specialmente durante la batch tokenization, grazie alla sua implementazione Rust.
Il tokenizer veloce offre anche metodi aggiuntivi come *offset mapping* che associa i token alle loro parole o caratteri originali. + +Entrambi i tokenizer supportano metodi comuni come la codifica e la decodifica, l'aggiunta di nuovi token e la gestione di token speciali. + + + +Non tutti i modelli supportano un tokenizer veloce. Dai un'occhiata a questo [tabella](index#supported-frameworks) per verificare se un modello ha il supporto per tokenizer veloce. + + + +Se hai addestrato il tuo tokenizer, puoi crearne uno dal tuo file *vocabolario*: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +È importante ricordare che il vocabolario di un tokenizer personalizzato sarà diverso dal vocabolario generato dal tokenizer di un modello preallenato. È necessario utilizzare il vocabolario di un modello preallenato se si utilizza un modello preallenato, altrimenti gli input non avranno senso. Crea un tokenizer con il vocabolario di un modello preallenato con la classe [`DistilBertTokenizer`]: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +Crea un tokenizer veloce con la classe [`DistilBertTokenizerFast`]: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +Per l'impostazione predefinita, [`AutoTokenizer`] proverà a caricare un tokenizer veloce. Puoi disabilitare questo comportamento impostando `use_fast=False` in `from_pretrained`. + + + +## Estrattore Di Feature + +Un estrattore di caratteristiche (feature in inglese) elabora input audio o immagini. Eredita dalla classe [`~feature_extraction_utils.FeatureExtractionMixin`] base e può anche ereditare dalla classe [`ImageFeatureExtractionMixin`] per l'elaborazione delle caratteristiche dell'immagine o dalla classe [`SequenceFeatureExtractor`] per l'elaborazione degli input audio. + +A seconda che tu stia lavorando a un'attività audio o visiva, crea un estrattore di caratteristiche associato al modello che stai utilizzando. Ad esempio, crea un [`ViTFeatureExtractor`] predefinito se stai usando [ViT](model_doc/vit) per la classificazione delle immagini: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> vit_extractor = ViTFeatureExtractor() +>>> print(vit_extractor) +ViTFeatureExtractor { + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +Se non stai cercando alcuna personalizzazione, usa il metodo `from_pretrained` per caricare i parametri di default dell'estrattore di caratteristiche di un modello. 
+ + + +Modifica uno qualsiasi dei parametri [`ViTFeatureExtractor`] per creare il tuo estrattore di caratteristiche personalizzato: + +```py +>>> from transformers import ViTFeatureExtractor + +>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTFeatureExtractor { + "do_normalize": false, + "do_resize": true, + "feature_extractor_type": "ViTFeatureExtractor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +Per gli input audio, puoi creare un [`Wav2Vec2FeatureExtractor`] e personalizzare i parametri in modo simile: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + +## Processore + +Per modelli che supportano attività multimodali, 🤗 Transformers offre una classe di processore che racchiude comodamente un estrattore di caratteristiche e un tokenizer in un unico oggetto. Ad esempio, utilizziamo [`Wav2Vec2Processor`] per un'attività di riconoscimento vocale automatico (ASR). ASR trascrive l'audio in testo, quindi avrai bisogno di un estrattore di caratteristiche e di un tokenizer. + +Crea un estrattore di feature per gestire gli input audio: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +Crea un tokenizer per gestire gli input di testo: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +Combinare l'estrattore di caratteristiche e il tokenizer in [`Wav2Vec2Processor`]: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +Con due classi di base - configurazione e modello - e una classe di preelaborazione aggiuntiva (tokenizer, estrattore di caratteristiche o processore), puoi creare qualsiasi modello supportato da 🤗 Transformers. Ognuna di queste classi base è configurabile, consentendoti di utilizzare gli attributi specifici che desideri. È possibile impostare facilmente un modello per l'addestramento o modificare un modello preallenato esistente per la messa a punto. \ No newline at end of file diff --git a/docs/source/it/create_a_model.mdx b/docs/source/it/create_a_model.mdx deleted file mode 100644 index 6e11f3f1d029..000000000000 --- a/docs/source/it/create_a_model.mdx +++ /dev/null @@ -1,357 +0,0 @@ - - -# Crea un'architettura personalizzata - -Una [`AutoClass`](model_doc/auto) deduce automaticamente il modello dell'architettura e scarica la configurazione e i pesi pre-allenati. Generalmente, noi consigliamo di usare un `AutoClass` per produrre un codice indipendente dal checkpoint. Ma gli utenti che desiderano un controllo maggiore su parametri specifici del modello possono creare un modello 🤗 Transformers personalizzato da poche classi base. Questo potrebbe essere particolarmente utile per qualunque persona sia interessata nel studiare, allenare o sperimentare con un modello 🤗 Transformers. 
In questa guida, approfondisci la creazione di un modello personalizzato senza `AutoClass`. Impara come: - -- Caricare e personalizzare una configurazione del modello. -- Creare un'architettura modello. -- Creare un tokenizer lento e veloce per il testo. -- Creare un estrattore di caratteristiche per attività riguardanti audio o immagini. -- Creare un processore per attività multimodali. - -## Configurazione - -Una [configurazione](main_classes/configuration) si riferisce agli attributi specifici di un modello. Ogni configurazione del modello ha attributi diversi; per esempio, tutti i modelli npl hanno questi attributi in comune `hidden_size`, `num_attention_heads`, `num_hidden_layers` e `vocab_size`. Questi attributi specificano il numero di attention heads o strati nascosti con cui costruire un modello. - -Dai un'occhiata più da vicino a [DistilBERT](model_doc/distilbert) accedendo a [`DistilBertConfig`] per ispezionare i suoi attributi: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] mostra tutti gli attributi predefiniti usati per costruire una base [`DistilBertModel`]. Tutti gli attributi sono personalizzabili, creando uno spazio per sperimentare. Per esempio, puoi configurare un modello predefinito per: - -- Provare un funzione di attivazione diversa con il parametro `activation`. -- Utilizzare tasso di drop out più elevato per le probalità di attention con il parametro `attention_dropout`. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Nella funzione [`~PretrainedConfig.from_pretrained`] possono essere modificati gli attributi del modello pre-allenato: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Quando la configurazione del modello ti soddisfa, la puoi salvare con [`~PretrainedConfig.save_pretrained`]. Il file della tua configurazione è memorizzato come file JSON nella save directory specificata: - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -Per riutilizzare la configurazione del file, caricalo con [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") -``` - - - -Puoi anche salvare il file di configurazione come dizionario oppure come la differenza tra gli attributi della tua configurazione personalizzata e gli attributi della configurazione predefinita! Guarda la documentazione [configuration](main_classes/configuration) per più dettagli. 
- - - -## Modello - -Il prossimo passo e di creare [modello](main_classes/models). Il modello - vagamente riferito anche come architettura - definisce cosa ogni strato deve fare e quali operazioni stanno succedendo. Attributi come `num_hidden_layers` provenienti dalla configurazione sono usati per definire l'architettura. Ogni modello condivide la classe base [`PreTrainedModel`] e alcuni metodi comuni come il ridimensionamento degli input embeddings e la soppressione delle self-attention heads . Inoltre, tutti i modelli sono la sottoclasse di [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module). Cio significa che i modelli sono compatibili con l'uso di ciascun di framework. - - - -Carica gli attributi della tua configurazione personalizzata nel modello: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> model = DistilBertModel(my_config) -``` - -Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo . Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse neccesarie per l'allenamento. - -Crea un modello pre-allenato con [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - -Carica gli attributi di configurazione personalizzati nel modello: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - - -Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in grado di usare questo modello per niente di utile finché non lo alleni. L'allenamento è un processo costoso e che richiede tempo . Generalmente è meglio usare un modello pre-allenato per ottenere risultati migliori velocemente, utilizzando solo una frazione delle risorse neccesarie per l'allenamento. - -Crea un modello pre-allenoto con [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") -``` - -Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricato se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) -``` - - - - -### Model head - -A questo punto, hai un modello DistilBERT base i cui output sono gli *hidden states* (in italiano stati nascosti). Gli stati nascosti sono passati come input a un model head per produrre l'output finale. 
🤗 Transformers fornisce un model head diverso per ogni attività fintanto che il modello supporta l'attività (i.e., non puoi usare DistilBERT per un attività sequence-to-sequence come la traduzione). - - - -Per esempio, [`DistilBertForSequenceClassification`] è un modello DistilBERT base con una testa di classificazione per sequenze. La sequenza di classificazione head è uno strato lineare sopra gli output ragruppati. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head differente. Per un attività di risposta alle domande, utilizzerai il model head [`DistilBertForQuestionAnswering`]. La head per compiti di question answering è simile alla classificazione di sequenza head tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese) - -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - -Per esempio, [`TFDistilBertForSequenceClassification`] è un modello DistilBERT base con classificazione di sequenza head. La classificazione di sequenza head è uno strato lineare sopra gli output raggruppati. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") -``` - -Riutilizza facilmente questo checkpoint per un altra attività passando ad un modello head diverso. Per un attività di risposta alle domande, utilizzerai il model head [`TFDistilBertForQuestionAnswering`]. Il head di risposta alle domande è simile alla sequenza di classificazione head tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese) - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") -``` - - - -## Tokenizer - -L'ultima classe base di cui hai bisogno prima di utilizzare un modello per i dati testuali è un [tokenizer](main_classes/tokenizer) per convertire il testo grezzo in tensori. Ci sono due tipi di tokenizer che puoi usare con 🤗 Transformers: - -- [`PreTrainedTokenizer`]: un'implementazione Python di un tokenizer. -- [`PreTrainedTokenizerFast`]: un tokenizer dalla nostra libreria [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) basata su Rust. Questo tipo di tokenizer è significativamente più veloce, specialmente durante la batch tokenization, grazie alla sua implementazione Rust. Il tokenizer veloce offre anche metodi aggiuntivi come *offset mapping* che associa i token alle loro parole o caratteri originali. - -Entrambi i tokenizer supportano metodi comuni come la codifica e la decodifica, l'aggiunta di nuovi token e la gestione di token speciali. - - - -Non tutti i modelli supportano un tokenizer veloce. Dai un'occhiata a questo [tabella](index#supported-frameworks) per verificare se un modello ha il supporto per tokenizer veloce. 
- - - -Se hai addestrato il tuo tokenizer, puoi crearne uno dal tuo file *vocabolario*: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -È importante ricordare che il vocabolario di un tokenizer personalizzato sarà diverso dal vocabolario generato dal tokenizer di un modello preallenato. È necessario utilizzare il vocabolario di un modello preallenato se si utilizza un modello preallenato, altrimenti gli input non avranno senso. Crea un tokenizer con il vocabolario di un modello preallenato con la classe [`DistilBertTokenizer`]: - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") -``` - -Crea un tokenizer veloce con la classe [`DistilBertTokenizerFast`]: - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") -``` - - - -Per l'impostazione predefinita, [`AutoTokenizer`] proverà a caricare un tokenizer veloce. Puoi disabilitare questo comportamento impostando `use_fast=False` in `from_pretrained`. - - - -## Estrattore Di Feature - -Un estrattore di caratteristiche (feature in inglese) elabora input audio o immagini. Eredita dalla classe [`~feature_extraction_utils.FeatureExtractionMixin`] base e può anche ereditare dalla classe [`ImageFeatureExtractionMixin`] per l'elaborazione delle caratteristiche dell'immagine o dalla classe [`SequenceFeatureExtractor`] per l'elaborazione degli input audio. - -A seconda che tu stia lavorando a un'attività audio o visiva, crea un estrattore di caratteristiche associato al modello che stai utilizzando. Ad esempio, crea un [`ViTFeatureExtractor`] predefinito se stai usando [ViT](model_doc/vit) per la classificazione delle immagini: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> vit_extractor = ViTFeatureExtractor() ->>> print(vit_extractor) -ViTFeatureExtractor { - "do_normalize": true, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -Se non stai cercando alcuna personalizzazione, usa il metodo `from_pretrained` per caricare i parametri di default dell'estrattore di caratteristiche di un modello. 
- - - -Modifica uno qualsiasi dei parametri [`ViTFeatureExtractor`] per creare il tuo estrattore di caratteristiche personalizzato: - -```py ->>> from transformers import ViTFeatureExtractor - ->>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTFeatureExtractor { - "do_normalize": false, - "do_resize": true, - "feature_extractor_type": "ViTFeatureExtractor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -Per gli input audio, puoi creare un [`Wav2Vec2FeatureExtractor`] e personalizzare i parametri in modo simile: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - -## Processore - -Per modelli che supportano attività multimodali, 🤗 Transformers offre una classe di processore che racchiude comodamente un estrattore di caratteristiche e un tokenizer in un unico oggetto. Ad esempio, utilizziamo [`Wav2Vec2Processor`] per un'attività di riconoscimento vocale automatico (ASR). ASR trascrive l'audio in testo, quindi avrai bisogno di un estrattore di caratteristiche e di un tokenizer. - -Crea un estrattore di feature per gestire gli input audio: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Crea un tokenizer per gestire gli input di testo: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Combinare l'estrattore di caratteristiche e il tokenizer in [`Wav2Vec2Processor`]: - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` - -Con due classi di base - configurazione e modello - e una classe di preelaborazione aggiuntiva (tokenizer, estrattore di caratteristiche o processore), puoi creare qualsiasi modello supportato da 🤗 Transformers. Ognuna di queste classi base è configurabile, consentendoti di utilizzare gli attributi specifici che desideri. È possibile impostare facilmente un modello per l'addestramento o modificare un modello preallenato esistente per la messa a punto. \ No newline at end of file diff --git a/docs/source/it/custom_models.md b/docs/source/it/custom_models.md new file mode 100644 index 000000000000..b0cdf4cd7bf0 --- /dev/null +++ b/docs/source/it/custom_models.md @@ -0,0 +1,359 @@ + + +# Condividere modelli personalizzati +La libreria 🤗 Transformers è studiata per essere facilmente estendibile. Il codice di ogni modello è interamente +situato in una sottocartella del repository senza alcuna astrazione, perciò puoi facilmente copiare il file di un +modello e modificarlo in base ai tuoi bisogni. + +Se stai scrivendo un nuovo modello, potrebbe essere più semplice iniziare da zero. 
In questo tutorial, ti mostreremo +come scrivere un modello personalizzato e la sua configurazione in modo che possa essere utilizzato all’interno di +Transformers, e come condividerlo con la community (assieme al relativo codice) così che tutte le persone possano usarlo, anche +se non presente nella libreria 🤗 Transformers. + +Illustriamo tutto questo su un modello ResNet, avvolgendo la classe ResNet della +[libreria timm](https://github.com/rwightman/pytorch-image-models) in un [`PreTrainedModel`]. + +## Scrivere una configurazione personalizzata +Prima di iniziare a lavorare al modello, scriviamone la configurazione. La configurazione di un modello è un oggetto +che contiene tutte le informazioni necessarie per la build del modello. Come vedremo nella prossima sezione, il +modello può soltanto essere inizializzato tramite `config`, per cui dovremo rendere tale oggetto più completo possibile. + +Nel nostro esempio, prenderemo un paio di argomenti della classe ResNet che potremmo voler modificare. +Configurazioni differenti ci daranno quindi i differenti possibili tipi di ResNet. Salveremo poi questi argomenti, +dopo averne controllato la validità. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +Le tre cose più importanti da ricordare quando scrivi le tue configurazioni sono le seguenti: +- Devi ereditare da `Pretrainedconfig`, +- Il metodo `__init__` del tuo `Pretrainedconfig` deve accettare i kwargs, +- I `kwargs` devono essere passati alla superclass `__init__` + +L’eredità è importante per assicurarsi di ottenere tutte le funzionalità della libreria 🤗 transformers, +mentre gli altri due vincoli derivano dal fatto che un `Pretrainedconfig` ha più campi di quelli che stai settando. +Quando ricarichi una config da un metodo `from_pretrained`, questi campi devono essere accettati dalla tua config e +poi inviati alla superclasse. + +Definire un `model_type` per la tua configurazione (qua `model_type = “resnet”`) non è obbligatorio, a meno che tu +non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). + +Una volta completato, puoi facilmente creare e salvare la tua configurazione come faresti con ogni altra configurazione +di modelli della libreria. Ecco come possiamo creare la config di un resnet50d e salvarlo: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +Questo salverà un file chiamato `config.json` all'interno della cartella `custom-resnet`. 
Potrai poi ricaricare la tua +config con il metodo `from_pretrained`. + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +Puoi anche usare qualunque altro metodo della classe [`PretrainedConfig`], come [`~PretrainedConfig.push_to_hub`] +per caricare direttamente la tua configurazione nell'Hub. + +## Scrivere un modello personalizzato + +Ora che abbiamo la nostra configurazione ResNet, possiamo continuare a scrivere il modello. In realtà, ne scriveremo +due: uno che estrae le features nascoste da un batch di immagini (come [`BertModel`]) e uno che è utilizzabile per +la classificazione di immagini (come [`BertForSequenceClassification`]). + +Come abbiamo menzionato in precedenza, scriveremo soltanto un wrapper del modello, per mantenerlo semplice ai fini di +questo esempio. L'unica cosa che dobbiamo fare prima di scrivere questa classe è una mappatura fra i tipi di blocco e +le vere classi dei blocchi. Successivamente il modello è definito tramite la configurazione, passando tutto quanto alla +classe `ResNet`. + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +Per il modello che classificherà le immagini, cambiamo soltanto il metodo forward: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` + +Nota come, in entrambi i casi, ereditiamo da `PreTrainedModel` e chiamiamo l'inizializzazione della superclasse +con la `config` (un po' come quando scrivi un normale `torch.nn.Module`). La riga che imposta la `config_class` +non è obbligatoria, a meno che tu non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). + + + +Se il tuo modello è molto simile a un modello all'interno della libreria, puoi ri-usare la stessa configurazione di quel modello. + + + +Puoi fare in modo che il tuo modello restituisca in output qualunque cosa tu voglia, ma far restituire un dizionario +come abbiamo fatto per `ResnetModelForImageClassification`, con la funzione di perdita inclusa quando vengono passate le labels, +renderà il tuo modello direttamente utilizzabile all'interno della classe [`Trainer`].
Utilizzare altri formati di output va bene +se hai in progetto di utilizzare un tuo loop di allenamento, o se utilizzerai un'altra libreria per l'addestramento. + +Ora che abbiamo la classe del nostro modello, creiamone uno: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +Ribadiamo, puoi usare qualunque metodo dei [`PreTrainedModel`], come [`~PreTrainedModel.save_pretrained`] o +[`~PreTrainedModel.push_to_hub`]. Utilizzeremo quest'ultimo nella prossima sezione, e vedremo come caricare i pesi del +modello assieme al codice del modello stesso. Ma prima, carichiamo alcuni pesi pre-allenati all'interno del nostro modello. + +Nel tuo caso specifico, probabilmente allenerai il tuo modello sui tuoi dati. Per velocizzare in questo tutorial, +utilizzeremo la versione pre-allenata del resnet50d. Dato che il nostro modello è soltanto un wrapper attorno a quel modello, +sarà facile trasferirne i pesi: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Vediamo adesso come assicurarci che quando facciamo [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], +il codice del modello venga salvato. + +## Inviare il codice all'Hub + + + +Questa API è sperimentale e potrebbe avere alcuni cambiamenti nei prossimi rilasci. + + + +Innanzitutto, assicurati che il tuo modello sia completamente definito in un file `.py`. Può sfruttare import relativi +ad altri file, purchè questi siano nella stessa directory (non supportiamo ancora sotto-moduli per questa funzionalità). +Per questo esempio, definiremo un file `modeling_resnet.py` e un file `configuration_resnet.py` in una cartella dell'attuale +working directory chiamata `resnet_model`. Il file configuration contiene il codice per `ResnetConfig` e il file modeling +contiene il codice di `ResnetModel` e `ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +Il file `__init__.py` può essere vuoto, serve solo perchè Python capisca che `resnet_model` può essere utilizzato come un modulo. + + + +Se stai copiando i file relativi alla modellazione della libreria, dovrai sostituire tutti gli import relativi in cima al file con import del + pacchetto `transformers`. + + + +Nota che puoi ri-utilizzare (o usare come sottoclassi) un modello/configurazione esistente. + +Per condividere il tuo modello con la community, segui questi passi: prima importa il modello ResNet e la sua configurazione +dai nuovi file creati: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +Dopodichè dovrai dire alla libreria che vuoi copiare i file con il codice di quegli oggetti quando utilizzi il metodo +`save_pretrained` e registrarli in modo corretto con una Auto classe (specialmente per i modelli). Utilizza semplicemente: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +Nota che non c'è bisogno di specificare una Auto classe per la configurazione (c'è solo una Auto classe per le configurazioni, +[`AutoConfig`], ma è diversa per i modelli). 
Il tuo modello personalizato potrebbe essere utilizzato per diverse tasks, +per cui devi specificare quale delle classi Auto è quella corretta per il tuo modello. + +Successivamente, creiamo i modelli e la config come abbiamo fatto in precedenza: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +Adesso, per inviare il modello all'Hub, assicurati di aver effettuato l'accesso. Lancia dal tuo terminale: + +```bash +huggingface-cli login +``` + +O da un notebook: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +Potrai poi inviare il tutto sul tuo profilo (o di un'organizzazione di cui fai parte) in questo modo: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +Oltre ai pesi del modello e alla configurazione in formato json, questo ha anche copiato i file `.py` modeling e +configuration all'interno della cartella `custom-resnet50d` e ha caricato i risultati sull'Hub. Puoi controllare +i risultati in questa [model repo](https://huggingface.co/sgugger/custom-resnet50d). + +Puoi controllare il tutorial di condivisione [tutorial di condivisione](model_sharing) per più informazioni sul +metodo con cui inviare all'Hub. + +## Usare un modello con codice personalizzato + +Puoi usare ogni configurazione, modello o tokenizer con file di codice personalizzati nella sua repository +con le classi Auto e il metodo `from_pretrained`. Tutti i files e il codice caricati sull'Hub sono scansionati da malware +(fai riferimento alla documentazione [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) per più informazioni), +ma dovresti comunque assicurarti dell'affidabilità del codice e dell'autore per evitare di eseguire codice dannoso sulla tua macchina. +Imposta `trust_remote_code=True` per usare un modello con codice personalizzato: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +Inoltre, raccomandiamo fortemente di passare un hash del commit come `revision` per assicurarti che le autrici o gli autori del modello +non abbiano modificato il codice con alcune nuove righe dannose (a meno che non ti fidi completamente della fonte): + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Nota che quando cerchi la storia dei commit della repo del modello sull'Hub, c'è un bottone con cui facilmente copiare il +commit hash di ciascun commit. + +## Registrare un modello con codice personalizzato nelle classi Auto + +Se stai scrivendo una libreria che estende 🤗 Transformers, potresti voler estendere le classi Auto per includere il tuo modello. +Questo è diverso dall'inviare codice nell'Hub: gli utenti dovranno importare la tua libreria per ottenere il modello personalizzato +(anzichè scaricare automaticamente il modello dall'Hub). 
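+
+Dal punto di vista di chi userà la tua libreria, il risultato sarà qualcosa di simile a questo schizzo (puramente
+ipotetico: si assume che l'import del pacchetto `resnet_model` esegua le registrazioni mostrate di seguito e che
+"percorso/del/checkpoint" contenga un modello salvato con `save_pretrained`):
+
+```py
+import resnet_model  # ipotetico: l'import registra le classi personalizzate nelle classi Auto
+
+from transformers import AutoModelForImageClassification
+
+model = AutoModelForImageClassification.from_pretrained("percorso/del/checkpoint")
+```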
+ +Finchè il tuo file di configurazione ha un attributo `model_type` diverso dai model types esistenti, e finchè le tue +classi modello hanno i corretti attributi `config_class`, potrai semplicemente aggiungerli alle classi Auto come segue: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +Nota che il primo argomento utilizzato quando registri la configurazione di un modello personalizzato con [`AutoConfig`] +deve corrispondere al `model_type` della tua configurazione personalizzata, ed il primo argomento utilizzato quando +registri i tuoi modelli personalizzati in una qualunque classe Auto del modello deve corrispondere alla `config_class` +di quei modelli. diff --git a/docs/source/it/custom_models.mdx b/docs/source/it/custom_models.mdx deleted file mode 100644 index b4b0302e29e3..000000000000 --- a/docs/source/it/custom_models.mdx +++ /dev/null @@ -1,355 +0,0 @@ - - -# Condividere modelli personalizzati -La libreria 🤗 Transformers è studiata per essere facilmente estendibile. Il codice di ogni modello è interamente -situato in una sottocartella del repository senza alcuna astrazione, perciò puoi facilmente copiare il file di un -modello e modificarlo in base ai tuoi bisogni. - -Se stai scrivendo un nuovo modello, potrebbe essere più semplice iniziare da zero. In questo tutorial, ti mostreremo -come scrivere un modello personalizzato e la sua configurazione in modo che possa essere utilizzato all’interno di -Transformers, e come condividerlo con la community (assieme al relativo codice) così che tutte le persone possano usarlo, anche -se non presente nella libreria 🤗 Transformers. - -Illustriamo tutto questo su un modello ResNet, avvolgendo la classe ResNet della -[libreria timm](https://github.com/rwightman/pytorch-image-models) in un [`PreTrainedModel`]. - -## Scrivere una configurazione personalizzata -Prima di iniziare a lavorare al modello, scriviamone la configurazione. La configurazione di un modello è un oggetto -che contiene tutte le informazioni necessarie per la build del modello. Come vedremo nella prossima sezione, il -modello può soltanto essere inizializzato tramite `config`, per cui dovremo rendere tale oggetto più completo possibile. - -Nel nostro esempio, prenderemo un paio di argomenti della classe ResNet che potremmo voler modificare. -Configurazioni differenti ci daranno quindi i differenti possibili tipi di ResNet. Salveremo poi questi argomenti, -dopo averne controllato la validità. 
- -```python -from transformers import PretrainedConfig -from typing import List - - -class ResnetConfig(PretrainedConfig): - model_type = "resnet" - - def __init__( - self, - block_type="bottleneck", - layers: List[int] = [3, 4, 6, 3], - num_classes: int = 1000, - input_channels: int = 3, - cardinality: int = 1, - base_width: int = 64, - stem_width: int = 64, - stem_type: str = "", - avg_down: bool = False, - **kwargs, - ): - if block_type not in ["basic", "bottleneck"]: - raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") - if stem_type not in ["", "deep", "deep-tiered"]: - raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") - - self.block_type = block_type - self.layers = layers - self.num_classes = num_classes - self.input_channels = input_channels - self.cardinality = cardinality - self.base_width = base_width - self.stem_width = stem_width - self.stem_type = stem_type - self.avg_down = avg_down - super().__init__(**kwargs) -``` - -Le tre cose più importanti da ricordare quando scrivi le tue configurazioni sono le seguenti: -- Devi ereditare da `Pretrainedconfig`, -- Il metodo `__init__` del tuo `Pretrainedconfig` deve accettare i kwargs, -- I `kwargs` devono essere passati alla superclass `__init__` - -L’eredità è importante per assicurarsi di ottenere tutte le funzionalità della libreria 🤗 transformers, -mentre gli altri due vincoli derivano dal fatto che un `Pretrainedconfig` ha più campi di quelli che stai settando. -Quando ricarichi una config da un metodo `from_pretrained`, questi campi devono essere accettati dalla tua config e -poi inviati alla superclasse. - -Definire un `model_type` per la tua configurazione (qua `model_type = “resnet”`) non è obbligatorio, a meno che tu -non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). - -Una volta completato, puoi facilmente creare e salvare la tua configurazione come faresti con ogni altra configurazione -di modelli della libreria. Ecco come possiamo creare la config di un resnet50d e salvarlo: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d_config.save_pretrained("custom-resnet") -``` - -Questo salverà un file chiamato `config.json` all'interno della cartella `custom-resnet`. Potrai poi ricaricare la tua -config con il metodo `from_pretrained`. - -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` - -Puoi anche usare qualunque altro metodo della classe [`PretrainedConfig`], come [`~PretrainedConfig.push_to_hub`] -per caricare direttamente la tua configurazione nell'hub. - -## Scrivere un modello personalizzato - -Ora che abbiamo la nostra configurazione ResNet, possiamo continuare a scrivere il modello. In realtà, ne scriveremo -due: uno che estrae le features nascoste da una batch di immagini (come [`BertModel`]) e uno che è utilizzabile per -la classificazione di immagini (come [`BertModelForSequenceClassification`]). - -Come abbiamo menzionato in precedenza, scriveremo soltanto un wrapper del modello, per mantenerlo semplice ai fini di -questo esempio. L'unica cosa che dobbiamo fare prima di scrivere questa classe è una mappatura fra i tipi di blocco e -le vere classi dei blocchi. Successivamente il modello è definito tramite la configurazione, passando tutto quanto alla -classe `ResNet`. 
- -```py -from transformers import PreTrainedModel -from timm.models.resnet import BasicBlock, Bottleneck, ResNet -from .configuration_resnet import ResnetConfig - - -BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - - -class ResnetModel(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor): - return self.model.forward_features(tensor) -``` - -Per il modello che classificherà le immagini, cambiamo soltanto il metodo forward: - -```py -import torch - - -class ResnetModelForImageClassification(PreTrainedModel): - config_class = ResnetConfig - - def __init__(self, config): - super().__init__(config) - block_layer = BLOCK_MAPPING[config.block_type] - self.model = ResNet( - block_layer, - config.layers, - num_classes=config.num_classes, - in_chans=config.input_channels, - cardinality=config.cardinality, - base_width=config.base_width, - stem_width=config.stem_width, - stem_type=config.stem_type, - avg_down=config.avg_down, - ) - - def forward(self, tensor, labels=None): - logits = self.model(tensor) - if labels is not None: - loss = torch.nn.cross_entropy(logits, labels) - return {"loss": loss, "logits": logits} - return {"logits": logits} -``` - -Nota come, in entrambi i casi, ereditiamo da `PreTrainedModel` e chiamiamo l'inizializzazione della superclasse -con il metodo `config` (un po' come quando scrivi un normale `torch.nn.Module`). La riga che imposta la `config_class` -non è obbligatoria, a meno che tu non voglia registrare il modello con le classi Auto (vedi l'ultima sezione). - - - -Se il tuo modello è molto simile a un modello all'interno della libreria, puoi ri-usare la stessa configurazione di quel modello. - - - -Puoi fare in modo che il tuo modello restituisca in output qualunque cosa tu voglia, ma far restituire un dizionario -come abbiamo fatto per `ResnetModelForImageClassification`, con la funzione di perdita inclusa quando vengono passate le labels, -renderà il tuo modello direttamente utilizzabile all'interno della classe [`Trainer`]. Utilizzare altri formati di output va bene -se hai in progetto di utilizzare un tuo loop di allenamento, o se utilizzerai un'altra libreria per l'addestramento. - -Ora che abbiamo la classe del nostro modello, creiamone uno: - -```py -resnet50d = ResnetModelForImageClassification(resnet50d_config) -``` - -Ribadiamo, puoi usare qualunque metodo dei [`PreTrainedModel`], come [`~PreTrainedModel.save_pretrained`] o -[`~PreTrainedModel.push_to_hub`]. Utilizzeremo quest'ultimo nella prossima sezione, e vedremo come caricare i pesi del -modello assieme al codice del modello stesso. Ma prima, carichiamo alcuni pesi pre-allenati all'interno del nostro modello. - -Nel tuo caso specifico, probabilmente allenerai il tuo modello sui tuoi dati. Per velocizzare in questo tutorial, -utilizzeremo la versione pre-allenata del resnet50d. 
Dato che il nostro modello è soltanto un wrapper attorno a quel modello, -sarà facile trasferirne i pesi: - -```py -import timm - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Vediamo adesso come assicurarci che quando facciamo [`~PreTrainedModel.save_pretrained`] o [`~PreTrainedModel.push_to_hub`], -il codice del modello venga salvato. - -## Inviare il codice all'Hub - - - -Questa API è sperimentale e potrebbe avere alcuni cambiamenti nei prossimi rilasci. - - - -Innanzitutto, assicurati che il tuo modello sia completamente definito in un file `.py`. Può sfruttare import relativi -ad altri file, purchè questi siano nella stessa directory (non supportiamo ancora sotto-moduli per questa funzionalità). -Per questo esempio, definiremo un file `modeling_resnet.py` e un file `configuration_resnet.py` in una cartella dell'attuale -working directory chiamata `resnet_model`. Il file configuration contiene il codice per `ResnetConfig` e il file modeling -contiene il codice di `ResnetModel` e `ResnetModelForImageClassification`. - -``` -. -└── resnet_model - ├── __init__.py - ├── configuration_resnet.py - └── modeling_resnet.py -``` - -Il file `__init__.py` può essere vuoto, serve solo perchè Python capisca che `resnet_model` può essere utilizzato come un modulo. - - - -Se stai copiando i file relativi alla modellazione della libreria, dovrai sostituire tutti gli import relativi in cima al file con import del - pacchetto `transformers`. - - - -Nota che puoi ri-utilizzare (o usare come sottoclassi) un modello/configurazione esistente. - -Per condividere il tuo modello con la community, segui questi passi: prima importa il modello ResNet e la sua configurazione -dai nuovi file creati: - -```py -from resnet_model.configuration_resnet import ResnetConfig -from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification -``` - -Dopodichè dovrai dire alla libreria che vuoi copiare i file con il codice di quegli oggetti quando utilizzi il metodo -`save_pretrained` e registrarli in modo corretto con una Auto classe (specialmente per i modelli). Utilizza semplicemente: - -```py -ResnetConfig.register_for_auto_class() -ResnetModel.register_for_auto_class("AutoModel") -ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") -``` - -Nota che non c'è bisogno di specificare una Auto classe per la configurazione (c'è solo una Auto classe per le configurazioni, -[`AutoConfig`], ma è diversa per i modelli). Il tuo modello personalizato potrebbe essere utilizzato per diverse tasks, -per cui devi specificare quale delle classi Auto è quella corretta per il tuo modello. - -Successivamente, creiamo i modelli e la config come abbiamo fatto in precedenza: - -```py -resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) -resnet50d = ResnetModelForImageClassification(resnet50d_config) - -pretrained_model = timm.create_model("resnet50d", pretrained=True) -resnet50d.model.load_state_dict(pretrained_model.state_dict()) -``` - -Adesso, per inviare il modello all'Hub, assicurati di aver effettuato l'accesso. 
Lancia dal tuo terminale: - -```bash -huggingface-cli login -``` - -O da un notebook: - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -Potrai poi inviare il tutto sul tuo profilo (o di un'organizzazione di cui fai parte) in questo modo: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -Oltre ai pesi del modello e alla configurazione in formato json, questo ha anche copiato i file `.py` modeling e -configuration all'interno della cartella `custom-resnet50d` e ha caricato i risultati sull'Hub. Puoi controllare -i risultati in questa [model repo](https://huggingface.co/sgugger/custom-resnet50d). - -Puoi controllare il tutorial di condivisione [tutorial di condivisione](model_sharing) per più informazioni sul -metodo con cui inviare all'Hub. - -## Usare un modello con codice personalizzato - -Puoi usare ogni configurazione, modello o tokenizer con file di codice personalizzati nella sua repository -con le classi Auto e il metodo `from_pretrained`. Tutti i files e il codice caricati sull'Hub sono scansionati da malware -(fai riferimento alla documentazione [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) per più informazioni), -ma dovresti comunque assicurarti dell'affidabilità del codice e dell'autore per evitare di eseguire codice dannoso sulla tua macchina. -Imposta `trust_remote_code=True` per usare un modello con codice personalizzato: - -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -Inoltre, raccomandiamo fortemente di passare un hash del commit come `revision` per assicurarti che le autrici o gli autori del modello -non abbiano modificato il codice con alcune nuove righe dannose (a meno che non ti fidi completamente della fonte): - -```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) -``` - -Nota che quando cerchi la storia dei commit della repo del modello sull'Hub, c'è un bottone con cui facilmente copiare il -commit hash di ciascun commit. - -## Registrare un modello con codice personalizzato nelle classi Auto - -Se stai scrivendo una libreria che estende 🤗 Transformers, potresti voler estendere le classi Auto per includere il tuo modello. -Questo è diverso dall'inviare codice nell'Hub: gli utenti dovranno importare la tua libreria per ottenere il modello personalizzato -(anzichè scaricare automaticamente il modello dall'Hub). - -Finchè il tuo file di configurazione ha un attributo `model_type` diverso dai model types esistenti, e finchè le tue -classi modello hanno i corretti attributi `config_class`, potrai semplicemente aggiungerli alle classi Auto come segue: - -```py -from transformers import AutoConfig, AutoModel, AutoModelForImageClassification - -AutoConfig.register("resnet", ResnetConfig) -AutoModel.register(ResnetConfig, ResnetModel) -AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) -``` - -Nota che il primo argomento utilizzato quando registri la configurazione di un modello personalizzato con [`AutoConfig`] -deve corrispondere al `model_type` della tua configurazione personalizzata, ed il primo argomento utilizzato quando -registri i tuoi modelli personalizzati in una qualunque classe Auto del modello deve corrispondere alla `config_class` -di quei modelli. 
diff --git a/docs/source/it/debugging.md b/docs/source/it/debugging.md new file mode 100644 index 000000000000..5c1dab51bd11 --- /dev/null +++ b/docs/source/it/debugging.md @@ -0,0 +1,318 @@ + + +# Debugging + +## Debug dei problemi di rete multi-GPU + +Quando addestri o fai inferenza con `DistributedDataParallel` e GPU multiple, se si verificano problemi di intercomunicazione tra processi e/o nodi, puoi utilizzare il seguente script per diagnosticare i problemi della rete. + +```bash +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py +``` + +Per esempio per testare come 2 GPU interagiscono fai: + +```bash +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +Se entrambi i processi sono in grado di comunicare tra loro e di allocare la memoria della GPU, ciascuno di essi stamperà lo stato OK. + +Per più GPU o nodi adatta gli argumenti nello script. + +All'interno dello script di diagnostica troverai molti altri dettagli e anche una guida per eseguirlo in ambiente SLURM. + +Un livello di debug superiore è aggiungere la variabile d'ambiente `NCCL_DEBUG=INFO` come di seguito: + +```bash +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +In questo modo si scaricano molte informazioni di debug relative a NCCL, che puoi cercare online in caso di problemi. Oppure, se non hai la sicurezza di come interpretare l'output, puoi condividere il file di log in una Issue. + +## Rilevamento di Underflow e Overflow + + + +Questa funzionalità al momento è disponibile solo per PyTorch. + + + + + +Per addestramento multi-GPU richiede DDP (`torch.distributed.launch`). + + + + + +Questa funzionalità può essere usata con modelli basati su `nn.Module`. + + + +Se inizi a ottenere `loss=NaN` o il modello presenta qualche altro comportamento anomalo a causa di valori `inf` o `nan` in +attivazioni o nei pesi, è necessario scoprire dove si verifica il primo underflow o overflow e cosa lo ha determinato. Fortunatamente +è possibile farlo facilmente attivando un modulo speciale che effettuerà il rilevamento automaticamente. + +Se stai usando [`Trainer`], hai bisogno di aggiungere solo: + +```bash +--debug underflow_overflow +``` + +ai normali argomenti della riga di comando, o passa `debug="underflow_overflow"` quando viene creato l'oggetto +[`TrainingArguments`]. + +Se stai usando il tuo ciclo di allenamento o un altro trainer, puoi ottenere lo stesso risultato con: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model) +``` + +[`~debug_utils.DebugUnderflowOverflow`] inserisce dei ganci nel modello che dopo ogni chiamata +testeranno le variabili di ingresso e di uscita e anche i pesi del modulo corrispondente. Non appena viene rilevato `inf` o +o `nan` in almeno un elemento delle attivazioni o dei pesi, il programma lo notifica e stampa un rapporto come il seguente (questo è stato rilevato con `google/mt5-small` sotto fp16 mixed precision): + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout +0.00e+00 2.57e+02 input[0] +0.00e+00 2.85e+02 output +[...] 
+ encoder.block.2.layer.0 T5LayerSelfAttention +6.78e-04 3.15e+03 input[0] +2.65e-04 3.42e+03 output[0] + None output[1] +2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout +0.00e+00 8.76e+03 input[0] +0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +L'output di esempio è stato tagliato al centro per brevità. + +La seconda colonna mostra il valore dell'elemento più grande in assoluto,così se osserviamo da vicino gli ultimi istanti, +input e output sono nel range di `1e4`. Questo addestramento è stato eseguito con una mixed precision fp16 e l'ultimo passo usciva fuori (sotto `fp16` il valore più grande prima di `inf` è `64e3`). Per evitare overflows sotto `fp16` le attivazionioni devono rimanere molto al di sotto di `1e4`, perché `1e4 * 1e4 = 1e8` quindi qualsiasi moltiplicazione di matrice con grandi attivazioni porterà a una condizione di overflow numerico. + +All'inizio della traccia è possibile scoprire a quale lotto si è verificato il problema (questo `Detected inf/nan during batch_number=0` significa che il problema si è verificato nel primo lotto). + +Ogni frame segnalato inizia dichiarando la voce completamente qualificata per il modulo corrispondente per il quale il frame è stato segnalato. +Se osserviamo il seguente frame: + +``` + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output +``` + +Questo, `encoder.block.2.layer.1.layer_norm` indica che si tratta di un layer norm nel primo layer, del secondo blocco dell'encoder. E le chiamata specifica di `forward` è `T5LayerNorm`. + +Osserviamo gli ultimi frame del report: + +``` +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata +[...] + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +``` + +L'ultimo frame report per la funzione `Dropout.forward` con la prima voce per l'unico input e la seconda per l'unico output. Si può notare che è stato richiamato da un attibuto `dropout` dentro la classe `DenseReluDense`. Si può notare che ciò è avvenuto durante il primo strato, del 2° blocco, durante il primissimo lotto. Infine, gli elementi di input più grandi in assoluto sono stati `6.27e+04` e l'equivalente per l'output era `inf`. 
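+
+Per fissare le idee, ecco uno schizzo minimale (i valori numerici sono un'assunzione presa dal report qui sopra) di come un'attivazione intorno a `6.27e+04` vada in overflow in `fp16` non appena viene riscalata di `1/(1-p)`, come fa `Dropout` con gli elementi non azzerati:
+
+```python
+import torch
+
+# Il valore finito più grande rappresentabile in fp16 è 65504: un'attivazione di ~6.27e4
+# riscalata di 1/(1-0.1) ≈ 1.11 lo supera e diventa inf.
+x = torch.tensor([6.27e4], dtype=torch.float16)
+print(torch.finfo(torch.float16).max)  # 65504.0
+print(x * (1 / (1 - 0.1)))  # tensor([inf], dtype=torch.float16)
+```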
+ +Puoi vedere qui, che `T5DenseGatedGeluDense.forward` risulta in output activations, il cui valore massimo assoluto era circa 62,7K, che è molto vicino al limite massimo di 64K di fp16. Nel prossimo frame abbiamo `Dropout` che rinormalizza i pesi, dopo aver azzerato alcuni elementi, il che spinge il valore massimo assoluto a più di 64K e si verifica un overflow.(`inf`). + +Come puoi notare, è nei frames precedenti che occorre esaminare quando i numeri iniziano a diventare molto grandi per i valori fp16. + +Confrontiamo il report al codice `models/t5/modeling_t5.py`: + +```python +class T5DenseGatedGeluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states +``` + +Ora è facile vedere la chiamata `dropout`, e tutte le chiamate precedenti. + +Poiché il rilevamento avviene in un avanzamento (forward hook in eng.), i rapporti vengono creati immeditamente dopo ogni rientro da `forward` (forward returns in eng.). + +Tornando al rapporto completo, per agire e risolvere il problema, dobbiamo andare qualche frame più in alto, dove i numeri hanno iniziato a salire, e probabilmente passare alla modalità `fp32`, in modo che i numeri non trabocchino quando vengono moltiplicati o sommati. Naturalmente, potrebbero esserci altre soluzioni. Per esempio, potremmo spegnere temporanemante `amp` se è abilitato, successivamente spostare `forward` in un helper wrapper, come: + +```python +def _forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +import torch + + +def forward(self, hidden_states): + if torch.is_autocast_enabled(): + with torch.cuda.amp.autocast(enabled=False): + return self._forward(hidden_states) + else: + return self._forward(hidden_states) +``` + +Poiché il rilevatore automatico riporta solo gli ingressi e le uscite di fotogrammi completi, una volta che si sa dove cercare, si può +analizzare anche le fasi intermedie di una specifica funzione `forward`. In alcuni casi puoi usare la funzione di supporto `detect_overflow` per indirizzare il rilevatore dove preferisci, ad esempio: + +```python +from debug_utils import detect_overflow + + +class T5LayerFF(nn.Module): + [...] + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) +``` + +Si può vedere che abbiamo aggiunto 2 di questi e ora teniamo traccia se `inf` o `nan` per `forwarded_states` è stato rilevato +da qualche parte. 
+ +In realtà, il rilevatore li riporta già, perché ciascuna delle chiamate nell'esempio precedente è un `nn.Module`, ma +diciamo che se avessimo dei calcoli diretti locali, questo è il modo in cui lo faremmo. + +Inoltre, se si istanzia il debugger nel proprio codice, è possibile modificare il numero di fotogrammi stampati rispetto a +predefinito, ad esempio.: + +```python +from .debug_utils import DebugUnderflowOverflow + +debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) +``` + +### Tracciamento della mistura assoluta del lotto specifico e del valore massimo + +La stessa classe di debug può essere utilizzata per il tracciamento per-batch con la funzione di rilevamento di underflow/overflow disattivata. + +Supponiamo di voler osservare i valori minimi e massimi assoluti per tutti gli ingredienti di ogni chiamata `forward` di un dato lotto. +lotto, e che lo si voglia fare solo per i lotti 1 e 3. Si istanzia questa classe come: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) +``` + +Ora i batch completi 1 e 3 saranno tracciati utilizzando lo stesso formato del rilevatore di underflow/overflow. + +I batches sono 0-indexed. + +Questo è utile se si sa che il programma inizia a comportarsi male dopo un certo numero di batch, in modo da poter avanzare velocemente fino a quell'area. +direttamente a quell'area. Ecco un esempio di output troncato per questa configurazione: + +``` + *** Starting batch number=1 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.47e+04 input[0] +5.36e-05 7.92e+02 output +[...] + decoder.dropout Dropout +1.60e-07 2.27e+01 input[0] +0.00e+00 2.52e+01 output + decoder T5Stack + not a tensor output + lm_head Linear +1.01e-06 7.92e+02 weight +0.00e+00 1.11e+00 input[0] +6.06e-02 8.39e+01 output + T5ForConditionalGeneration + not a tensor output + + *** Starting batch number=3 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.78e+04 input[0] +5.36e-05 7.92e+02 output +[...] +``` + +Qui verrà scaricato un numero enorme di fotogrammi, tanti quanti sono le chiamate in avanti nel modello, quindi può essere o non essere quello che volete, ma a volte può essere più utile usarlo di un classico debugger. Per esempio, se il problema inizia a verificarsi a partire dal lotto numero 150. Quindi è possibile scaricare le tracce dei lotti 149 e 150 e confrontare i punti in cui i numeri hanno iniziato a divergere. + +È inoltre possibile specificare il numero di batch dopo il quale interrompere l'addestramento, con: + +```python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) +``` diff --git a/docs/source/it/debugging.mdx b/docs/source/it/debugging.mdx deleted file mode 100644 index 5b392489eab9..000000000000 --- a/docs/source/it/debugging.mdx +++ /dev/null @@ -1,314 +0,0 @@ - - -# Debugging - -## Debug dei problemi di rete multi-GPU - -Quando addestri o fai inferenza con `DistributedDataParallel` e GPU multiple, se si verificano problemi di intercomunicazione tra processi e/o nodi, puoi utilizzare il seguente script per diagnosticare i problemi della rete. 
- -```bash -wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py -``` - -Per esempio per testare come 2 GPU interagiscono fai: - -```bash -python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` - -Se entrambi i processi sono in grado di comunicare tra loro e di allocare la memoria della GPU, ciascuno di essi stamperà lo stato OK. - -Per più GPU o nodi adatta gli argumenti nello script. - -All'interno dello script di diagnostica troverai molti altri dettagli e anche una guida per eseguirlo in ambiente SLURM. - -Un livello di debug superiore è aggiungere la variabile d'ambiente `NCCL_DEBUG=INFO` come di seguito: - -```bash -NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py -``` - -In questo modo si scaricano molte informazioni di debug relative a NCCL, che puoi cercare online in caso di problemi. Oppure, se non hai la sicurezza di come interpretare l'output, puoi condividere il file di log in una Issue. - -## Rilevamento di Underflow e Overflow - - - -Questa funzionalità al momento è disponibile solo per PyTorch. - - - - - -Per addestramento multi-GPU richiede DDP (`torch.distributed.launch`). - - - - - -Questa funzionalità può essere usata con modelli basati su `nn.Module`. - - - -Se inizi a ottenere `loss=NaN` o il modello presenta qualche altro comportamento anomalo a causa di valori `inf` o `nan` in -attivazioni o nei pesi, è necessario scoprire dove si verifica il primo underflow o overflow e cosa lo ha determinato. Fortunatamente -è possibile farlo facilmente attivando un modulo speciale che effettuerà il rilevamento automaticamente. - -Se stai usando [`Trainer`], hai bisogno di aggiungere solo: - -```bash ---debug underflow_overflow -``` - -ai normali argomenti della riga di comando, o passa `debug="underflow_overflow"` quando viene creato l'oggetto -[`TrainingArguments`]. - -Se stai usando il tuo ciclo di allenamento o un altro trainer, puoi ottenere lo stesso risultato con: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model) -``` - -[`~debug_utils.DebugUnderflowOverflow`] inserisce dei ganci nel modello che dopo ogni chiamata -testeranno le variabili di ingresso e di uscita e anche i pesi del modulo corrispondente. Non appena viene rilevato `inf` o -o `nan` in almeno un elemento delle attivazioni o dei pesi, il programma lo notifica e stampa un rapporto come il seguente (questo è stato rilevato con `google/mt5-small` sotto fp16 mixed precision): - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata - encoder.block.1.layer.1.DenseReluDense.dropout Dropout -0.00e+00 2.57e+02 input[0] -0.00e+00 2.85e+02 output -[...] 
- encoder.block.2.layer.0 T5LayerSelfAttention -6.78e-04 3.15e+03 input[0] -2.65e-04 3.42e+03 output[0] - None output[1] -2.25e-01 1.00e+04 output[2] - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.dropout Dropout -0.00e+00 8.76e+03 input[0] -0.00e+00 9.74e+03 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -L'output di esempio è stato tagliato al centro per brevità. - -La seconda colonna mostra il valore dell'elemento più grande in assoluto,così se osserviamo da vicino gli ultimi istanti, -input e output sono nel range di `1e4`. Questo addestramento è stato eseguito con una mixed precision fp16 e l'ultimo passo usciva fuori (sotto `fp16` il valore più grande prima di `inf` è `64e3`). Per evitare overflows sotto `fp16` le attivazionioni devono rimanere molto al di sotto di `1e4`, perché `1e4 * 1e4 = 1e8` quindi qualsiasi moltiplicazione di matrice con grandi attivazioni porterà a una condizione di overflow numerico. - -All'inizio della traccia è possibile scoprire a quale lotto si è verificato il problema (questo `Detected inf/nan during batch_number=0` significa che il problema si è verificato nel primo lotto). - -Ogni frame segnalato inizia dichiarando la voce completamente qualificata per il modulo corrispondente per il quale il frame è stato segnalato. -Se osserviamo il seguente frame: - -``` - encoder.block.2.layer.1.layer_norm T5LayerNorm -8.69e-02 4.18e-01 weight -2.65e-04 3.42e+03 input[0] -1.79e-06 4.65e+00 output -``` - -Questo, `encoder.block.2.layer.1.layer_norm` indica che si tratta di un layer norm nel primo layer, del secondo blocco dell'encoder. E le chiamata specifica di `forward` è `T5LayerNorm`. - -Osserviamo gli ultimi frame del report: - -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata -[...] - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense -1.79e-06 4.65e+00 input[0] -3.18e-04 6.27e+04 output - encoder.block.2.layer.1.dropout Dropout -3.18e-04 6.27e+04 input[0] -0.00e+00 inf output -``` - -L'ultimo frame report per la funzione `Dropout.forward` con la prima voce per l'unico input e la seconda per l'unico output. Si può notare che è stato richiamato da un attibuto `dropout` dentro la classe `DenseReluDense`. Si può notare che ciò è avvenuto durante il primo strato, del 2° blocco, durante il primissimo lotto. Infine, gli elementi di input più grandi in assoluto sono stati `6.27e+04` e l'equivalente per l'output era `inf`. 
- -Puoi vedere qui, che `T5DenseGatedGeluDense.forward` risulta in output activations, il cui valore massimo assoluto era circa 62,7K, che è molto vicino al limite massimo di 64K di fp16. Nel prossimo frame abbiamo `Dropout` che rinormalizza i pesi, dopo aver azzerato alcuni elementi, il che spinge il valore massimo assoluto a più di 64K e si verifica un overflow.(`inf`). - -Come puoi notare, è nei frames precedenti che occorre esaminare quando i numeri iniziano a diventare molto grandi per i valori fp16. - -Confrontiamo il report al codice `models/t5/modeling_t5.py`: - -```python -class T5DenseGatedGeluDense(nn.Module): - def __init__(self, config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.gelu_act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states -``` - -Ora è facile vedere la chiamata `dropout`, e tutte le chiamate precedenti. - -Poiché il rilevamento avviene in un avanzamento (forward hook in eng.), i rapporti vengono creati immeditamente dopo ogni rientro da `forward` (forward returns in eng.). - -Tornando al rapporto completo, per agire e risolvere il problema, dobbiamo andare qualche frame più in alto, dove i numeri hanno iniziato a salire, e probabilmente passare alla modalità `fp32`, in modo che i numeri non trabocchino quando vengono moltiplicati o sommati. Naturalmente, potrebbero esserci altre soluzioni. Per esempio, potremmo spegnere temporanemante `amp` se è abilitato, successivamente spostare `forward` in un helper wrapper, come: - -```python -def _forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - - -import torch - - -def forward(self, hidden_states): - if torch.is_autocast_enabled(): - with torch.cuda.amp.autocast(enabled=False): - return self._forward(hidden_states) - else: - return self._forward(hidden_states) -``` - -Poiché il rilevatore automatico riporta solo gli ingressi e le uscite di fotogrammi completi, una volta che si sa dove cercare, si può -analizzare anche le fasi intermedie di una specifica funzione `forward`. In alcuni casi puoi usare la funzione di supporto `detect_overflow` per indirizzare il rilevatore dove preferisci, ad esempio: - -```python -from debug_utils import detect_overflow - - -class T5LayerFF(nn.Module): - [...] - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - detect_overflow(forwarded_states, "after layer_norm") - forwarded_states = self.DenseReluDense(forwarded_states) - detect_overflow(forwarded_states, "after DenseReluDense") - return hidden_states + self.dropout(forwarded_states) -``` - -Si può vedere che abbiamo aggiunto 2 di questi e ora teniamo traccia se `inf` o `nan` per `forwarded_states` è stato rilevato -da qualche parte. 
- -In realtà, il rilevatore li riporta già, perché ciascuna delle chiamate nell'esempio precedente è un `nn.Module`, ma -diciamo che se avessimo dei calcoli diretti locali, questo è il modo in cui lo faremmo. - -Inoltre, se si istanzia il debugger nel proprio codice, è possibile modificare il numero di fotogrammi stampati rispetto a -predefinito, ad esempio.: - -```python -from .debug_utils import DebugUnderflowOverflow - -debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) -``` - -### Tracciamento della mistura assoluta del lotto specifico e del valore massimo - -La stessa classe di debug può essere utilizzata per il tracciamento per-batch con la funzione di rilevamento di underflow/overflow disattivata. - -Supponiamo di voler osservare i valori minimi e massimi assoluti per tutti gli ingredienti di ogni chiamata `forward` di un dato lotto. -lotto, e che lo si voglia fare solo per i lotti 1 e 3. Si istanzia questa classe come: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) -``` - -Ora i batch completi 1 e 3 saranno tracciati utilizzando lo stesso formato del rilevatore di underflow/overflow. - -I batches sono 0-indexed. - -Questo è utile se si sa che il programma inizia a comportarsi male dopo un certo numero di batch, in modo da poter avanzare velocemente fino a quell'area. -direttamente a quell'area. Ecco un esempio di output troncato per questa configurazione: - -``` - *** Starting batch number=1 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.47e+04 input[0] -5.36e-05 7.92e+02 output -[...] - decoder.dropout Dropout -1.60e-07 2.27e+01 input[0] -0.00e+00 2.52e+01 output - decoder T5Stack - not a tensor output - lm_head Linear -1.01e-06 7.92e+02 weight -0.00e+00 1.11e+00 input[0] -6.06e-02 8.39e+01 output - T5ForConditionalGeneration - not a tensor output - - *** Starting batch number=3 *** -abs min abs max metadata - shared Embedding -1.01e-06 7.92e+02 weight -0.00e+00 2.78e+04 input[0] -5.36e-05 7.92e+02 output -[...] -``` - -Qui verrà scaricato un numero enorme di fotogrammi, tanti quanti sono le chiamate in avanti nel modello, quindi può essere o non essere quello che volete, ma a volte può essere più utile usarlo di un classico debugger. Per esempio, se il problema inizia a verificarsi a partire dal lotto numero 150. Quindi è possibile scaricare le tracce dei lotti 149 e 150 e confrontare i punti in cui i numeri hanno iniziato a divergere. - -È inoltre possibile specificare il numero di batch dopo il quale interrompere l'addestramento, con: - -```python -debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) -``` diff --git a/docs/source/it/index.md b/docs/source/it/index.md new file mode 100644 index 000000000000..5c7d22c1e6b1 --- /dev/null +++ b/docs/source/it/index.md @@ -0,0 +1,300 @@ + + +# 🤗 Transformers + +Machine Learning allo stato dell'arte per PyTorch, TensorFlow e JAX. + +🤗 Transformers fornisce delle API per scaricare in modo semplice e allenare modelli pre-allenati allo stato dell'arte. L'utilizzo di modelli pre-allenati può ridurre i tuoi costi computazionali, l'impatto ambientale, e farti risparmiare il tempo che utilizzeresti per allenare un modello da zero. I modelli possono essere utilizzati in diverse modalità come ad esempio: + +* 📝 Testo: classificazione del testo, estrazione delle informazioni, rispondere a domande, riassumere, traduzione e generazione del testo in più di 100 lingue. 
+* 🖼️ Immagini: classificazione di immagini, rilevazione di oggetti e segmentazione. +* 🗣️ Audio: riconoscimento vocale e classificazione dell'audio. +* 🐙 Multimodale: rispondere a domande inerenti dati tabulari, riconoscimento ottico dei caratteri, estrazione di informazioni a partire da documenti scannerizzati, classificazione di video e risposta visuale a domande. + +La nostra libreria supporta un'integrazione perfetta tra tre delle librerie per il deep learning più popolari: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) e [JAX](https://jax.readthedocs.io/en/latest/). Allena il tuo modello in tre righe di codice in un framework, e caricalo per l'inferenza in un altro. + +Ogni architettura di 🤗 Transformers è definita in un modulo Python indipendente così da poter essere personalizzata in modo semplice per la ricerca e gli esperimenti. + +## Se stai cercando supporto personalizzato dal team di Hugging Face + + +HuggingFace Expert Acceleration Program + + +## Contenuti + +La documentazione è organizzata in cinque parti: + +- **INIZIARE** contiene un tour rapido e le istruzioni di installazione per cominciare ad utilizzare 🤗 Transformers. +- **TUTORIALS** è un buon posto da cui iniziare se per te la nostra libreria è nuova. Questa sezione ti aiuterà ad acquisire le competenze basilari di cui hai bisogno per iniziare ad utilizzare 🤗 Transformers. +- **GUIDE PRATICHE** ti mostrerà come raggiungere obiettivi specifici come fare fine-tuning di un modello pre-allenato per la modellizzazione del linguaggio o come creare una testa per un modello personalizzato. +- **GUIDE CONCETTUALI** fornisce discussioni e spiegazioni dei concetti sottostanti alle idee dietro ai modelli, compiti, e la filosofia di progettazione di 🤗 Transformers. +- **API** descrive ogni classe e funzione, raggruppate in: + - **CLASSI PRINCIPALI** per le classi principali che espongono le API importanti della libreria. + - **MODELLI** per le classi e le funzioni relative ad ogni modello implementato all'interno della libreria. + - **HELPERS INTERNI** per le classi e le funzioni che utilizziamo internamente. + +La libreria attualmente contiene implementazioni in JAX, PyTorch e TensorFlow, pesi di modelli pre-allenati, script di utilizzo e strumenti di conversione per i seguenti modelli. + +### Modelli supportati + + + +1. **[ALBERT](model_doc/albert)** (da Google Research e l'Istituto Tecnologico di Chicago) rilasciato con il paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), da Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[ALIGN](model_doc/align)** (from Google Research) rilasciato con il paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) da Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. +1. **[BART](model_doc/bart)** (da Facebook) rilasciato con il paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) da Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov e Luke Zettlemoyer. +1. 
**[BARThez](model_doc/barthez)** (da École polytechnique) rilasciato con il paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) da Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](model_doc/bartpho)** (da VinAI Research) rilasciato con il paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) da Nguyen Luong Tran, Duong Minh Le e Dat Quoc Nguyen.
+1. **[BEiT](model_doc/beit)** (da Microsoft) rilasciato con il paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) da Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](model_doc/bert)** (da Google) rilasciato con il paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) da Jacob Devlin, Ming-Wei Chang, Kenton Lee e Kristina Toutanova.
+1. **[BERTweet](model_doc/bertweet)** (da VinAI Research) rilasciato con il paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) da Dat Quoc Nguyen, Thanh Vu e Anh Tuan Nguyen.
+1. **[BERT For Sequence Generation](model_doc/bert-generation)** (da Google) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BigBird-RoBERTa](model_doc/big_bird)** (da Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (da Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[Blenderbot](model_doc/blenderbot)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](model_doc/blenderbot-small)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BORT](model_doc/bort)** (da Alexa) rilasciato con il paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) da Adrian de Wynter e Daniel J. Perry.
+1. **[ByT5](model_doc/byt5)** (da Google Research) rilasciato con il paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) da Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
+1. **[CamemBERT](model_doc/camembert)** (da Inria/Facebook/Sorbonne) rilasciato con il paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) da Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah e Benoît Sagot.
+1. 
**[CANINE](model_doc/canine)** (da Google Research) rilasciato con il paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) da Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[ConvNeXT](model_doc/convnext)** (da Facebook AI) rilasciato con il paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) da Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[ConvNeXTV2](model_doc/convnextv2)** (da Facebook AI) rilasciato con il paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) da Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. **[CLIP](model_doc/clip)** (da OpenAI) rilasciato con il paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) da Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[ConvBERT](model_doc/convbert)** (da YituTech) rilasciato con il paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) da Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[CPM](model_doc/cpm)** (dalla Università di Tsinghua) rilasciato con il paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) da Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (da Salesforce) rilasciato con il paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) da Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong e Richard Socher. +1. **[CvT](model_doc/cvt)** (da Microsoft) rilasciato con il paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) da Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (da Facebook) rilasciato con il paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) da Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (da Berkeley/Facebook/Google) rilasciato con il paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) da Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. 
**[DiT](model_doc/dit)** (da Microsoft Research) rilasciato con il paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) da Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DeiT](model_doc/deit)** (da Facebook) rilasciato con il paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) da Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETR](model_doc/detr)** (da Facebook) rilasciato con il paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) da Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (da Microsoft Research) rilasciato con il paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) da Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (da HuggingFace), rilasciato assieme al paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) da Victor Sanh, Lysandre Debut e Thomas Wolf. La stessa tecnica è stata applicata per comprimere GPT2 in [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa in [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT in [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DPR](model_doc/dpr)** (da Facebook) rilasciato con il paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) da Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, e Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (da Intel Labs) rilasciato con il paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) da René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[EfficientNet](model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (da Google Research) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ELECTRA](model_doc/electra)** (da Google Research/Stanford University) rilasciato con il paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) da Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[FlauBERT](model_doc/flaubert)** (da CNRS) rilasciato con il paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) da Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. 
**[FLAVA](model_doc/flava)** (da Facebook AI) rilasciato con il paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) da Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, e Douwe Kiela. +1. **[FNet](model_doc/fnet)** (da Google Research) rilasciato con il paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) da James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (da CMU/Google Brain) rilasciato con il paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) da Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (da KAIST) rilasciato con il paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) da Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (da OpenAI) rilasciato con il paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) da Alec Radford, Karthik Narasimhan, Tim Salimans e Ilya Sutskever. +1. **[GPT-2](model_doc/gpt2)** (da OpenAI) rilasciato con il paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) da Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** e Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (da EleutherAI) rilasciato nel repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) da Ben Wang e Aran Komatsuzaki. +1. **[GPT Neo](model_doc/gpt_neo)** (da EleutherAI) rilasciato nel repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) da Sid Black, Stella Biderman, Leo Gao, Phil Wang e Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (da EleutherAI) rilasciato con il paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) da Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[Hubert](model_doc/hubert)** (da Facebook) rilasciato con il paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) da Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (da Berkeley) rilasciato con il paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) da Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (da OpenAI) rilasciato con il paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) da Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) da Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. 
**[LayoutLMv2](model_doc/layoutlmv2)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) da Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) da Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutlxlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) da Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. **[LED](model_doc/led)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[Longformer](model_doc/longformer)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LUKE](model_doc/luke)** (da Studio Ousia) rilasciato con il paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) da Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[mLUKE](model_doc/mluke)** (da Studio Ousia) rilasciato con il paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) da Ryokan Ri, Ikuya Yamada, e Yoshimasa Tsuruoka. +1. **[LXMERT](model_doc/lxmert)** (da UNC Chapel Hill) rilasciato con il paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) da Hao Tan e Mohit Bansal. +1. **[M2M100](model_doc/m2m_100)** (da Facebook) rilasciato con il paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) da Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Modello di machine learning per le traduzioni allenato utilizzando i dati [OPUS](http://opus.nlpl.eu/) di Jörg Tiedemann. Il [Framework Marian](https://marian-nmt.github.io/) è stato sviluppato dal Microsoft Translator Team. +1. **[Mask2Former](model_doc/mask2former)** (da FAIR e UIUC) rilasciato con il paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) da Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. +1. **[MaskFormer](model_doc/maskformer)** (da Meta e UIUC) rilasciato con il paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) da Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. 
**[MBart](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) da Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[MBart-50](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) da Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](model_doc/megatron-bert)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. +1. **[MPNet](model_doc/mpnet)** (da Microsoft Research) rilasciato con il paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) da Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (da Google AI) rilasciato con il paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) da Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[Nyströmformer](model_doc/nystromformer)** (dalla Università del Wisconsin - Madison) rilasciato con il paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) da Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OneFormer](model_doc/oneformer)** (da SHI Labs) rilasciato con il paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) da Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. +1. **[OPT](master/model_doc/opt)** (da Meta AI) rilasciato con il paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) da Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. **[Pegasus](model_doc/pegasus)** (da Google) rilasciato con il paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) da Jingqing Zhang, Yao Zhao, Mohammad Saleh e Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (da Deepmind) rilasciato con il paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) da Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (da VinAI Research) rilasciato con il paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) da Dat Quoc Nguyen e Anh Tuan Nguyen. +1. 
**[PLBart](model_doc/plbart)** (da UCLA NLP) rilasciato con il paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) da Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (da Sea AI Labs) rilasciato con il paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) da Yu, Weihao e Luo, Mi e Zhou, Pan e Si, Chenyang e Zhou, Yichen e Wang, Xinchao e Feng, Jiashi e Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou. +1. **[QDQBert](model_doc/qdqbert)** (da NVIDIA) rilasciato con il paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) da Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev e Paulius Micikevicius. +1. **[REALM](model_doc/realm.html)** (da Google Research) rilasciato con il paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) da Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat e Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (da Google Research) rilasciato con il paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) da Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RemBERT](model_doc/rembert)** (da Google Research) rilasciato con il paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) da Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[RegNet](model_doc/regnet)** (da META Platforms) rilasciato con il paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) da Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. **[ResNet](model_doc/resnet)** (da Microsoft Research) rilasciato con il paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) da Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (da Facebook), rilasciato assieme al paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) da Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (da ZhuiyiTechnology), rilasciato assieme al paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) da Jianlin Su e Yu Lu e Shengfeng Pan e Bo Wen e Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (da NVIDIA) rilasciato con il paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) da Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. 
**[SEW-D](model_doc/sew_d)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (da Facebook), rilasciato assieme al paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) da Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (da Facebook), rilasciato assieme al paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) da Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (dalla Università di Tel Aviv), rilasciato assieme al paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) da Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBert](model_doc/squeezebert)** (da Berkeley) rilasciato con il paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) da Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, e Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (da Microsoft) rilasciato con il paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) da Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. **[T5](model_doc/t5)** (da Google AI) rilasciato con il paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (da Google AI) rilasciato nel repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (da Google AI) rilasciato con il paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) da Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno e Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (da Microsoft Research) rilasciato con il paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) da Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (dall'Università della California a Berkeley) rilasciato con il paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) da Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (da Google/CMU) rilasciato con il paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) da Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. 
**[TrOCR](model_doc/trocr)** (da Microsoft), rilasciato assieme al paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) da Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UniSpeech](model_doc/unispeech)** (da Microsoft Research) rilasciato con il paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) da Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (da Microsoft Research) rilasciato con il paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) da Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. **[VAN](model_doc/van)** (dalle Università di Tsinghua e Nankai) rilasciato con il paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) da Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[ViLT](model_doc/vilt)** (da NAVER AI Lab/Kakao Enterprise/Kakao Brain) rilasciato con il paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) da Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (da Google AI) rilasciato con il paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) da Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[ViTMAE](model_doc/vit_mae)** (da Meta AI) rilasciato con il paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) da Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[VisualBERT](model_doc/visual_bert)** (da UCLA NLP) rilasciato con il paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) da Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[WavLM](model_doc/wavlm)** (da Microsoft Research) rilasciato con il paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) da Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[Wav2Vec2](model_doc/wav2vec2)** (da Facebook AI) rilasciato con il paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) da Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (da Facebook AI) rilasciato con il paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) da Qiantong Xu, Alexei Baevski, Michael Auli. +1. 
**[XGLM](model_doc/xglm)** (da Facebook AI) rilasciato con il paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) da Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
+1. **[XLM](model_doc/xlm)** (da Facebook) rilasciato assieme al paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) da Guillaume Lample e Alexis Conneau.
+1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou.
+1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (da Facebook AI), rilasciato assieme al paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) da Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer e Veselin Stoyanov.
+1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (da Facebook AI), rilasciato assieme al paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) da Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
+1. **[XLNet](model_doc/xlnet)** (da Google/CMU) rilasciato con il paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) da Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (da Facebook AI) rilasciato con il paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) da Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. **[XLS-R](model_doc/xls_r)** (da Facebook AI) rilasciato con il paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) da Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
+1. **[YOLOS](model_doc/yolos)** (dalla Università della scienza e tecnologia di Huazhong) rilasciato con il paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) da Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
+1. **[YOSO](model_doc/yoso)** (dall'Università del Wisconsin - Madison) rilasciato con il paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) da Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
+
+
+### Framework supportati
+
+La tabella seguente rappresenta il supporto attuale nella libreria per ognuno di questi modelli: indica se hanno un tokenizer Python
+(chiamato "slow"), un tokenizer "fast" supportato dalla libreria 🤗 Tokenizers, e se hanno supporto in Jax (via Flax), PyTorch e/o TensorFlow.
+ + + +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| Flava | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | +| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin | ❌ | ❌ | ✅ | ✅ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ 
| ✅ | ✅ | ✅ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + diff --git a/docs/source/it/index.mdx b/docs/source/it/index.mdx deleted file mode 100644 index e612c3699b59..000000000000 --- a/docs/source/it/index.mdx +++ /dev/null @@ -1,291 +0,0 @@ - - -# 🤗 Transformers - -Machine Learning allo stato dell'arte per PyTorch, TensorFlow e JAX. - -🤗 Transformers fornisce delle API per scaricare in modo semplice e allenare modelli pre-allenati allo stato dell'arte. L'utilizzo di modelli pre-allenati può ridurre i tuoi costi computazionali, l'impatto ambientale, e farti risparmiare il tempo che utilizzeresti per allenare un modello da zero. I modelli possono essere utilizzati in diverse modalità come ad esempio: - -* 📝 Testo: classificazione del testo, estrazione delle informazioni, rispondere a domande, riassumere, traduzione e generazione del testo in più di 100 lingue. -* 🖼️ Immagini: classificazione di immagini, rilevazione di oggetti e segmentazione. -* 🗣️ Audio: riconoscimento vocale e classificazione dell'audio. -* 🐙 Multimodale: rispondere a domande inerenti dati tabulari, riconoscimento ottico dei caratteri, estrazione di informazioni a partire da documenti scannerizzati, classificazione di video e risposta visuale a domande. - -La nostra libreria supporta un'integrazione perfetta tra tre delle librerie per il deep learning più popolari: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) e [JAX](https://jax.readthedocs.io/en/latest/). Allena il tuo modello in tre righe di codice in un framework, e caricalo per l'inferenza in un altro. - -Ogni architettura di 🤗 Transformers è definita in un modulo Python indipendente così da poter essere personalizzata in modo semplice per la ricerca e gli esperimenti. - -## Se stai cercando supporto personalizzato dal team di Hugging Face - - -HuggingFace Expert Acceleration Program - - -## Contenuti - -La documentazione è organizzata in cinque parti: - -- **INIZIARE** contiene un tour rapido e le istruzioni di installazione per cominciare ad utilizzare 🤗 Transformers. -- **TUTORIALS** è un buon posto da cui iniziare se per te la nostra libreria è nuova. Questa sezione ti aiuterà ad acquisire le competenze basilari di cui hai bisogno per iniziare ad utilizzare 🤗 Transformers. -- **GUIDE PRATICHE** ti mostrerà come raggiungere obiettivi specifici come fare fine-tuning di un modello pre-allenato per la modellizzazione del linguaggio o come creare una testa per un modello personalizzato. -- **GUIDE CONCETTUALI** fornisce discussioni e spiegazioni dei concetti sottostanti alle idee dietro ai modelli, compiti, e la filosofia di progettazione di 🤗 Transformers. -- **API** descrive ogni classe e funzione, raggruppate in: - - **CLASSI PRINCIPALI** per le classi principali che espongono le API importanti della libreria. - - **MODELLI** per le classi e le funzioni relative ad ogni modello implementato all'interno della libreria. - - **HELPERS INTERNI** per le classi e le funzioni che utilizziamo internamente. 
- -La libreria attualmente contiene implementazioni in JAX, PyTorch e TensorFlow, pesi di modelli pre-allenati, script di utilizzo e strumenti di conversione per i seguenti modelli. - -### Modelli supportati - - - -1. **[ALBERT](model_doc/albert)** (da Google Research e l'Istituto Tecnologico di Chicago) rilasciato con il paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), da Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](model_doc/bart)** (da Facebook) rilasciato con il paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) da Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov e Luke Zettlemoyer. -1. **[BARThez](model_doc/barthez)** (da politecnico di École) rilasciato con il paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) da Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BARTpho](model_doc/bartpho)** (da VinAI Research) rilasciato con il paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) da Nguyen Luong Tran, Duong Minh Le e Dat Quoc Nguyen. -1. **[BEiT](model_doc/beit)** (da Microsoft) rilasciato con il paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) da Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](model_doc/bert)** (da Google) rilasciato con il paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) da Jacob Devlin, Ming-Wei Chang, Kenton Lee e Kristina Toutanova. -1. **[BERTweet](model_doc/bertweet)** (da VinAI Research) rilasciato con il paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) da Dat Quoc Nguyen, Thanh Vu e Anh Tuan Nguyen. -1. **[BERT For Sequence Generation](model_doc/bert-generation)** (da Google) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BigBird-RoBERTa](model_doc/big_bird)** (da Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (v Google Research) rilasciato con il paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) da Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](model_doc/blenderbot)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. 
**[BlenderbotSmall](model_doc/blenderbot-small)** (da Facebook) rilasciato con il paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) da Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BORT](model_doc/bort)** (da Alexa) rilasciato con il paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) da Adrian de Wynter e Daniel J. Perry. -1. **[ByT5](model_doc/byt5)** (da Google Research) rilasciato con il paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) da Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](model_doc/camembert)** (da Inria/Facebook/Sorbonne) rilasciato con il paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) da Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah e Benoît Sagot. -1. **[CANINE](model_doc/canine)** (da Google Research) rilasciato con il paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) da Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[ConvNeXT](model_doc/convnext)** (da Facebook AI) rilasciato con il paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) da Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. -1. **[CLIP](model_doc/clip)** (da OpenAI) rilasciato con il paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) da Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[ConvBERT](model_doc/convbert)** (da YituTech) rilasciato con il paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) da Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. **[CPM](model_doc/cpm)** (dalla Università di Tsinghua) rilasciato con il paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) da Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](model_doc/ctrl)** (da Salesforce) rilasciato con il paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) da Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong e Richard Socher. -1. **[CvT](model_doc/cvt)** (da Microsoft) rilasciato con il paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) da Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. -1. **[Data2Vec](model_doc/data2vec)** (da Facebook) rilasciato con il paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) da Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. 
**[DeBERTa](model_doc/deberta)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](model_doc/deberta-v2)** (da Microsoft) rilasciato con il paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) da Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[Decision Transformer](model_doc/decision_transformer)** (da Berkeley/Facebook/Google) rilasciato con il paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) da Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. -1. **[DiT](model_doc/dit)** (da Microsoft Research) rilasciato con il paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) da Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. -1. **[DeiT](model_doc/deit)** (da Facebook) rilasciato con il paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) da Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. **[DETR](model_doc/detr)** (da Facebook) rilasciato con il paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) da Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. **[DialoGPT](model_doc/dialogpt)** (da Microsoft Research) rilasciato con il paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) da Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (da HuggingFace), rilasciato assieme al paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) da Victor Sanh, Lysandre Debut e Thomas Wolf. La stessa tecnica è stata applicata per comprimere GPT2 in [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa in [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT in [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. -1. **[DPR](model_doc/dpr)** (da Facebook) rilasciato con il paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) da Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, e Wen-tau Yih. -1. **[DPT](master/model_doc/dpt)** (da Intel Labs) rilasciato con il paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) da René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. -1. **[EncoderDecoder](model_doc/encoder-decoder)** (da Google Research) rilasciato con il paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) da Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. 
**[ELECTRA](model_doc/electra)** (da Google Research/Stanford University) rilasciato con il paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) da Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[FlauBERT](model_doc/flaubert)** (da CNRS) rilasciato con il paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) da Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FLAVA](model_doc/flava)** (da Facebook AI) rilasciato con il paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) da Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, e Douwe Kiela. -1. **[FNet](model_doc/fnet)** (da Google Research) rilasciato con il paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) da James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](model_doc/funnel)** (da CMU/Google Brain) rilasciato con il paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) da Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GLPN](model_doc/glpn)** (da KAIST) rilasciato con il paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) da Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. -1. **[GPT](model_doc/openai-gpt)** (da OpenAI) rilasciato con il paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) da Alec Radford, Karthik Narasimhan, Tim Salimans e Ilya Sutskever. -1. **[GPT-2](model_doc/gpt2)** (da OpenAI) rilasciato con il paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) da Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** e Ilya Sutskever**. -1. **[GPT-J](model_doc/gptj)** (da EleutherAI) rilasciato nel repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) da Ben Wang e Aran Komatsuzaki. -1. **[GPT Neo](model_doc/gpt_neo)** (da EleutherAI) rilasciato nel repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) da Sid Black, Stella Biderman, Leo Gao, Phil Wang e Connor Leahy. -1. **[GPT NeoX](model_doc/gpt_neox)** (da EleutherAI) rilasciato con il paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) da Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach -1. **[Hubert](model_doc/hubert)** (da Facebook) rilasciato con il paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) da Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](model_doc/ibert)** (da Berkeley) rilasciato con il paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) da Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. 
**[ImageGPT](model_doc/imagegpt)** (da OpenAI) rilasciato con il paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) da Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. -1. **[LayoutLM](model_doc/layoutlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) da Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](model_doc/layoutlmv2)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) da Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. **[LayoutLMv3](model_doc/layoutlmv3)** (da Microsoft Research Asia) rilasciato con il paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) da Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. -1. **[LayoutXLM](model_doc/layoutlxlm)** (da Microsoft Research Asia) rilasciato con il paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) da Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](model_doc/led)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[Longformer](model_doc/longformer)** (da AllenAI) rilasciato con il paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) da Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LUKE](model_doc/luke)** (da Studio Ousia) rilasciato con il paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) da Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[mLUKE](model_doc/mluke)** (da Studio Ousia) rilasciato con il paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) da Ryokan Ri, Ikuya Yamada, e Yoshimasa Tsuruoka. -1. **[LXMERT](model_doc/lxmert)** (da UNC Chapel Hill) rilasciato con il paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) da Hao Tan e Mohit Bansal. -1. **[M2M100](model_doc/m2m_100)** (da Facebook) rilasciato con il paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) da Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](model_doc/marian)** Modello di machine learning per le traduzioni allenato utilizzando i dati [OPUS](http://opus.nlpl.eu/) di Jörg Tiedemann. Il [Framework Marian](https://marian-nmt.github.io/) è stato sviluppato dal Microsoft Translator Team. -1. **[MaskFormer](model_doc/maskformer)** (da Meta and UIUC) rilasciato con il paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) da Bowen Cheng, Alexander G. 
Schwing, Alexander Kirillov. -1. **[MBart](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) da Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. **[MBart-50](model_doc/mbart)** (da Facebook) rilasciato con il paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) da Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](model_doc/megatron-bert)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. -1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (da NVIDIA) rilasciato con il paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) da Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper e Bryan Catanzaro. -1. **[MPNet](model_doc/mpnet)** (da Microsoft Research) rilasciato con il paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) da Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](model_doc/mt5)** (da Google AI) rilasciato con il paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) da Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[Nyströmformer](model_doc/nystromformer)** (dalla Università del Wisconsin - Madison) rilasciato con il paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) da Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. -1. **[OPT](master/model_doc/opt)** (da Meta AI) rilasciato con il paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) da Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. -1. **[Pegasus](model_doc/pegasus)** (da Google) rilasciato con il paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) da Jingqing Zhang, Yao Zhao, Mohammad Saleh e Peter J. Liu. -1. **[Perceiver IO](model_doc/perceiver)** (da Deepmind) rilasciato con il paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) da Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. -1. **[PhoBERT](model_doc/phobert)** (da VinAI Research) rilasciato con il paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) da Dat Quoc Nguyen e Anh Tuan Nguyen. -1. **[PLBart](model_doc/plbart)** (da UCLA NLP) rilasciato con il paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) da Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. -1. 
**[PoolFormer](model_doc/poolformer)** (da Sea AI Labs) rilasciato con il paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) da Yu, Weihao e Luo, Mi e Zhou, Pan e Si, Chenyang e Zhou, Yichen e Wang, Xinchao e Feng, Jiashi e Yan, Shuicheng. -1. **[ProphetNet](model_doc/prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou. -1. **[QDQBert](model_doc/qdqbert)** (da NVIDIA) rilasciato con il paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) da Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev e Paulius Micikevicius. -1. **[REALM](model_doc/realm.html)** (da Google Research) rilasciato con il paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) da Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat e Ming-Wei Chang. -1. **[Reformer](model_doc/reformer)** (da Google Research) rilasciato con il paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) da Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RemBERT](model_doc/rembert)** (da Google Research) rilasciato con il paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) da Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[RegNet](model_doc/regnet)** (da META Platforms) rilasciato con il paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) da Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. -1. **[ResNet](model_doc/resnet)** (da Microsoft Research) rilasciato con il paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) da Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. -1. **[RoBERTa](model_doc/roberta)** (da Facebook), rilasciato assieme al paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) da Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](model_doc/roformer)** (da ZhuiyiTechnology), rilasciato assieme al paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) da Jianlin Su e Yu Lu e Shengfeng Pan e Bo Wen e Yunfeng Liu. -1. **[SegFormer](model_doc/segformer)** (da NVIDIA) rilasciato con il paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) da Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. **[SEW](model_doc/sew)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. **[SEW-D](model_doc/sew_d)** (da ASAPP) rilasciato con il paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) da Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -1. 
**[SpeechToTextTransformer](model_doc/speech_to_text)** (da Facebook), rilasciato assieme al paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) da Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (da Facebook), rilasciato assieme al paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) da Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](model_doc/splinter)** (dalla Università di Tel Aviv), rilasciato assieme al paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) da Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBert](model_doc/squeezebert)** (da Berkeley) rilasciato con il paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) da Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, e Kurt W. Keutzer. -1. **[Swin Transformer](model_doc/swin)** (da Microsoft) rilasciato con il paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) da Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. -1. **[T5](model_doc/t5)** (da Google AI) rilasciato con il paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. -1. **[T5v1.1](model_doc/t5v1.1)** (da Google AI) rilasciato nel repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) da Colin Raffel e Noam Shazeer e Adam Roberts e Katherine Lee e Sharan Narang e Michael Matena e Yanqi Zhou e Wei Li e Peter J. Liu. -1. **[TAPAS](model_doc/tapas)** (da Google AI) rilasciato con il paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) da Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno e Julian Martin Eisenschlos. -1. **[TAPEX](model_doc/tapex)** (da Microsoft Research) rilasciato con il paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) da Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. -1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (dall'Università della California a Berkeley) rilasciato con il paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) da Michael Janner, Qiyang Li, Sergey Levine -1. **[Transformer-XL](model_doc/transfo-xl)** (da Google/CMU) rilasciato con il paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) da Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[TrOCR](model_doc/trocr)** (da Microsoft), rilasciato assieme al paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) da Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -1. 
**[UniSpeech](model_doc/unispeech)** (da Microsoft Research) rilasciato con il paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) da Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. -1. **[UniSpeechSat](model_doc/unispeech-sat)** (da Microsoft Research) rilasciato con il paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) da Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. -1. **[VAN](model_doc/van)** (dalle Università di Tsinghua e Nankai) rilasciato con il paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) da Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[ViLT](model_doc/vilt)** (da NAVER AI Lab/Kakao Enterprise/Kakao Brain) rilasciato con il paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) da Wonjae Kim, Bokyung Son, Ildoo Kim. -1. **[Vision Transformer (ViT)](model_doc/vit)** (da Google AI) rilasciato con il paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) da Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[ViTMAE](model_doc/vit_mae)** (da Meta AI) rilasciato con il paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) da Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. -1. **[VisualBERT](model_doc/visual_bert)** (da UCLA NLP) rilasciato con il paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) da Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[WavLM](model_doc/wavlm)** (da Microsoft Research) rilasciato con il paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) da Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. -1. **[Wav2Vec2](model_doc/wav2vec2)** (da Facebook AI) rilasciato con il paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) da Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (da Facebook AI) rilasciato con il paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) da Qiantong Xu, Alexei Baevski, Michael Auli. -1. **[XGLM](model_doc/xglm)** (da Facebook AI) rilasciato con il paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) da Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. -1. 
**[XLM](model_doc/xlm)** (v Facebook) rilasciato assieme al paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) da Guillaume Lample e Alexis Conneau. -1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (da Microsoft Research) rilasciato con il paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) da Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang e Ming Zhou. -1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (da Facebook AI), rilasciato assieme al paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) da Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer e Veselin Stoyanov. -1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (da Facebook AI), rilasciato assieme al paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) da Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. -1. **[XLNet](model_doc/xlnet)** (da Google/CMU) rilasciato con il paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) da Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (da Facebook AI) rilasciato con il paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) da Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -1. **[XLS-R](model_doc/xls_r)** (da Facebook AI) rilasciato con il paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) da Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[YOLOS](model_doc/yolos)** (dalla Università della scienza e tecnologia di Huazhong) rilasciato con il paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) da Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. -1. **[YOSO](model_doc/yoso)** (dall'Università del Wisconsin - Madison) rilasciato con il paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) da Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. - - -### Framework supportati - -La tabella seguente rappresenta il supporto attuale nella libreria per ognuno di questi modelli, si può identificare se questi hanno un Python -tokenizer (chiamato "slow"). Un tokenizer "fast" supportato dalla libreria 🤗 Tokenizers, e se hanno supporto in Jax (via Flax), PyTorch, e/o TensorFlow. 
- - - -| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | -| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ | -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -| CvT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | -| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | -| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -| Flava | ❌ | ❌ | ✅ | ❌ | ❌ | -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | -| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -| OPT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | -| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | -| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -| ResNet | ❌ | ❌ | ✅ | ❌ | ❌ | -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | -| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -| Swin | ❌ | ❌ | ✅ | ✅ | ❌ | -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | -| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | -| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | -| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | -| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | -| ViT | ❌ | ❌ 
| ✅ | ✅ | ✅ | -| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | -| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | -| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | -| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | - - diff --git a/docs/source/it/installation.md b/docs/source/it/installation.md new file mode 100644 index 000000000000..4f884f80d936 --- /dev/null +++ b/docs/source/it/installation.md @@ -0,0 +1,239 @@ + + +# Installazione + +Installa 🤗 Transformers per qualsiasi libreria di deep learning con cui stai lavorando, imposta la tua cache, e opzionalmente configura 🤗 Transformers per l'esecuzione offline. + +🤗 Transformers è testato su Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, e Flax. Segui le istruzioni di installazione seguenti per la libreria di deep learning che stai utilizzando: + +* [PyTorch](https://pytorch.org/get-started/locally/) istruzioni di installazione. +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) istruzioni di installazione. +* [Flax](https://flax.readthedocs.io/en/latest/) istruzioni di installazione. + +## Installazione con pip + +Puoi installare 🤗 Transformers in un [ambiente virtuale](https://docs.python.org/3/library/venv.html). Se non sei familiare con gli ambienti virtuali in Python, dai un'occhiata a questa [guida](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Un ambiente virtuale rende più semplice la gestione di progetti differenti, evitando problemi di compatibilità tra dipendenze. + +Inizia creando un ambiente virtuale nella directory del tuo progetto: + +```bash +python -m venv .env +``` + +Attiva l'ambiente virtuale: + +```bash +source .env/bin/activate +``` + +Ora puoi procedere con l'installazione di 🤗 Transformers eseguendo il comando seguente: + +```bash +pip install transformers +``` + +Per il solo supporto della CPU, puoi installare facilmente 🤗 Transformers e una libreria di deep learning in solo una riga. Ad esempio, installiamo 🤗 Transformers e PyTorch con: + +```bash +pip install transformers[torch] +``` + +🤗 Transformers e TensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 Transformers e Flax: + +```bash +pip install transformers[flax] +``` + +Infine, verifica se 🤗 Transformers è stato installato in modo appropriato eseguendo il seguente comando. Questo scaricherà un modello pre-allenato: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` + +Dopodiché stampa l'etichetta e il punteggio: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## Installazione dalla fonte + +Installa 🤗 Transformers dalla fonte con il seguente comando: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +Questo comando installa la versione `main` più attuale invece dell'ultima versione stabile. Questo è utile per stare al passo con gli ultimi sviluppi. Ad esempio, se un bug è stato sistemato da quando è uscita l'ultima versione ufficiale ma non è stata ancora rilasciata una nuova versione. Tuttavia, questo significa che questa versione `main` può non essere sempre stabile. Ci sforziamo per mantenere la versione `main` operativa, e la maggior parte dei problemi viene risolta in poche ore o in un giorno. 
Se riscontri un problema, per favore apri una [Issue](https://github.com/huggingface/transformers/issues) così possiamo sistemarlo ancora più velocemente! + +Controlla se 🤗 Transformers è stata installata in modo appropriato con il seguente comando: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## Installazione modificabile + +Hai bisogno di un'installazione modificabile se vuoi: + +* Usare la versione `main` del codice dalla fonte. +* Contribuire a 🤗 Transformers e hai bisogno di testare i cambiamenti nel codice. + +Clona il repository e installa 🤗 Transformers con i seguenti comandi: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +Questi comandi collegheranno la cartella in cui è stato clonato il repository e i path delle librerie Python. Python guarderà ora all'interno della cartella clonata, oltre ai normali path delle librerie. Per esempio, se i tuoi pacchetti Python sono installati tipicamente in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python cercherà anche nella cartella clonata: `~/transformers/`. + + + +Devi tenere la cartella `transformers` se vuoi continuare ad utilizzare la libreria. + + + +Ora puoi facilmente aggiornare il tuo clone all'ultima versione di 🤗 Transformers con il seguente comando: + +```bash +cd ~/transformers/ +git pull +``` + +Il tuo ambiente Python troverà la versione `main` di 🤗 Transformers alla prossima esecuzione. + +## Installazione con conda + +Installazione dal canale conda `huggingface`: + +```bash +conda install -c huggingface transformers +``` + +## Impostazione della cache + +I modelli pre-allenati sono scaricati e memorizzati localmente nella cache in: `~/.cache/huggingface/transformers/`. Questa è la directory di default data dalla variabile d'ambiente della shell `TRANSFORMERS_CACHE`. Su Windows, la directory di default è data da `C:\Users\username\.cache\huggingface\transformers`. Puoi cambiare le variabili d'ambiente della shell indicate in seguito, in ordine di priorità, per specificare una directory differente per la cache: + +1. Variabile d'ambiente della shell (default): `TRANSFORMERS_CACHE`. +2. Variabile d'ambiente della shell: `HF_HOME` + `transformers/`. +3. Variabile d'ambiente della shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. + + + +🤗 Transformers utilizzerà le variabili d'ambiente della shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` se si proviene da un'iterazione precedente di questa libreria e sono state impostate queste variabili d'ambiente, a meno che non si specifichi la variabile d'ambiente della shell `TRANSFORMERS_CACHE`. + + + +## Modalità Offline + +🤗 Transformers può essere eseguita in un ambiente firewalled o offline utilizzando solo file locali. Imposta la variabile d'ambiente `TRANSFORMERS_OFFLINE=1` per abilitare questo comportamento. + + + +Aggiungi [🤗 Datasets](https://huggingface.co/docs/datasets/) al tuo flusso di lavoro offline di training impostando la variabile d'ambiente `HF_DATASETS_OFFLINE=1`. + + + +Ad esempio, in genere si esegue un programma su una rete normale, protetta da firewall per le istanze esterne, con il seguente comando: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... 
+``` + +Esegui lo stesso programma in un'istanza offline con: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +Lo script viene ora eseguito senza bloccarsi o attendere il timeout, perché sa di dover cercare solo file locali. + +### Ottenere modelli e tokenizer per l'uso offline + +Un'altra opzione per utilizzare offline 🤗 Transformers è scaricare i file in anticipo, e poi puntare al loro path locale quando hai la necessità di utilizzarli offline. Ci sono tre modi per fare questo: + +* Scarica un file tramite l'interfaccia utente sul [Model Hub](https://huggingface.co/models) premendo sull'icona ↓. + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + +* Utilizza il flusso [`PreTrainedModel.from_pretrained`] e [`PreTrainedModel.save_pretrained`]: + + 1. Scarica i tuoi file in anticipo con [`PreTrainedModel.from_pretrained`]: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + 2. Salva i tuoi file in una directory specificata con [`PreTrainedModel.save_pretrained`]: + + ```py + >>> tokenizer.save_pretrained("./il/tuo/path/bigscience_t0") + >>> model.save_pretrained("./il/tuo/path/bigscience_t0") + ``` + + 3. Ora quando sei offline, carica i tuoi file con [`PreTrainedModel.from_pretrained`] dalla directory specificata: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./il/tuo/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./il/tuo/path/bigscience_t0") + ``` + +* Scarica in maniera programmatica i file con la libreria [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): + + 1. Installa la libreria `huggingface_hub` nel tuo ambiente virtuale: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. Utilizza la funzione [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) per scaricare un file in un path specifico. Per esempio, il seguente comando scarica il file `config.json` dal modello [T0](https://huggingface.co/bigscience/T0_3B) nel path che desideri: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./il/tuo/path/bigscience_t0") + ``` + +Una volta che il tuo file è scaricato e salvato in cache localmente, specifica il suo path locale per caricarlo e utilizzarlo: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./il/tuo/path/bigscience_t0/config.json") +``` + + + +Fai riferimento alla sezione [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) per avere maggiori dettagli su come scaricare modelli presenti sull Hub. + + \ No newline at end of file diff --git a/docs/source/it/installation.mdx b/docs/source/it/installation.mdx deleted file mode 100644 index 1ff47c110cff..000000000000 --- a/docs/source/it/installation.mdx +++ /dev/null @@ -1,235 +0,0 @@ - - -# Installazione - -Installa 🤗 Transformers per qualsiasi libreria di deep learning con cui stai lavorando, imposta la tua cache, e opzionalmente configura 🤗 Transformers per l'esecuzione offline. 
- -🤗 Transformers è testato su Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, e Flax. Segui le istruzioni di installazione seguenti per la libreria di deep learning che stai utilizzando: - -* [PyTorch](https://pytorch.org/get-started/locally/) istruzioni di installazione. -* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) istruzioni di installazione. -* [Flax](https://flax.readthedocs.io/en/latest/) istruzioni di installazione. - -## Installazione con pip - -Puoi installare 🤗 Transformers in un [ambiente virtuale](https://docs.python.org/3/library/venv.html). Se non sei familiare con gli ambienti virtuali in Python, dai un'occhiata a questa [guida](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Un ambiente virtuale rende più semplice la gestione di progetti differenti, evitando problemi di compatibilità tra dipendenze. - -Inizia creando un ambiente virtuale nella directory del tuo progetto: - -```bash -python -m venv .env -``` - -Attiva l'ambiente virtuale: - -```bash -source .env/bin/activate -``` - -Ora puoi procedere con l'installazione di 🤗 Transformers eseguendo il comando seguente: - -```bash -pip install transformers -``` - -Per il solo supporto della CPU, puoi installare facilmente 🤗 Transformers e una libreria di deep learning in solo una riga. Ad esempio, installiamo 🤗 Transformers e PyTorch con: - -```bash -pip install transformers[torch] -``` - -🤗 Transformers e TensorFlow 2.0: - -```bash -pip install transformers[tf-cpu] -``` - -🤗 Transformers e Flax: - -```bash -pip install transformers[flax] -``` - -Infine, verifica se 🤗 Transformers è stato installato in modo appropriato eseguendo il seguente comando. Questo scaricherà un modello pre-allenato: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" -``` - -Dopodiché stampa l'etichetta e il punteggio: - -```bash -[{'label': 'POSITIVE', 'score': 0.9998704791069031}] -``` - -## Installazione dalla fonte - -Installa 🤗 Transformers dalla fonte con il seguente comando: - -```bash -pip install git+https://github.com/huggingface/transformers -``` - -Questo comando installa la versione `main` più attuale invece dell'ultima versione stabile. Questo è utile per stare al passo con gli ultimi sviluppi. Ad esempio, se un bug è stato sistemato da quando è uscita l'ultima versione ufficiale ma non è stata ancora rilasciata una nuova versione. Tuttavia, questo significa che questa versione `main` può non essere sempre stabile. Ci sforziamo per mantenere la versione `main` operativa, e la maggior parte dei problemi viene risolta in poche ore o in un giorno. Se riscontri un problema, per favore apri una [Issue](https://github.com/huggingface/transformers/issues) così possiamo sistemarlo ancora più velocemente! - -Controlla se 🤗 Transformers è stata installata in modo appropriato con il seguente comando: - -```bash -python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" -``` - -## Installazione modificabile - -Hai bisogno di un'installazione modificabile se vuoi: - -* Usare la versione `main` del codice dalla fonte. -* Contribuire a 🤗 Transformers e hai bisogno di testare i cambiamenti nel codice. - -Clona il repository e installa 🤗 Transformers con i seguenti comandi: - -```bash -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install -e . -``` - -Questi comandi collegheranno la cartella in cui è stato clonato il repository e i path delle librerie Python. 
Python guarderà ora all'interno della cartella clonata, oltre ai normali path delle librerie. Per esempio, se i tuoi pacchetti Python sono installati tipicamente in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python cercherà anche nella cartella clonata: `~/transformers/`. - - - -Devi tenere la cartella `transformers` se vuoi continuare ad utilizzare la libreria. - - - -Ora puoi facilmente aggiornare il tuo clone all'ultima versione di 🤗 Transformers con il seguente comando: - -```bash -cd ~/transformers/ -git pull -``` - -Il tuo ambiente Python troverà la versione `main` di 🤗 Transformers alla prossima esecuzione. - -## Installazione con conda - -Installazione dal canale conda `huggingface`: - -```bash -conda install -c huggingface transformers -``` - -## Impostazione della cache - -I modelli pre-allenati sono scaricati e memorizzati localmente nella cache in: `~/.cache/huggingface/transformers/`. Questa è la directory di default data dalla variabile d'ambiente della shell `TRANSFORMERS_CACHE`. Su Windows, la directory di default è data da `C:\Users\username\.cache\huggingface\transformers`. Puoi cambiare le variabili d'ambiente della shell indicate in seguito, in ordine di priorità, per specificare una directory differente per la cache: - -1. Variabile d'ambiente della shell (default): `TRANSFORMERS_CACHE`. -2. Variabile d'ambiente della shell: `HF_HOME` + `transformers/`. -3. Variabile d'ambiente della shell: `XDG_CACHE_HOME` + `/huggingface/transformers`. - - - -🤗 Transformers utilizzerà le variabili d'ambiente della shell `PYTORCH_TRANSFORMERS_CACHE` o `PYTORCH_PRETRAINED_BERT_CACHE` se si proviene da un'iterazione precedente di questa libreria e sono state impostate queste variabili d'ambiente, a meno che non si specifichi la variabile d'ambiente della shell `TRANSFORMERS_CACHE`. - - - -## Modalità Offline - -🤗 Transformers può essere eseguita in un ambiente firewalled o offline utilizzando solo file locali. Imposta la variabile d'ambiente `TRANSFORMERS_OFFLINE=1` per abilitare questo comportamento. - - - -Aggiungi [🤗 Datasets](https://huggingface.co/docs/datasets/) al tuo flusso di lavoro offline di training impostando la variabile d'ambiente `HF_DATASETS_OFFLINE=1`. - - - -Ad esempio, in genere si esegue un programma su una rete normale, protetta da firewall per le istanze esterne, con il seguente comando: - -```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Esegui lo stesso programma in un'istanza offline con: - -```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... -``` - -Lo script viene ora eseguito senza bloccarsi o attendere il timeout, perché sa di dover cercare solo file locali. - -### Ottenere modelli e tokenizer per l'uso offline - -Un'altra opzione per utilizzare offline 🤗 Transformers è scaricare i file in anticipo, e poi puntare al loro path locale quando hai la necessità di utilizzarli offline. Ci sono tre modi per fare questo: - -* Scarica un file tramite l'interfaccia utente sul [Model Hub](https://huggingface.co/models) premendo sull'icona ↓. - - ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) - -* Utilizza il flusso [`PreTrainedModel.from_pretrained`] e [`PreTrainedModel.save_pretrained`]: - - 1. 
Scarica i tuoi file in anticipo con [`PreTrainedModel.from_pretrained`]: - - ```py - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") - ``` - - 2. Salva i tuoi file in una directory specificata con [`PreTrainedModel.save_pretrained`]: - - ```py - >>> tokenizer.save_pretrained("./il/tuo/path/bigscience_t0") - >>> model.save_pretrained("./il/tuo/path/bigscience_t0") - ``` - - 3. Ora quando sei offline, carica i tuoi file con [`PreTrainedModel.from_pretrained`] dalla directory specificata: - - ```py - >>> tokenizer = AutoTokenizer.from_pretrained("./il/tuo/path/bigscience_t0") - >>> model = AutoModel.from_pretrained("./il/tuo/path/bigscience_t0") - ``` - -* Scarica in maniera programmatica i file con la libreria [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub): - - 1. Installa la libreria `huggingface_hub` nel tuo ambiente virtuale: - - ```bash - python -m pip install huggingface_hub - ``` - - 2. Utilizza la funzione [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) per scaricare un file in un path specifico. Per esempio, il seguente comando scarica il file `config.json` dal modello [T0](https://huggingface.co/bigscience/T0_3B) nel path che desideri: - - ```py - >>> from huggingface_hub import hf_hub_download - - >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./il/tuo/path/bigscience_t0") - ``` - -Una volta che il tuo file è scaricato e salvato in cache localmente, specifica il suo path locale per caricarlo e utilizzarlo: - -```py ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("./il/tuo/path/bigscience_t0/config.json") -``` - - - -Fai riferimento alla sezione [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) per avere maggiori dettagli su come scaricare modelli presenti sull Hub. - - \ No newline at end of file diff --git a/docs/source/it/migration.md b/docs/source/it/migration.md new file mode 100644 index 000000000000..3b3b71da4d49 --- /dev/null +++ b/docs/source/it/migration.md @@ -0,0 +1,320 @@ + + +# Migrazione da pacchetti precedenti + +## Migrazione da transformers `v3.x` a `v4.x` + +Un paio di modifiche sono state introdotte nel passaggio dalla versione 3 alla versione 4. Di seguito è riportato un riepilogo delle +modifiche previste: + +#### 1. AutoTokenizer e pipeline ora utilizzano tokenizer veloci (rust) per impostazione predefinita. + +I tokenizer python e rust hanno all'incirca le stesse API, ma i tokenizer rust hanno un set di funzionalità più completo. + +Ciò introduce due modifiche sostanziali: +- La gestione dei token in overflow tra i tokenizer Python e Rust è diversa. +- I tokenizers di rust non accettano numeri interi nei metodi di codifica. + +##### Come ottenere lo stesso comportamento di v3.x in v4.x + +- Le pipeline ora contengono funzionalità aggiuntive pronte all'uso. Vedi la [pipeline di classificazione dei token con il flag `grouped_entities`](main_classes/pipelines#transformers.TokenClassificationPipeline). +- Gli auto-tokenizer ora restituiscono tokenizer rust. 
Per ottenere invece i tokenizer python, l'utente deve usare il flag `use_fast` impostandolo a `False`:
+
+Nella versione `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+per ottenere lo stesso nella versione `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece è stato rimosso dalle dipendenze richieste
+
+Il requisito sulla dipendenza SentencePiece è stato rimosso da `setup.py`. È stato fatto per avere un canale su anaconda cloud senza basarsi su `conda-forge`. Ciò significa che i tokenizer che dipendono dalla libreria SentencePiece non saranno disponibili con un'installazione standard di `transformers`.
+
+Ciò include le versioni **lente** di:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### Come ottenere lo stesso comportamento della v3.x nella v4.x
+
+Per ottenere lo stesso comportamento della versione `v3.x`, devi installare anche `sentencepiece`:
+
+Nella versione `v3.x`:
+```bash
+pip install transformers
+```
+per ottenere lo stesso nella versione `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+o
+```bash
+pip install transformers sentencepiece
+```
+
+#### 3. L'architettura delle repo è stata aggiornata in modo che ogni modello abbia la propria cartella
+
+Con l'aggiunta di nuovi modelli, il numero di file nella cartella `src/transformers` continua a crescere e diventa più difficile navigare e capire. Abbiamo fatto la scelta di inserire ogni modello e i file che lo accompagnano nelle proprie sottocartelle.
+
+Si tratta di una modifica sostanziale in quanto l'importazione di layer intermedi utilizzando direttamente il modulo di un modello deve essere eseguita tramite un percorso diverso.
+
+##### Come ottenere lo stesso comportamento della v3.x nella v4.x
+
+Per ottenere lo stesso comportamento della versione `v3.x`, devi aggiornare il percorso utilizzato per accedere ai layer.
+
+Nella versione `v3.x`:
+```py
+from transformers.modeling_bert import BertLayer
+```
+per ottenere lo stesso nella versione `v4.x`:
+```py
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Impostare l'argomento `return_dict` su `True` per impostazione predefinita
+
+L'[argomento `return_dict`](main_classes/output) abilita la restituzione di oggetti python dict-like contenenti gli output del modello, invece delle tuple standard. Questo oggetto è self-documented poiché le chiavi possono essere utilizzate per recuperare valori, comportandosi anche come una tupla e gli utenti possono recuperare oggetti per indexing o slicing.
+
+Questa è una modifica sostanziale poiché la tupla non può essere decompressa: `value0, value1 = outputs` non funzionerà.
+
+##### Come ottenere lo stesso comportamento della v3.x nella v4.x
+
+Per ottenere lo stesso comportamento della versione `v3.x`, specifica l'argomento `return_dict` come `False`, sia nella configurazione del modello che durante il forward pass.
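+
+A titolo di confronto, ecco uno schizzo minimo (ipotetico, non presente nella guida originale) di come si accede a un output dict-like quando `return_dict` resta `True`, il comportamento predefinito in `v4.x`:
+
+```py
+from transformers import BertModel, BertTokenizer
+
+tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+model = BertModel.from_pretrained("bert-base-cased")
+
+inputs = tokenizer("Ciao!", return_tensors="pt")
+outputs = model(**inputs)  # return_dict=True è il default in v4.x
+
+hidden = outputs.last_hidden_state  # accesso come attributo
+hidden = outputs["last_hidden_state"]  # accesso come chiave
+hidden = outputs[0]  # indexing, come una tupla
+```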
+
+Nella versione `v3.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+per ottenere lo stesso nella versione `v4.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+o
+```py
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Rimozione di alcuni attributi deprecati
+
+Gli attributi sono stati rimossi se deprecati da almeno un mese. L'elenco completo degli attributi obsoleti è disponibile in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Ecco un elenco di questi attributi/metodi/argomenti e quali dovrebbero essere le loro sostituzioni:
+
+In diversi modelli, le etichette diventano coerenti con gli altri modelli:
+- `masked_lm_labels` diventa `labels` in `AlbertForMaskedLM` e `AlbertForPreTraining`.
+- `masked_lm_labels` diventa `labels` in `BertForMaskedLM` e `BertForPreTraining`.
+- `masked_lm_labels` diventa `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` diventa `labels` in `RobertaForMaskedLM`.
+- `lm_labels` diventa `labels` in `BartForConditionalGeneration`.
+- `lm_labels` diventa `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` diventa `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` diventa `labels` in `T5ForConditionalGeneration`.
+
+In diversi modelli, il meccanismo di memorizzazione nella cache diventa coerente con gli altri:
+- `decoder_cached_states` diventa `past_key_values` in tutti i modelli BART-like, FSMT e T5.
+- `decoder_past_key_values` diventa `past_key_values` in tutti i modelli BART-like, FSMT e T5.
+- `past` diventa `past_key_values` in tutti i modelli CTRL.
+- `past` diventa `past_key_values` in tutti i modelli GPT-2.
+
+Per quanto riguarda le classi tokenizer:
+- L'attributo tokenizer `max_len` diventa `model_max_length`.
+- L'attributo tokenizer `return_lengths` diventa `return_length`.
+- L'argomento di codifica del tokenizer `is_pretokenized` diventa `is_split_into_words`.
+
+Per quanto riguarda la classe `Trainer`:
+- L'argomento `tb_writer` di `Trainer` è stato rimosso in favore della funzione richiamabile `TensorBoardCallback(tb_writer=...)`.
+- L'argomento `prediction_loss_only` di `Trainer` è stato rimosso in favore dell'argomento di classe `args.prediction_loss_only`.
+- L'attributo `data_collator` di `Trainer` sarà richiamabile.
+- Il metodo `_log` di `Trainer` è deprecato a favore di `log`.
+- Il metodo `_training_step` di `Trainer` è deprecato a favore di `training_step`.
+- Il metodo `_prediction_loop` di `Trainer` è deprecato a favore di `prediction_loop`.
+- Il metodo `is_local_master` di `Trainer` è deprecato a favore di `is_local_process_zero`.
+- Il metodo `is_world_master` di `Trainer` è deprecato a favore di `is_world_process_zero`.
+
+Per quanto riguarda la classe `TFTrainer`:
+- L'argomento `prediction_loss_only` di `TFTrainer` è stato rimosso a favore dell'argomento di classe `args.prediction_loss_only`.
+- Il metodo `_log` di `Trainer` è deprecato a favore di `log`.
+- Il metodo `_prediction_loop` di `TFTrainer` è deprecato a favore di `prediction_loop`.
+- Il metodo `_setup_wandb` di `TFTrainer` è deprecato a favore di `setup_wandb`.
+- Il metodo `_run_model` di `TFTrainer` è deprecato a favore di `run_model`.
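+
+Ad esempio, uno schizzo minimo (ipotetico, non presente nella guida originale) di come sostituire i vecchi metodi di `Trainer` elencati sopra con i nuovi nomi:
+
+```py
+from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
+
+# Trainer minimale, solo per illustrare i nuovi nomi dei metodi
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+trainer = Trainer(model=model, args=TrainingArguments(output_dir="./out"))
+
+# v3.x (deprecato/rimosso):
+#   if trainer.is_world_master():
+#       trainer._log({"eval_loss": 0.5})
+
+# v4.x:
+if trainer.is_world_process_zero():
+    trainer.log({"eval_loss": 0.5})
+```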
+
+Per quanto riguarda la classe `TrainingArguments`:
+- L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `evaluation_strategy`.
+
+Per quanto riguarda il modello Transfo-XL:
+- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_words_embeddings`.
+- Il metodo di modellazione `reset_length` di Transfo-XL diventa `reset_memory_length`.
+
+Per quanto riguarda le pipeline:
+- L'argomento `topk` di `FillMaskPipeline` diventa `top_k`.
+
+
+
+## Passaggio da pytorch-transformers a 🤗 Transformers
+
+Ecco un breve riepilogo di ciò a cui prestare attenzione durante il passaggio da `pytorch-transformers` a 🤗 Transformers.
+
+### L'ordine posizionale di alcune parole chiave di input dei modelli (`attention_mask`, `token_type_ids`...) è cambiato
+
+Per usare Torchscript (vedi #1010, #1204 e #1195) l'ordine specifico delle **parole chiave di input** di alcuni modelli (`attention_mask`, `token_type_ids`...) è stato modificato.
+
+Se chiamavi i modelli usando parole chiave per gli argomenti, ad esempio `model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, questo non dovrebbe causare alcun cambiamento.
+
+Se chiamavi i modelli con input posizionali per gli argomenti, ad esempio `model(input_ids, attention_mask, token_type_ids)`, potrebbe essere necessario ricontrollare l'ordine esatto degli argomenti di input.
+
+## Migrazione da pytorch-pretrained-bert
+
+Ecco un breve riepilogo di ciò a cui prestare attenzione durante la migrazione da `pytorch-pretrained-bert` a 🤗 Transformers.
+
+### I modelli restituiscono sempre `tuple`
+
+La principale modifica di rilievo durante la migrazione da `pytorch-pretrained-bert` a 🤗 Transformers è che il metodo di previsione (forward) dei modelli restituisce sempre una `tupla` con vari elementi, a seconda del modello e dei parametri di configurazione.
+
+Il contenuto esatto delle tuple per ciascun modello è mostrato in dettaglio nelle docstring dei modelli e nella [documentazione](https://huggingface.co/transformers/).
+
+In quasi tutti i casi, andrà bene prendere il primo elemento dell'output come quello che avresti precedentemente utilizzato in `pytorch-pretrained-bert`.
+
+Ecco un esempio di conversione da `pytorch-pretrained-bert`
+ a 🤗 Transformers per un modello di classificazione `BertForSequenceClassification`:
+
+```python
+# Carichiamo il nostro modello
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+# Se usavi questa riga in pytorch-pretrained-bert:
+loss = model(input_ids, labels=labels)
+
+# Ora usa questa riga in 🤗 Transformers per estrarre la perdita dalla tupla di output:
+outputs = model(input_ids, labels=labels)
+loss = outputs[0]
+
+# In 🤗 Transformers puoi anche avere accesso ai logit:
+loss, logits = outputs[:2]
+
+# Ed anche agli attention weight se configuri il modello per restituirli (e anche altri output, vedi le docstring e la documentazione)
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
+outputs = model(input_ids, labels=labels)
+loss, logits, attentions = outputs
+```
+
+### Serializzazione
+
+Modifica sostanziale nel metodo `from_pretrained()`:
+
+1. I modelli sono ora impostati in modalità di valutazione in maniera predefinita quando usi il metodo `from_pretrained()`. Per addestrarli non dimenticare di riportarli in modalità di addestramento (`model.train()`) per attivare i moduli di dropout.
+
+2.
Gli argomenti aggiuntivi `*inputs` e `**kwargs` forniti al metodo `from_pretrained()` venivano passati direttamente al metodo `__init__()` della classe sottostante del modello. Ora sono usati per aggiornare prima l'attributo di configurazione del modello, il che può non funzionare con le classi di modello derivate costruite basandosi sui precedenti esempi di `BertForSequenceClassification`. Più precisamente, gli argomenti posizionali `*inputs` forniti a `from_pretrained()` vengono inoltrati direttamente al metodo `__init__()` del modello, mentre gli argomenti keyword `**kwargs` che (i) corrispondono agli attributi della classe di configurazione vengono utilizzati per aggiornare tali attributi, e quelli che (ii) non corrispondono ad alcun attributo della classe di configurazione vengono inoltrati al metodo `__init__()`.
+
+Inoltre, sebbene non si tratti di una modifica sostanziale, i metodi di serializzazione sono stati standardizzati e probabilmente dovresti passare al nuovo metodo `save_pretrained(save_directory)` se prima usavi qualsiasi altro metodo di serializzazione.
+
+Ecco un esempio:
+
+```python
+### Carichiamo un modello e un tokenizer
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+### Facciamo fare alcune cose al nostro modello e tokenizer
+# Es: aggiungiamo nuovi token al vocabolario e agli embedding del nostro modello
+tokenizer.add_tokens(["[SPECIAL_TOKEN_1]", "[SPECIAL_TOKEN_2]"])
+model.resize_token_embeddings(len(tokenizer))
+# Alleniamo il nostro modello
+train(model)
+
+### Ora salviamo il nostro modello e il tokenizer in una cartella
+model.save_pretrained("./my_saved_model_directory/")
+tokenizer.save_pretrained("./my_saved_model_directory/")
+
+### Ricarichiamo il modello e il tokenizer
+model = BertForSequenceClassification.from_pretrained("./my_saved_model_directory/")
+tokenizer = BertTokenizer.from_pretrained("./my_saved_model_directory/")
+```
+
+### Ottimizzatori: BertAdam e OpenAIAdam ora sono AdamW, lo scheduling è quello standard PyTorch
+
+I due ottimizzatori precedenti inclusi, `BertAdam` e `OpenAIAdam`, sono stati sostituiti da un singolo `AdamW` che presenta alcune differenze:
+
+- implementa solo la correzione del weight decay,
+- lo scheduling ora è esterno (vedi sotto),
+- anche il gradient clipping ora è esterno (vedi sotto).
+
+Il nuovo ottimizzatore `AdamW` corrisponde alle API di `Adam` di PyTorch e ti consente di utilizzare metodi PyTorch o apex per lo scheduling e il clipping.
+
+Gli scheduler sono ora i [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) standard e non fanno più parte dell'ottimizzatore.
+
+Ecco un esempio di linear warmup e decay con `BertAdam` e con `AdamW`:
+
+```python
+# Parametri:
+lr = 1e-3
+max_grad_norm = 1.0
+num_training_steps = 1000
+num_warmup_steps = 100
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
+
+### In precedenza l'ottimizzatore BertAdam veniva istanziato in questo modo:
+optimizer = BertAdam(
+    model.parameters(),
+    lr=lr,
+    schedule="warmup_linear",
+    warmup=warmup_proportion,
+    t_total=num_training_steps,
+)
+### e usato in questo modo:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    optimizer.step()
+
+### In 🤗 Transformers, ottimizzatore e schedule sono divisi e usati in questo modo:
+optimizer = AdamW(
+    model.parameters(), lr=lr, correct_bias=False
+)  # Per riprodurre il comportamento specifico di BertAdam impostare correct_bias=False
+scheduler = get_linear_schedule_with_warmup(
+    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
+)  # PyTorch scheduler
+### e va usato così:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    torch.nn.utils.clip_grad_norm_(
+        model.parameters(), max_grad_norm
+    )  # Gradient clipping non è più in AdamW (quindi puoi usare amp senza problemi)
+    optimizer.step()
+    scheduler.step()
+```
diff --git a/docs/source/it/model_sharing.md b/docs/source/it/model_sharing.md
new file mode 100644
index 000000000000..351cf57bf96b
--- /dev/null
+++ b/docs/source/it/model_sharing.md
@@ -0,0 +1,238 @@
+
+
+# Condividi un modello
+
+Gli ultimi due tutorial ti hanno mostrato come puoi fare fine-tuning di un modello con PyTorch, Keras e 🤗 Accelerate per configurazioni distribuite. Il prossimo passo è quello di condividere il tuo modello con la community! Noi di Hugging Face crediamo nella condivisione della conoscenza e delle risorse in modo da democratizzare l'intelligenza artificiale per chiunque. Ti incoraggiamo a considerare di condividere il tuo modello con la community per aiutare altre persone a risparmiare tempo e risorse.
+
+In questo tutorial, imparerai due metodi per la condivisione di un modello trained o fine-tuned nel [Model Hub](https://huggingface.co/models):
+
+- Condividi in modo programmatico i tuoi file nell'Hub.
+- Trascina i tuoi file nell'Hub mediante interfaccia grafica.
+
+
+
+
+
+
+Per condividere un modello con la community, hai bisogno di un account su [huggingface.co](https://huggingface.co/join). Puoi anche unirti ad un'organizzazione esistente o crearne una nuova.
+
+
+
+## Caratteristiche dei repository
+
+Ogni repository nel Model Hub si comporta come un tipico repository di GitHub. I nostri repository offrono il versionamento, la cronologia dei commit, e la possibilità di visualizzare le differenze.
+
+Il versionamento all'interno del Model Hub è basato su git e [git-lfs](https://git-lfs.github.com/). In altre parole, puoi trattare un modello come un unico repository, consentendo un maggiore controllo degli accessi e maggiore scalabilità. Il controllo delle versioni consente *revisions*, un metodo per appuntare una versione specifica di un modello con un hash di commit, un tag o un branch.
+
+Come risultato, puoi caricare una specifica versione di un modello con il parametro `revision`:
+
+```py
+>>> model = AutoModel.from_pretrained(
+... "julien-c/EsperBERTo-small", revision="v2.0.1" # nome di un tag, di un branch, o commit hash
+...
) +``` + +Anche i file possono essere modificati facilmente in un repository ed è possibile visualizzare la cronologia dei commit e le differenze: + +![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) + +## Configurazione + +Prima di condividere un modello nell'Hub, hai bisogno delle tue credenziali di Hugging Face. Se hai accesso ad un terminale, esegui il seguente comando nell'ambiente virtuale in cui è installata la libreria 🤗 Transformers. Questo memorizzerà il tuo token di accesso nella cartella cache di Hugging Face (di default `~/.cache/`): + +```bash +huggingface-cli login +``` + +Se stai usando un notebook come Jupyter o Colaboratory, assicurati di avere la libreria [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) installata. Questa libreria ti permette di interagire in maniera programmatica con l'Hub. + +```bash +pip install huggingface_hub +``` + +Utilizza `notebook_login` per accedere all'Hub, e segui il link [qui](https://huggingface.co/settings/token) per generare un token con cui effettuare il login: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Converti un modello per tutti i framework + +Per assicurarti che il tuo modello possa essere utilizzato da persone che lavorano con un framework differente, ti raccomandiamo di convertire e caricare il tuo modello sia con i checkpoint di PyTorch che con quelli di TensorFlow. Anche se è possibile caricare il modello da un framework diverso, se si salta questo passaggio, il caricamento sarà più lento perché 🤗 Transformers ha bisogno di convertire i checkpoint al momento. + +Convertire un checkpoint per un altro framework è semplice. Assicurati di avere PyTorch e TensorFlow installati (vedi [qui](installation) per le istruzioni d'installazione), e poi trova il modello specifico per il tuo compito nell'altro framework. + + + +Specifica `from_tf=True` per convertire un checkpoint da TensorFlow a PyTorch: + +```py +>>> pt_model = DistilBertForSequenceClassification.from_pretrained( +... "path/verso/il-nome-magnifico-che-hai-scelto", from_tf=True +... ) +>>> pt_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") +``` + + +Specifica `from_pt=True` per convertire un checkpoint da PyTorch a TensorFlow: + +```py +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained( +... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True +... ) +``` + +Poi puoi salvare il tuo nuovo modello in TensorFlow con il suo nuovo checkpoint: + +```py +>>> tf_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") +``` + + +Se un modello è disponibile in Flax, puoi anche convertire un checkpoint da PyTorch a Flax: + +```py +>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( +... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True +... ) +``` + + + +## Condividi un modello durante il training + + + + + +Condividere un modello nell'Hub è tanto semplice quanto aggiungere un parametro extra o un callback. Ricorda dal [tutorial sul fine-tuning](training), la classe [`TrainingArguments`] è dove specifichi gli iperparametri e le opzioni addizionali per l'allenamento. Una di queste opzioni di training include l'abilità di condividere direttamente un modello nell'Hub. 
Imposta `push_to_hub=True` in [`TrainingArguments`]: + +```py +>>> training_args = TrainingArguments(output_dir="il-mio-bellissimo-modello", push_to_hub=True) +``` + +Passa gli argomenti per il training come di consueto al [`Trainer`]: + +```py +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=small_train_dataset, +... eval_dataset=small_eval_dataset, +... compute_metrics=compute_metrics, +... ) +``` + +Dopo aver effettuato il fine-tuning del tuo modello, chiama [`~transformers.Trainer.push_to_hub`] sul [`Trainer`] per condividere il modello allenato nell'Hub. 🤗 Transformers aggiungerà in modo automatico persino gli iperparametri, i risultati del training e le versioni del framework alla scheda del tuo modello (model card, in inglese)! + +```py +>>> trainer.push_to_hub() +``` + + +Condividi un modello nell'Hub con [`PushToHubCallback`]. Nella funzione [`PushToHubCallback`], aggiungi: + +- Una directory di output per il tuo modello. +- Un tokenizer. +- L'`hub_model_id`, che è il tuo username sull'Hub e il nome del modello. + +```py +>>> from transformers import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="./il_path_dove_salvare_il_tuo_modello", +... tokenizer=tokenizer, +... hub_model_id="il-tuo-username/il-mio-bellissimo-modello", +... ) +``` + +Aggiungi il callback a [`fit`](https://keras.io/api/models/model_training_apis/), e 🤗 Transformers caricherà il modello allenato nell'Hub: + +```py +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) +``` + + + +## Utilizzare la funzione `push_to_hub` + +Puoi anche chiamare `push_to_hub` direttamente sul tuo modello per caricarlo nell'Hub. + +Specifica il nome del tuo modello in `push_to_hub`: + +```py +>>> pt_model.push_to_hub("il-mio-bellissimo-modello") +``` + +Questo crea un repository sotto il proprio username con il nome del modello `il-mio-bellissimo-modello`. Ora chiunque può caricare il tuo modello con la funzione `from_pretrained`: + +```py +>>> from transformers import AutoModel + +>>> model = AutoModel.from_pretrained("il-tuo-username/il-mio-bellissimo-modello") +``` + +Se fai parte di un'organizzazione e vuoi invece condividere un modello sotto il nome dell'organizzazione, aggiungi il parametro `organization`: + +```py +>>> pt_model.push_to_hub("il-mio-bellissimo-modello", organization="la-mia-fantastica-org") +``` + +La funzione `push_to_hub` può essere anche utilizzata per aggiungere altri file al repository del modello. Per esempio, aggiungi un tokenizer ad un repository di un modello: + +```py +>>> tokenizer.push_to_hub("il-mio-bellissimo-modello") +``` + +O magari potresti voler aggiungere la versione di TensorFlow del tuo modello PyTorch a cui hai fatto fine-tuning: + +```py +>>> tf_model.push_to_hub("il-mio-bellissimo-modello") +``` + +Ora quando navighi nel tuo profilo Hugging Face, dovresti vedere il tuo repository del modello appena creato. Premendo sulla scheda **Files** vengono visualizzati tutti i file caricati nel repository. + +Per maggiori dettagli su come creare e caricare file ad un repository, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/how-to-upstream). + +## Carica un modello utilizzando l'interfaccia web + +Chi preferisce un approccio senza codice può caricare un modello tramite l'interfaccia web dell'hub. 
Visita [huggingface.co/new](https://huggingface.co/new) per creare un nuovo repository: + +![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) + +Da qui, aggiungi alcune informazioni sul tuo modello: + +- Seleziona il/la **owner** del repository. Puoi essere te o qualunque organizzazione di cui fai parte. +- Scegli un nome per il tuo modello, il quale sarà anche il nome del repository. +- Scegli se il tuo modello è pubblico o privato. +- Specifica la licenza utilizzata per il tuo modello. + +Ora premi sulla scheda **Files** e premi sul pulsante **Add file** per caricare un nuovo file al tuo repository. Trascina poi un file per caricarlo e aggiungere un messaggio di commit. + +![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) + +## Aggiungi una scheda del modello + +Per assicurarti che chiunque possa comprendere le abilità, limitazioni, i potenziali bias e le considerazioni etiche del tuo modello, per favore aggiungi una scheda del modello (model card, in inglese) al tuo repository. La scheda del modello è definita nel file `README.md`. Puoi aggiungere una scheda del modello: + +* Creando manualmente e caricando un file `README.md`. +* Premendo sul pulsante **Edit model card** nel repository del tuo modello. + +Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/it/model_sharing.mdx b/docs/source/it/model_sharing.mdx deleted file mode 100644 index 87ba2b5b3421..000000000000 --- a/docs/source/it/model_sharing.mdx +++ /dev/null @@ -1,234 +0,0 @@ - - -# Condividi un modello - -Gli ultimi due tutorial ti hanno mostrato come puoi fare fine-tuning di un modello con PyTorch, Keras e 🤗 Accelerate per configurazioni distribuite. Il prossimo passo è quello di condividere il tuo modello con la community! In Hugging Face, crediamo nella condivisione della conoscenza e delle risorse in modo da democratizzare l'intelligenza artificiale per chiunque. Ti incoraggiamo a considerare di condividere il tuo modello con la community per aiutare altre persone a risparmiare tempo e risorse. - -In questo tutorial, imparerai due metodi per la condivisione di un modello trained o fine-tuned nel [Model Hub](https://huggingface.co/models): - -- Condividi in modo programmatico i tuoi file nell'Hub. -- Trascina i tuoi file nell'Hub mediante interfaccia grafica. - - - - - -Per condividere un modello con la community, hai bisogno di un account su [huggingface.co](https://huggingface.co/join). Puoi anche unirti ad un'organizzazione esistente o crearne una nuova. - - - -## Caratteristiche dei repository - -Ogni repository nel Model Hub si comporta come un tipico repository di GitHub. I nostri repository offrono il versionamento, la cronologia dei commit, e la possibilità di visualizzare le differenze. - -Il versionamento all'interno del Model Hub è basato su git e [git-lfs](https://git-lfs.github.com/). In altre parole, puoi trattare un modello come un unico repository, consentendo un maggiore controllo degli accessi e maggiore scalabilità. 
Il controllo delle versioni consente *revisions*, un metodo per appuntare una versione specifica di un modello con un hash di commit, un tag o un branch. - -Come risultato, puoi caricare una specifica versione di un modello con il parametro `revision`: - -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # nome di un tag, di un branch, o commit hash -... ) -``` - -Anche i file possono essere modificati facilmente in un repository ed è possibile visualizzare la cronologia dei commit e le differenze: - -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) - -## Configurazione - -Prima di condividere un modello nell'Hub, hai bisogno delle tue credenziali di Hugging Face. Se hai accesso ad un terminale, esegui il seguente comando nell'ambiente virtuale in cui è installata la libreria 🤗 Transformers. Questo memorizzerà il tuo token di accesso nella cartella cache di Hugging Face (di default `~/.cache/`): - -```bash -huggingface-cli login -``` - -Se stai usando un notebook come Jupyter o Colaboratory, assicurati di avere la libreria [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) installata. Questa libreria ti permette di interagire in maniera programmatica con l'Hub. - -```bash -pip install huggingface_hub -``` - -Utilizza `notebook_login` per accedere all'Hub, e segui il link [qui](https://huggingface.co/settings/token) per generare un token con cui effettuare il login: - -```py ->>> from huggingface_hub import notebook_login - ->>> notebook_login() -``` - -## Converti un modello per tutti i framework - -Per assicurarti che il tuo modello possa essere utilizzato da persone che lavorano con un framework differente, ti raccomandiamo di convertire e caricare il tuo modello sia con i checkpoint di PyTorch che con quelli di TensorFlow. Anche se è possibile caricare il modello da un framework diverso, se si salta questo passaggio, il caricamento sarà più lento perché 🤗 Transformers ha bisogno di convertire i checkpoint al momento. - -Convertire un checkpoint per un altro framework è semplice. Assicurati di avere PyTorch e TensorFlow installati (vedi [qui](installation) per le istruzioni d'installazione), e poi trova il modello specifico per il tuo compito nell'altro framework. - - - -Specifica `from_tf=True` per convertire un checkpoint da TensorFlow a PyTorch: - -```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained( -... "path/verso/il-nome-magnifico-che-hai-scelto", from_tf=True -... ) ->>> pt_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") -``` - - -Specifica `from_pt=True` per convertire un checkpoint da PyTorch a TensorFlow: - -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained( -... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True -... ) -``` - -Poi puoi salvare il tuo nuovo modello in TensorFlow con il suo nuovo checkpoint: - -```py ->>> tf_model.save_pretrained("path/verso/il-nome-magnifico-che-hai-scelto") -``` - - -Se un modello è disponibile in Flax, puoi anche convertire un checkpoint da PyTorch a Flax: - -```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/verso/il-nome-magnifico-che-hai-scelto", from_pt=True -... ) -``` - - - -## Condividi un modello durante il training - - - - - -Condividere un modello nell'Hub è tanto semplice quanto aggiungere un parametro extra o un callback. 
Ricorda dal [tutorial sul fine-tuning](training), la classe [`TrainingArguments`] è dove specifichi gli iperparametri e le opzioni addizionali per l'allenamento. Una di queste opzioni di training include l'abilità di condividere direttamente un modello nell'Hub. Imposta `push_to_hub=True` in [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="il-mio-bellissimo-modello", push_to_hub=True) -``` - -Passa gli argomenti per il training come di consueto al [`Trainer`]: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Dopo aver effettuato il fine-tuning del tuo modello, chiama [`~transformers.Trainer.push_to_hub`] sul [`Trainer`] per condividere il modello allenato nell'Hub. 🤗 Transformers aggiungerà in modo automatico persino gli iperparametri, i risultati del training e le versioni del framework alla scheda del tuo modello (model card, in inglese)! - -```py ->>> trainer.push_to_hub() -``` - - -Condividi un modello nell'Hub con [`PushToHubCallback`]. Nella funzione [`PushToHubCallback`], aggiungi: - -- Una directory di output per il tuo modello. -- Un tokenizer. -- L'`hub_model_id`, che è il tuo username sull'Hub e il nome del modello. - -```py ->>> from transformers.keras.callbacks import PushToHubCallback - ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./il_path_dove_salvare_il_tuo_modello", -... tokenizer=tokenizer, -... hub_model_id="il-tuo-username/il-mio-bellissimo-modello", -... ) -``` - -Aggiungi il callback a [`fit`](https://keras.io/api/models/model_training_apis/), e 🤗 Transformers caricherà il modello allenato nell'Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - - -## Utilizzare la funzione `push_to_hub` - -Puoi anche chiamare `push_to_hub` direttamente sul tuo modello per caricarlo nell'Hub. - -Specifica il nome del tuo modello in `push_to_hub`: - -```py ->>> pt_model.push_to_hub("il-mio-bellissimo-modello") -``` - -Questo crea un repository sotto il proprio username con il nome del modello `il-mio-bellissimo-modello`. Ora chiunque può caricare il tuo modello con la funzione `from_pretrained`: - -```py ->>> from transformers import AutoModel - ->>> model = AutoModel.from_pretrained("il-tuo-username/il-mio-bellissimo-modello") -``` - -Se fai parte di un'organizzazione e vuoi invece condividere un modello sotto il nome dell'organizzazione, aggiungi il parametro `organization`: - -```py ->>> pt_model.push_to_hub("il-mio-bellissimo-modello", organization="la-mia-fantastica-org") -``` - -La funzione `push_to_hub` può essere anche utilizzata per aggiungere altri file al repository del modello. Per esempio, aggiungi un tokenizer ad un repository di un modello: - -```py ->>> tokenizer.push_to_hub("il-mio-bellissimo-modello") -``` - -O magari potresti voler aggiungere la versione di TensorFlow del tuo modello PyTorch a cui hai fatto fine-tuning: - -```py ->>> tf_model.push_to_hub("il-mio-bellissimo-modello") -``` - -Ora quando navighi nel tuo profilo Hugging Face, dovresti vedere il tuo repository del modello appena creato. Premendo sulla scheda **Files** vengono visualizzati tutti i file caricati nel repository. - -Per maggiori dettagli su come creare e caricare file ad un repository, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/how-to-upstream). 
- -## Carica un modello utilizzando l'interfaccia web - -Chi preferisce un approccio senza codice può caricare un modello tramite l'interfaccia web dell'hub. Visita [huggingface.co/new](https://huggingface.co/new) per creare un nuovo repository: - -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) - -Da qui, aggiungi alcune informazioni sul tuo modello: - -- Seleziona il/la **owner** del repository. Puoi essere te o qualunque organizzazione di cui fai parte. -- Scegli un nome per il tuo modello, il quale sarà anche il nome del repository. -- Scegli se il tuo modello è pubblico o privato. -- Specifica la licenza utilizzata per il tuo modello. - -Ora premi sulla scheda **Files** e premi sul pulsante **Add file** per caricare un nuovo file al tuo repository. Trascina poi un file per caricarlo e aggiungere un messaggio di commit. - -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) - -## Aggiungi una scheda del modello - -Per assicurarti che chiunque possa comprendere le abilità, limitazioni, i potenziali bias e le considerazioni etiche del tuo modello, per favore aggiungi una scheda del modello (model card, in inglese) al tuo repository. La scheda del modello è definita nel file `README.md`. Puoi aggiungere una scheda del modello: - -* Creando manualmente e caricando un file `README.md`. -* Premendo sul pulsante **Edit model card** nel repository del tuo modello. - -Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/it/multilingual.md b/docs/source/it/multilingual.md new file mode 100644 index 000000000000..889c620ab29d --- /dev/null +++ b/docs/source/it/multilingual.md @@ -0,0 +1,178 @@ + + +# Modelli multilingue per l'inferenza + +[[open-in-colab]] + +Ci sono diversi modelli multilingue in 🤗 Transformers, e il loro utilizzo per l'inferenza differisce da quello dei modelli monolingua. Non *tutti* gli utilizzi dei modelli multilingue sono però diversi. Alcuni modelli, come [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), possono essere usati come un modello monolingua. Questa guida ti mostrerà come utilizzare modelli multilingue che utilizzano un modo diverso per fare l'inferenza. + +## XLM + +XLM ha dieci diversi checkpoint, di cui solo uno è monolingua. I nove checkpoint rimanenti possono essere suddivisi in due categorie: i checkpoint che utilizzano i language embeddings e quelli che non li utilizzano. 
+ +### XLM con language embeddings + +I seguenti modelli XLM utilizzano gli embeddings linguistici per specificare la lingua utilizzata per l'inferenza: + +- `xlm-mlm-ende-1024` (Modellazione mascherata del linguaggio (Masked language modeling, in inglese), Inglese-Tedesco) +- `xlm-mlm-enfr-1024` (Modellazione mascherata del linguaggio, Inglese-Francese) +- `xlm-mlm-enro-1024` (Modellazione mascherata del linguaggio, Inglese-Rumeno) +- `xlm-mlm-xnli15-1024` (Modellazione mascherata del linguaggio, lingue XNLI) +- `xlm-mlm-tlm-xnli15-1024` (Modellazione mascherata del linguaggio + traduzione, lingue XNLI) +- `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese) +- `xlm-clm-ende-1024` (Modellazione causale del linguaggio, Inglese-Tedesco) + +Gli embeddings linguistici sono rappresentati come un tensore delle stesse dimensioni dell' `input_ids` passato al modello. I valori in questi tensori dipendono dal linguaggio usato e sono identificati dagli attributi `lang2id` e `id2lang` del tokenizer. + +In questo esempio, carica il checkpoint `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +L'attributo `lang2id` del tokenizer mostra il linguaggio del modello e il suo ids: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +Poi, crea un esempio di input: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +Imposta l'id del linguaggio a `"en"` e usalo per definire il language embedding. Il language embedding è un tensore riempito con `0` perché questo è il language id per l'inglese. Questo tensore dovrebbe avere la stessa dimensione di `input_ids`. + +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +Adesso puoi inserire `input_ids` e language embedding nel modello: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +Lo script [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) può generare testo tramite i language embeddings usando i checkpoints `xlm-clm`. + +### XLM senza language embeddings + +I seguenti modelli XLM non richiedono l'utilizzo dei language embeddings per fare inferenza: + +- `xlm-mlm-17-1280` (Modellazione mascherata del linguaggio, 17 lingue) +- `xlm-mlm-100-1280` (Modellazione mascherata del linguaggio, 100 lingue) + +Questi modelli sono utilizzati per rappresentazioni generiche di frasi, a differenza dei precedenti checkpoints XML. + +## BERT + +Il seguente modello BERT può essere usato per compiti multilingue: + +- `bert-base-multilingual-uncased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 102 lingue) +- `bert-base-multilingual-cased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 104 lingue) + +Questi modelli non richiedono language embeddings per fare inferenza. Riescono ad identificare il linguaggio dal contesto e inferire di conseguenza. 
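+Per esempio, uno sketch minimo (a puro scopo illustrativo) con la `pipeline` di fill-mask: lo stesso checkpoint multilingue gestisce frasi in lingue diverse senza language embeddings:
+
+```py
+>>> from transformers import pipeline
+
+>>> # La lingua viene dedotta dal contesto della frase
+>>> unmasker = pipeline("fill-mask", model="bert-base-multilingual-cased")
+
+>>> # Stesso modello, lingue diverse
+>>> predictions_en = unmasker("Paris is the [MASK] of France.")
+>>> predictions_it = unmasker("Roma è la [MASK] d'Italia.")
+```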
+ +## XLM-RoBERTa + +Il seguente modello XLM-RoBERTa può essere usato per compiti multilingue: + +- `xlm-roberta-base` (Modellazione mascherata del linguaggio, 100 lingue) +- `xlm-roberta-large` (Modellazione mascherata del linguaggio, 100 lingue) + +XLM-RoBERTa è stato addestrato su 2.5TB di dati CommonCrawl appena creati e puliti in 100 lingue. Offre notevoli vantaggi rispetto ai modelli multilingue rilasciati in precedenza, come mBERT o XLM, in compiti come la classificazione, l'etichettatura delle sequenze e la risposta alle domande. + +## M2M100 + +Il seguente modello M2M100 può essere usato per compiti multilingue: + +- `facebook/m2m100_418M` (Traduzione) +- `facebook/m2m100_1.2B` (Traduzione) + +In questo esempio, carica il checkpoint `facebook/m2m100_418M` per tradurre dal cinese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +Applica il tokenizer al testo: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +Il seguente modello MBart può essere usato per compiti multilingue: + +- `facebook/mbart-large-50-one-to-many-mmt` (Traduzione automatica multilingue uno-a-molti, 50 lingue) +- `facebook/mbart-large-50-many-to-many-mmt` (Traduzione automatica multilingue molti-a-molti, 50 lingue) +- `facebook/mbart-large-50-many-to-one-mmt` (Traduzione automatica multilingue molti-a-uno, 50 lingue) +- `facebook/mbart-large-50` (Traduzione multilingue, 50 lingue) +- `facebook/mbart-large-cc25` + +In questo esempio, carica il checkpoint `facebook/mbart-large-50-many-to-many-mmt` per tradurre dal finlandese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +Applica il tokenizer sul testo: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +MBart forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. 
Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +Se stai usando il checkpoint `facebook/mbart-large-50-many-to-one-mmt`, non hai bisogno di forzare l'id della lingua obiettivo come primo token generato altrimenti l'uso è lo stesso. \ No newline at end of file diff --git a/docs/source/it/multilingual.mdx b/docs/source/it/multilingual.mdx deleted file mode 100644 index a8ccec97d0a7..000000000000 --- a/docs/source/it/multilingual.mdx +++ /dev/null @@ -1,174 +0,0 @@ - - -# Modelli multilingue per l'inferenza - -[[open-in-colab]] - -Ci sono diversi modelli multilingue in 🤗 Transformers, e il loro utilizzo per l'inferenza differisce da quello dei modelli monolingua. Non *tutti* gli utilizzi dei modelli multilingue sono però diversi. Alcuni modelli, come [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), possono essere usati come un modello monolingua. Questa guida ti mostrerà come utilizzare modelli multilingue che utilizzano un modo diverso per fare l'inferenza. - -## XLM - -XLM ha dieci diversi checkpoint, di cui solo uno è monolingua. I nove checkpoint rimanenti possono essere suddivisi in due categorie: i checkpoint che utilizzano i language embeddings e quelli che non li utilizzano. - -### XLM con language embeddings - -I seguenti modelli XLM utilizzano gli embeddings linguistici per specificare la lingua utilizzata per l'inferenza: - -- `xlm-mlm-ende-1024` (Modellazione mascherata del linguaggio (Masked language modeling, in inglese), Inglese-Tedesco) -- `xlm-mlm-enfr-1024` (Modellazione mascherata del linguaggio, Inglese-Francese) -- `xlm-mlm-enro-1024` (Modellazione mascherata del linguaggio, Inglese-Rumeno) -- `xlm-mlm-xnli15-1024` (Modellazione mascherata del linguaggio, lingue XNLI) -- `xlm-mlm-tlm-xnli15-1024` (Modellazione mascherata del linguaggio + traduzione, lingue XNLI) -- `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese) -- `xlm-clm-ende-1024` (Modellazione causale del linguaggio, Inglese-Tedesco) - -Gli embeddings linguistici sono rappresentati come un tensore delle stesse dimensioni dell' `input_ids` passato al modello. I valori in questi tensori dipendono dal linguaggio usato e sono identificati dagli attributi `lang2id` e `id2lang` del tokenizer. - -In questo esempio, carica il checkpoint `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") -``` - -L'attributo `lang2id` del tokenizer mostra il linguaggio del modello e il suo ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -Poi, crea un esempio di input: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Imposta l'id del linguaggio a `"en"` e usalo per definire il language embedding. Il language embedding è un tensore riempito con `0` perché questo è il language id per l'inglese. Questo tensore dovrebbe avere la stessa dimensione di `input_ids`. 
- -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Adesso puoi inserire `input_ids` e language embedding nel modello: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -Lo script [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) può generare testo tramite i language embeddings usando i checkpoints `xlm-clm`. - -### XLM senza language embeddings - -I seguenti modelli XLM non richiedono l'utilizzo dei language embeddings per fare inferenza: - -- `xlm-mlm-17-1280` (Modellazione mascherata del linguaggio, 17 lingue) -- `xlm-mlm-100-1280` (Modellazione mascherata del linguaggio, 100 lingue) - -Questi modelli sono utilizzati per rappresentazioni generiche di frasi, a differenza dei precedenti checkpoints XML. - -## BERT - -Il seguente modello BERT può essere usato per compiti multilingue: - -- `bert-base-multilingual-uncased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 102 lingue) -- `bert-base-multilingual-cased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 104 lingue) - -Questi modelli non richiedono language embeddings per fare inferenza. Riescono ad identificare il linguaggio dal contesto e inferire di conseguenza. - -## XLM-RoBERTa - -Il seguente modello XLM-RoBERTa può essere usato per compiti multilingue: - -- `xlm-roberta-base` (Modellazione mascherata del linguaggio, 100 lingue) -- `xlm-roberta-large` (Modellazione mascherata del linguaggio, 100 lingue) - -XLM-RoBERTa è stato addestrato su 2.5TB di dati CommonCrawl appena creati e puliti in 100 lingue. Offre notevoli vantaggi rispetto ai modelli multilingue rilasciati in precedenza, come mBERT o XLM, in compiti come la classificazione, l'etichettatura delle sequenze e la risposta alle domande. - -## M2M100 - -Il seguente modello M2M100 può essere usato per compiti multilingue: - -- `facebook/m2m100_418M` (Traduzione) -- `facebook/m2m100_1.2B` (Traduzione) - -In questo esempio, carica il checkpoint `facebook/m2m100_418M` per tradurre dal cinese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Applica il tokenizer al testo: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' 
-``` - -## MBart - -Il seguente modello MBart può essere usato per compiti multilingue: - -- `facebook/mbart-large-50-one-to-many-mmt` (Traduzione automatica multilingue uno-a-molti, 50 lingue) -- `facebook/mbart-large-50-many-to-many-mmt` (Traduzione automatica multilingue molti-a-molti, 50 lingue) -- `facebook/mbart-large-50-many-to-one-mmt` (Traduzione automatica multilingue molti-a-uno, 50 lingue) -- `facebook/mbart-large-50` (Traduzione multilingue, 50 lingue) -- `facebook/mbart-large-cc25` - -In questo esempio, carica il checkpoint `facebook/mbart-large-50-many-to-many-mmt` per tradurre dal finlandese all'inglese. Puoi impostare la lingua di partenza nel tokenizer: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." - ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Applica il tokenizer sul testo: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart forza l'id della lingua obiettivo come primo token generato per tradurre nella lingua obiettivo. Imposta il parametro `forced_bos_token_id` a `en` nel metodo `generate` per tradurre in inglese: - -```py ->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." -``` - -Se stai usando il checkpoint `facebook/mbart-large-50-many-to-one-mmt`, non hai bisogno di forzare l'id della lingua obiettivo come primo token generato altrimenti l'uso è lo stesso. \ No newline at end of file diff --git a/docs/source/it/perf_hardware.md b/docs/source/it/perf_hardware.md new file mode 100644 index 000000000000..a579362e2b1b --- /dev/null +++ b/docs/source/it/perf_hardware.md @@ -0,0 +1,155 @@ + + + +# Hardware ottimizzato per l'addestramento + +L'hardware utilizzato per eseguire l'addestramento del modello e l'inferenza può avere un grande effetto sulle prestazioni. Per un analisi approfondita delle GPUs, assicurati di dare un'occhiata all'eccellente [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/) di Tim Dettmer. + +Diamo un'occhiata ad alcuni consigli pratici per la configurazione della GPU. + +## GPU +Quando si addestrano modelli più grandi ci sono essenzialmente tre opzioni: +- GPUs piu' grandi +- Piu' GPUs +- Piu' CPU e piu' NVMe (scaricato da [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) + +Iniziamo dal caso in cui ci sia una singola GPU. + +### Potenza e Raffreddamento + +Se hai acquistato una costosa GPU di fascia alta, assicurati di darle la potenza corretta e un raffreddamento sufficiente. + +**Potenza**: + +Alcune schede GPU consumer di fascia alta hanno 2 e talvolta 3 prese di alimentazione PCI-E a 8 pin. Assicurati di avere tanti cavi PCI-E a 8 pin indipendenti da 12 V collegati alla scheda quante sono le prese. Non utilizzare le 2 fessure a un'estremità dello stesso cavo (noto anche come cavo a spirale). Cioè se hai 2 prese sulla GPU, vuoi 2 cavi PCI-E a 8 pin che vanno dall'alimentatore alla scheda e non uno che abbia 2 connettori PCI-E a 8 pin alla fine! 
In caso contrario, non otterrai tutte le prestazioni ufficiali. + +Ciascun cavo di alimentazione PCI-E a 8 pin deve essere collegato a una guida da 12 V sul lato dell'alimentatore e può fornire fino a 150 W di potenza. + +Alcune altre schede possono utilizzare connettori PCI-E a 12 pin e questi possono fornire fino a 500-600 W di potenza. + +Le schede di fascia bassa possono utilizzare connettori a 6 pin, che forniscono fino a 75 W di potenza. + +Inoltre vuoi un alimentatore (PSU) di fascia alta che abbia una tensione stabile. Alcuni PSU di qualità inferiore potrebbero non fornire alla scheda la tensione stabile di cui ha bisogno per funzionare al massimo. + +E ovviamente l'alimentatore deve avere abbastanza Watt inutilizzati per alimentare la scheda. + +**Raffreddamento**: + +Quando una GPU si surriscalda, inizierà a rallentare e non fornirà le prestazioni mssimali e potrebbe persino spegnersi se diventasse troppo calda. + +È difficile dire l'esatta temperatura migliore a cui aspirare quando una GPU è molto caricata, ma probabilmente qualsiasi cosa al di sotto di +80°C va bene, ma più bassa è meglio - forse 70-75°C è un intervallo eccellente in cui trovarsi. È probabile che il rallentamento inizi a circa 84-90°C. Ma oltre alla limitazione delle prestazioni, una temperatura molto elevata prolungata è probabile che riduca la durata di una GPU. + +Diamo quindi un'occhiata a uno degli aspetti più importanti quando si hanno più GPU: la connettività. + +### Connettività multi-GPU + +Se utilizzi più GPU, il modo in cui le schede sono interconnesse può avere un enorme impatto sul tempo totale di allenamento. Se le GPU si trovano sullo stesso nodo fisico, puoi eseguire: + +``` +nvidia-smi topo -m +``` + +e ti dirà come sono interconnesse le GPU. Su una macchina con doppia GPU e collegata a NVLink, molto probabilmente vedrai qualcosa del tipo: + +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X NV2 0-23 N/A +GPU1 NV2 X 0-23 N/A +``` + +su una macchina diversa senza NVLink potremmo vedere: + +``` + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A +``` + +Il rapporto include questa legenda: + +``` + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +``` + +Quindi il primo rapporto `NV2` ci dice che le GPU sono interconnesse con 2 NVLinks e nel secondo report `PHB` abbiamo una tipica configurazione PCIe+Bridge a livello di consumatore. + +Controlla che tipo di connettività hai sulla tua configurazione. Alcuni di questi renderanno la comunicazione tra le carte più veloce (es. NVLink), altri più lenta (es. PHB). + +A seconda del tipo di soluzione di scalabilità utilizzata, la velocità di connettività potrebbe avere un impatto maggiore o minore. Se le GPU devono sincronizzarsi raramente, come in DDP, l'impatto di una connessione più lenta sarà meno significativo. Se le GPU devono scambiarsi messaggi spesso, come in ZeRO-DP, una connettività più veloce diventa estremamente importante per ottenere un addestramento più veloce. 
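+Come verifica complementare, uno sketch minimo (ipotizzando PyTorch installato e almeno due GPU visibili) per controllare da codice se l'accesso peer-to-peer diretto tra due schede è disponibile:
+
+```py
+import torch
+
+# Elenca le GPU visibili
+for i in range(torch.cuda.device_count()):
+    print(i, torch.cuda.get_device_name(i))
+
+# True se la GPU 0 può accedere direttamente alla memoria della GPU 1
+# (tipicamente il caso con NVLink o P2P su PCIe)
+if torch.cuda.device_count() >= 2:
+    print(torch.cuda.can_device_access_peer(0, 1))
+```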
+ +#### NVlink + +[NVLink](https://en.wikipedia.org/wiki/NVLink) è un collegamento di comunicazione a corto raggio multilinea seriale basato su cavo sviluppato da Nvidia. + +Ogni nuova generazione fornisce una larghezza di banda più veloce, ad es. ecco una citazione da [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): + +> Third-Generation NVLink® +> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, +> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four +> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth +> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. +> (Note that 3-Way and 4-Way SLI configurations are not supported.) + +Quindi più `X` si ottiene nel rapporto di `NVX` nell'output di `nvidia-smi topo -m`, meglio è. La generazione dipenderà dall'architettura della tua GPU. + +Confrontiamo l'esecuzione di un training del modello di linguaggio gpt2 su un piccolo campione di wikitext + +I risultati sono: + + +| NVlink | Time | +| ----- | ---: | +| Y | 101s | +| N | 131s | + + +Puoi vedere che NVLink completa l'addestramento circa il 23% più velocemente. Nel secondo benchmark utilizziamo `NCCL_P2P_DISABLE=1` per dire alle GPU di non utilizzare NVLink. + +Ecco il codice benchmark completo e gli output: + +```bash +# DDP w/ NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +# DDP w/o NVLink + +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 + +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} +``` + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) +Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` \ No newline at end of file diff --git a/docs/source/it/perf_hardware.mdx b/docs/source/it/perf_hardware.mdx deleted file mode 100644 index 0bfdbc8fe686..000000000000 --- a/docs/source/it/perf_hardware.mdx +++ /dev/null @@ -1,151 +0,0 @@ - - - -# Hardware ottimizzato per l'addestramento - -L'hardware utilizzato per eseguire l'addestramento del modello e l'inferenza può avere un grande effetto sulle prestazioni. Per un analisi approfondita delle GPUs, assicurati di dare un'occhiata all'eccellente [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/) di Tim Dettmer. - -Diamo un'occhiata ad alcuni consigli pratici per la configurazione della GPU. - -## GPU -Quando si addestrano modelli più grandi ci sono essenzialmente tre opzioni: -- GPUs piu' grandi -- Piu' GPUs -- Piu' CPU e piu' NVMe (scaricato da [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) - -Iniziamo dal caso in cui ci sia una singola GPU. 
- -### Potenza e Raffreddamento - -Se hai acquistato una costosa GPU di fascia alta, assicurati di darle la potenza corretta e un raffreddamento sufficiente. - -**Potenza**: - -Alcune schede GPU consumer di fascia alta hanno 2 e talvolta 3 prese di alimentazione PCI-E a 8 pin. Assicurati di avere tanti cavi PCI-E a 8 pin indipendenti da 12 V collegati alla scheda quante sono le prese. Non utilizzare le 2 fessure a un'estremità dello stesso cavo (noto anche come cavo a spirale). Cioè se hai 2 prese sulla GPU, vuoi 2 cavi PCI-E a 8 pin che vanno dall'alimentatore alla scheda e non uno che abbia 2 connettori PCI-E a 8 pin alla fine! In caso contrario, non otterrai tutte le prestazioni ufficiali. - -Ciascun cavo di alimentazione PCI-E a 8 pin deve essere collegato a una guida da 12 V sul lato dell'alimentatore e può fornire fino a 150 W di potenza. - -Alcune altre schede possono utilizzare connettori PCI-E a 12 pin e questi possono fornire fino a 500-600 W di potenza. - -Le schede di fascia bassa possono utilizzare connettori a 6 pin, che forniscono fino a 75 W di potenza. - -Inoltre vuoi un alimentatore (PSU) di fascia alta che abbia una tensione stabile. Alcuni PSU di qualità inferiore potrebbero non fornire alla scheda la tensione stabile di cui ha bisogno per funzionare al massimo. - -E ovviamente l'alimentatore deve avere abbastanza Watt inutilizzati per alimentare la scheda. - -**Raffreddamento**: - -Quando una GPU si surriscalda, inizierà a rallentare e non fornirà le prestazioni mssimali e potrebbe persino spegnersi se diventasse troppo calda. - -È difficile dire l'esatta temperatura migliore a cui aspirare quando una GPU è molto caricata, ma probabilmente qualsiasi cosa al di sotto di +80°C va bene, ma più bassa è meglio - forse 70-75°C è un intervallo eccellente in cui trovarsi. È probabile che il rallentamento inizi a circa 84-90°C. Ma oltre alla limitazione delle prestazioni, una temperatura molto elevata prolungata è probabile che riduca la durata di una GPU. - -Diamo quindi un'occhiata a uno degli aspetti più importanti quando si hanno più GPU: la connettività. - -### Connettività multi-GPU - -Se utilizzi più GPU, il modo in cui le schede sono interconnesse può avere un enorme impatto sul tempo totale di allenamento. Se le GPU si trovano sullo stesso nodo fisico, puoi eseguire: - -``` -nvidia-smi topo -m -``` - -e ti dirà come sono interconnesse le GPU. Su una macchina con doppia GPU e collegata a NVLink, molto probabilmente vedrai qualcosa del tipo: - -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X NV2 0-23 N/A -GPU1 NV2 X 0-23 N/A -``` - -su una macchina diversa senza NVLink potremmo vedere: - -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X PHB 0-11 N/A -GPU1 PHB X 0-11 N/A -``` - -Il rapporto include questa legenda: - -``` - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) - PIX = Connection traversing at most a single PCIe bridge - NV# = Connection traversing a bonded set of # NVLinks -``` - -Quindi il primo rapporto `NV2` ci dice che le GPU sono interconnesse con 2 NVLinks e nel secondo report `PHB` abbiamo una tipica configurazione PCIe+Bridge a livello di consumatore. 
- -Controlla che tipo di connettività hai sulla tua configurazione. Alcuni di questi renderanno la comunicazione tra le carte più veloce (es. NVLink), altri più lenta (es. PHB). - -A seconda del tipo di soluzione di scalabilità utilizzata, la velocità di connettività potrebbe avere un impatto maggiore o minore. Se le GPU devono sincronizzarsi raramente, come in DDP, l'impatto di una connessione più lenta sarà meno significativo. Se le GPU devono scambiarsi messaggi spesso, come in ZeRO-DP, una connettività più veloce diventa estremamente importante per ottenere un addestramento più veloce. - -#### NVlink - -[NVLink](https://en.wikipedia.org/wiki/NVLink) è un collegamento di comunicazione a corto raggio multilinea seriale basato su cavo sviluppato da Nvidia. - -Ogni nuova generazione fornisce una larghezza di banda più veloce, ad es. ecco una citazione da [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): - -> Third-Generation NVLink® -> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, -> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four -> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth -> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. -> (Note that 3-Way and 4-Way SLI configurations are not supported.) - -Quindi più `X` si ottiene nel rapporto di `NVX` nell'output di `nvidia-smi topo -m`, meglio è. La generazione dipenderà dall'architettura della tua GPU. - -Confrontiamo l'esecuzione di un training del modello di linguaggio gpt2 su un piccolo campione di wikitext - -I risultati sono: - - -| NVlink | Time | -| ----- | ---: | -| Y | 101s | -| N | 131s | - - -Puoi vedere che NVLink completa l'addestramento circa il 23% più velocemente. Nel secondo benchmark utilizziamo `NCCL_P2P_DISABLE=1` per dire alle GPU di non utilizzare NVLink. - -Ecco il codice benchmark completo e gli output: - -```bash -# DDP w/ NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` \ No newline at end of file diff --git a/docs/source/it/perf_infer_cpu.md b/docs/source/it/perf_infer_cpu.md new file mode 100644 index 000000000000..baae51a5a978 --- /dev/null +++ b/docs/source/it/perf_infer_cpu.md @@ -0,0 +1,79 @@ + + +# Inferenza Efficiente su CPU + +Questa guida si concentra sull'inferenza di modelli di grandi dimensioni in modo efficiente sulla CPU. 
+ +## `BetterTransformer` per inferenza più rapida + +Abbiamo integrato di recente `BetterTransformer` per fare inferenza più rapidamente con modelli per testi, immagini e audio. Visualizza la documentazione sull'integrazione [qui](https://huggingface.co/docs/optimum/bettertransformer/overview) per maggiori dettagli. + +## PyTorch JIT-mode (TorchScript) + +TorchScript è un modo di creare modelli serializzabili e ottimizzabili da codice PyTorch. Ogni programmma TorchScript può esere salvato da un processo Python e caricato in un processo dove non ci sono dipendenze Python. +Comparandolo con l'eager mode di default, jit mode in PyTorch normalmente fornisce prestazioni migliori per l'inferenza del modello da parte di metodologie di ottimizzazione come la operator fusion. + +Per una prima introduzione a TorchScript, vedi la Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules). + +### IPEX Graph Optimization con JIT-mode + +Intel® Extension per PyTorch fornnisce ulteriori ottimizzazioni in jit mode per i modelli della serie Transformers. Consigliamo vivamente agli utenti di usufruire dei vantaggi di Intel® Extension per PyTorch con jit mode. Alcuni operator patterns usati fequentemente dai modelli Transformers models sono già supportati in Intel® Extension per PyTorch con jit mode fusions. Questi fusion patterns come Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. sono abilitati e hanno buone performance. I benefici della fusion è fornito agli utenti in modo trasparente. In base alle analisi, il ~70% dei problemi più popolari in NLP question-answering, text-classification, and token-classification possono avere benefici sulle performance grazie ai fusion patterns sia per Float32 precision che per BFloat16 Mixed precision. + +Vedi maggiori informazioni per [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html). + +#### Installazione di IPEX + +I rilasci di IPEX seguono PyTorch, verifica i vari approcci per [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/). + +### Utilizzo del JIT-mode + +Per abilitare JIT-mode in Trainer per evaluation e prediction, devi aggiungere `jit_mode_eval` negli argomenti di Trainer. + + + +per PyTorch >= 1.14.0. JIT-mode potrebe giovare a qualsiasi modello di prediction e evaluaion visto che il dict input è supportato in jit.trace + +per PyTorch < 1.14.0. JIT-mode potrebbe giovare ai modelli il cui ordine dei parametri corrisponde all'ordine delle tuple in ingresso in jit.trace, come i modelli per question-answering. +Nel caso in cui l'ordine dei parametri seguenti non corrisponda all'ordine delle tuple in ingresso in jit.trace, come nei modelli di text-classification, jit.trace fallirà e lo cattureremo con una eccezione al fine di renderlo un fallback. Il logging è usato per notificare gli utenti. + + + +Trovi un esempo con caso d'uso in [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + +- Inference using jit mode on CPU: + +
+```bash
+python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--jit_mode_eval
+```
+
+- Inference with IPEX using jit mode on CPU:
+
+```bash
+python run_qa.py \
+--model_name_or_path csarron/bert-base-uncased-squad-v1 \
+--dataset_name squad \
+--do_eval \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/ \
+--no_cuda \
+--use_ipex \
+--jit_mode_eval
+```
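+
+Le stesse opzioni si possono impostare anche da codice con `TrainingArguments`; segue uno sketch puramente indicativo dei flag corrispondenti, da passare poi al `Trainer` insieme a modello e dataset:
+
+```py
+from transformers import TrainingArguments
+
+# Equivalente programmatico dei flag da riga di comando mostrati sopra
+training_args = TrainingArguments(
+    output_dir="/tmp/",
+    do_eval=True,
+    no_cuda=True,        # esegue su CPU
+    jit_mode_eval=True,  # usa torch.jit.trace per evaluation e prediction
+    use_ipex=True,       # ottimizzazioni IPEX (richiede intel_extension_for_pytorch)
+)
+```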
diff --git a/docs/source/it/perf_infer_gpu_many.md b/docs/source/it/perf_infer_gpu_many.md new file mode 100644 index 000000000000..b78cb34e1d6d --- /dev/null +++ b/docs/source/it/perf_infer_gpu_many.md @@ -0,0 +1,28 @@ + + +# Inferenza Efficiente su GPU Multiple + +Questo documento contiene informazioni su come fare inferenza in maniera efficiente su GPU multiple. + + + +Nota: Un setup con GPU multiple può utilizzare la maggior parte delle strategie descritte nella [sezione con GPU singola](./perf_infer_gpu_one). Tuttavia, è necessario conoscere delle tecniche semplici che possono essere utilizzate per un risultato migliore. + + + +## `BetterTransformer` per inferenza più rapida + +Abbiamo recentemente integrato `BetterTransformer` per inferenza più rapida su multi-GPU per modelli su testo, immagini e audio. Controlla il documento con queste integrazioni [qui](https://huggingface.co/docs/optimum/bettertransformer/overview) per maggiori dettagli. diff --git a/docs/source/it/perf_infer_gpu_one.md b/docs/source/it/perf_infer_gpu_one.md new file mode 100644 index 000000000000..16f77b3b1f31 --- /dev/null +++ b/docs/source/it/perf_infer_gpu_one.md @@ -0,0 +1,112 @@ + + +# Inferenza efficiente su GPU singola + +Questo documento sarà presto completato con informazioni su come effetture l'inferenza su una singola GPU. Nel frattempo è possibile consultare [la guida per l'addestramento su una singola GPU](perf_train_gpu_one) e [la guida per l'inferenza su CPU](perf_infer_cpu). + +## `BetterTransformer` per l'inferenza più veloce + +Abbiamo recentemente integrato `BetterTransformer` per velocizzare l'inferenza su GPU per modelli di testo, immagini e audio. Per maggiori dettagli, consultare la documentazione su questa integrazione [qui](https://huggingface.co/docs/optimum/bettertransformer/overview). + +## Integrazione di `bitsandbytes` per Int8 mixed-precision matrix decomposition + + + +Nota che questa funzione può essere utilizzata anche nelle configurazioni multi GPU. + + + +Dal paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), noi supportiamo l'integrazione di Hugging Face per tutti i modelli dell'Hub con poche righe di codice. +Il metodo `nn.Linear` riduce la dimensione di 2 per i pesi `float16` e `bfloat16` e di 4 per i pesi `float32`, con un impatto quasi nullo sulla qualità, operando sugli outlier in half-precision. + +![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) + +Il metodo Int8 mixed-precision matrix decomposition funziona separando la moltiplicazione tra matrici in due flussi: (1) una matrice di flusso di outlier di caratteristiche sistematiche moltiplicata in fp16, (2) in flusso regolare di moltiplicazione di matrici int8 (99,9%). Con questo metodo, è possibile effettutare inferenza int8 per modelli molto grandi senza degrado predittivo. +Per maggiori dettagli sul metodo, consultare il [paper](https://arxiv.org/abs/2208.07339) o il nostro [blogpost sull'integrazione](https://huggingface.co/blog/hf-bitsandbytes-integration). + +![MixedInt8.gif](https://cdn-uploads.huggingface.co/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif) + +Nota che è necessaria una GPU per eseguire modelli di tipo mixed-8bit, poiché i kernel sono stati compilati solo per le GPU. 
Prima di utilizzare questa funzione, assicurarsi di disporre di memoria sufficiente sulla GPU per memorizzare un quarto del modello (o la metà se i pesi del modello sono in mezza precisione). +Di seguito sono riportate alcune note per aiutarvi a utilizzare questo modulo, oppure seguite le dimostrazioni su [Google colab](#colab-demos). + +### Requisiti + +- Se si dispone di `bitsandbytes<0.37.0`, assicurarsi di eseguire su GPU NVIDIA che supportano tensor cores a 8 bit (Turing, Ampere o architetture più recenti - ad esempio T4, RTX20s, RTX30s, A40-A100). Per `bitsandbytes>=0.37.0`, tutte le GPU dovrebbero essere supportate. +- Installare la versione corretta di `bitsandbytes` eseguendo: +`pip install bitsandbytes>=0.31.5`. +- Installare `accelerate` eseguendo: +`pip install accelerate>=0.12.0` + +### Esecuzione di modelli mixed-Int8 - configurazione per singola GPU + +Dopo aver installato le librerie necessarie, ecco come caricare il tuo modello mixed 8-bit: + +```py +from transformers import AutoModelForCausalLM + +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` + +Per la generazione di testo, si consiglia di: + +* utilizzare il metodo `generate()` del modello invece della funzione `pipeline()`. Sebbene l'inferenza sia possibile con la funzione `pipeline()`, essa non è ottimizzata per i modelli mixed-8bit e sarà più lenta rispetto all'uso del metodo `generate()`. Inoltre, alcune strategie di campionamento, come il campionamento nucleus, non sono supportate dalla funzione `pipeline()` per i modelli mixed-8bit. +* collocare tutti gli input sullo stesso dispositivo del modello. + +Ecco un semplice esempio: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "bigscience/bloom-2b5" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +text = "Hello, my llama is cute" +inputs = tokenizer(text, return_tensors="pt").to("cuda") +generated_ids = model_8bit.generate(**inputs) +outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +``` + + +### Esecuzione di modelli mixed-8bit - configurazione multi GPU + +Il modello mixed-8bit si carica su più GPU nel modo seguente (stesso comando della configurazione a GPU singola): +```py +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` +Puoi controllare la quantità di memoria GPU da allocare su ogni GPU usando `accelerate`, tramite l'argomento `max_memory`, come segue: + +```py +max_memory_mapping = {0: "1GB", 1: "2GB"} +model_name = "bigscience/bloom-3b" +model_8bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping +) +``` +In questo esempio, la prima GPU utilizzerà 1 GB di memoria e la seconda 2 GB. + +### Colab demos + +Con questo metodo è possibile fare inferenza su modelli che prima non era possibile eseguire su Google Colab. +Guardate la demo per l'esecuzione di T5-11b (42GB in fp32)!
Utilizzo la quantizzazione a 8 bit su Google Colab: + +[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) + +Oppure questa demo di BLOOM-3B: + +[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) \ No newline at end of file diff --git a/docs/source/it/perf_infer_special.md b/docs/source/it/perf_infer_special.md new file mode 100644 index 000000000000..3e2c0a5c288e --- /dev/null +++ b/docs/source/it/perf_infer_special.md @@ -0,0 +1,18 @@ + + +# Inferenza su Hardware Specializzato + +Questo documento sarà completato a breve con la documentazione per l'inferenza su hardware specializzato. Nel frattempo puoi controllare [la guida per fare inferenza sulle CPU](perf_infer_cpu). \ No newline at end of file diff --git a/docs/source/it/perf_train_cpu.md b/docs/source/it/perf_train_cpu.md new file mode 100644 index 000000000000..c91baeec8800 --- /dev/null +++ b/docs/source/it/perf_train_cpu.md @@ -0,0 +1,69 @@ + + +# Addestramento efficiente su CPU + +Questa guida si concentra su come addestrare in maniera efficiente grandi modelli su CPU. + +## Mixed precision con IPEX + +IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni. + +Il tipo di dati a bassa precisione BFloat16 è stato supportato in modo nativo su 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) con AVX512 e sarà supportata dalla prossima generazione di Intel® Xeon® Scalable Processors con Intel® Advanced Matrix Extensions (Intel® AMX) instruction set con prestazioni ulteriormente migliorate. L'Auto Mixed Precision per il backende della CPU è stato abilitato da PyTorch-1.10. allo stesso tempo, il supporto di Auto Mixed Precision con BFloat16 per CPU e l'ottimizzazione degli operatori BFloat16 è stata abilitata in modo massiccio in Intel® Extension per PyTorch, and parzialmente aggiornato al branch master di PyTorch. Gli utenti possono ottenere prestazioni migliori ed users experience con IPEX Auto Mixed Precision.. + +Vedi informazioni più dettagliate su [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). + +### Installazione di IPEX: + +Il rilascio di IPEX segue quello di PyTorch, da installare via pip: + +| PyTorch Version | IPEX version | +| :---------------: | :----------: | +| 1.13 | 1.13.0+cpu | +| 1.12 | 1.12.300+cpu | +| 1.11 | 1.11.200+cpu | +| 1.10 | 1.10.100+cpu | + +```bash +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +Vedi altri approcci per [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). + +### Utilizzo nel Trainer + +Per abilitare la auto mixed precision con IPEX in Trainer, l'utende dovrebbe aggiungere `use_ipex`, `bf16` e `no_cuda` negli argomenti del comando di addestramento. 
+ +Vedi un esempio di un caso d'uso [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering). + +- Addestramento con IPEX usando l'auto mixed precision BF16 su CPU: + +
 python run_qa.py \
+--model_name_or_path bert-base-uncased \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 12 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/debug_squad/ \
+--use_ipex \
+--bf16 --no_cuda
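+
+Se invece dello script si usa direttamente il [`Trainer`] in Python, quello che segue è uno schizzo indicativo degli stessi flag espressi come [`TrainingArguments`]; modello, tokenizer e dataset sono omessi e vanno definiti a parte:
+
+```py
+from transformers import TrainingArguments, Trainer
+
+training_args = TrainingArguments(
+    output_dir="/tmp/debug_squad/",
+    use_ipex=True,   # abilita le ottimizzazioni IPEX
+    bf16=True,       # auto mixed precision con BFloat16
+    no_cuda=True,    # forza l'addestramento su CPU
+    per_device_train_batch_size=12,
+    learning_rate=3e-5,
+    num_train_epochs=2,
+)
+
+# trainer = Trainer(model=model, args=training_args, train_dataset=..., eval_dataset=...)
+# trainer.train()
+```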
+ +### Esempi pratici + +Blog: [Accelerating PyTorch Transformers with Intel Sapphire Rapids](https://huggingface.co/blog/intel-sapphire-rapids) diff --git a/docs/source/it/perf_train_cpu_many.md b/docs/source/it/perf_train_cpu_many.md new file mode 100644 index 000000000000..2fb10ee4ba49 --- /dev/null +++ b/docs/source/it/perf_train_cpu_many.md @@ -0,0 +1,141 @@ + + +# Addestramento effciente su multiple CPU + +Quando l'addestramento su una singola CPU è troppo lento, possiamo usare CPU multiple. Quasta guida si concentra su DDP basato su PyTorch abilitando l'addetramento distribuito su CPU in maniera efficiente. + +## Intel® oneCCL Bindings per PyTorch + +[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) è una libreria per l'addestramento efficiente del deep learning in distribuito e implementa collettivi come allreduce, allgather, alltoall. Per maggiori informazioni su oneCCL, fai riferimento a [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) e [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). + +Il modulo `oneccl_bindings_for_pytorch` (`torch_ccl` precedentemente alla versione 1.12) implementa PyTorch C10D ProcessGroup API e può essere caricato dinamicamente com external ProcessGroup e funziona solo su piattaforma Linux al momento. + +Qui trovi informazioni più dettagliate per [oneccl_bind_pt](https://github.com/intel/torch-ccl). + +### Intel® oneCCL Bindings per l'installazione PyTorch: + +I file wheel sono disponibili per le seguenti versioni di Python: + +| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | +| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | +| 1.13.0 | | √ | √ | √ | √ | +| 1.12.100 | | √ | √ | √ | √ | +| 1.12.0 | | √ | √ | √ | √ | +| 1.11.0 | | √ | √ | √ | √ | +| 1.10.0 | √ | √ | √ | √ | | + +```bash +pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +dove `{pytorch_version}` deve essere la tua versione di PyTorch, per l'stanza 1.13.0. +Verifica altri approcci per [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). +Le versioni di oneCCL e PyTorch devono combaciare. + + + +oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) +PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 + + + +## Intel® MPI library + +Usa questa implementazione basata su standard MPI per fornire una architettura flessibile, efficiente, scalabile su cluster per Intel®. Questo componente è parte di Intel® oneAPI HPC Toolkit. + +oneccl_bindings_for_pytorch è installato insieme al set di strumenti MPI. Necessità di reperire l'ambiente prima di utilizzarlo. + +per Intel® oneCCL >= 1.12.0 + +```bash +oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") +source $oneccl_bindings_for_pytorch_path/env/setvars.sh +``` + +per Intel® oneCCL con versione < 1.12.0 + +```bash +torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") +source $torch_ccl_path/env/setvars.sh +``` + +#### Installazione IPEX: + +IPEX fornisce ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16; puoi fare riferimento a [single CPU section](./perf_train_cpu). 
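+
+A scopo illustrativo, lo schizzo seguente (basato sul pattern documentato in [oneccl_bind_pt](https://github.com/intel/torch-ccl)) mostra che è il semplice import di `oneccl_bindings_for_pytorch` a registrare il backend `ccl` in `torch.distributed`; è, in sostanza, ciò su cui si appoggia il Trainer quando si passa `--ddp_backend ccl`. Le variabili PMI_RANK/PMI_SIZE sono normalmente impostate da mpirun; i valori di ripiego indicati servono solo per una prova locale a processo singolo:
+
+```py
+import os
+
+import torch.distributed as dist
+import oneccl_bindings_for_pytorch  # noqa: F401 - l'import registra il backend "ccl"
+
+# valori di ripiego, solo per una prova locale a processo singolo
+os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
+os.environ.setdefault("MASTER_PORT", "29500")
+rank = int(os.environ.get("PMI_RANK", 0))
+world_size = int(os.environ.get("PMI_SIZE", 1))
+
+dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)
+print(f"processo {dist.get_rank()} di {dist.get_world_size()} inizializzato con backend ccl")
+```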
+ +Il seguente "Utilizzo in Trainer" prende come esempio mpirun nella libreria Intel® MPI. + +## Utilizzo in Trainer + +Per abilitare l'addestramento distribuito multi CPU nel Trainer con il ccl backend, gli utenti devono aggiungere **`--ddp_backend ccl`** negli argomenti del comando. + +Vediamo un esempio per il [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) + +Il seguente comando abilita due processi sul nodo Xeon, con un processo in esecuzione per ogni socket. Le variabili OMP_NUM_THREADS/CCL_WORKER_COUNT possono essere impostate per una prestazione ottimale. + +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=127.0.0.1 + mpirun -n 2 -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex +``` + +Il seguente comando abilita l'addestramento per un totale di quattro processi su due Xeon (node0 e node1, prendendo node0 come processo principale), ppn (processes per node) è impostato a 2, on un processo in esecuzione per ogni socket. Le variabili OMP_NUM_THREADS/CCL_WORKER_COUNT possono essere impostate per una prestazione ottimale. + +In node0, è necessario creare un file di configurazione che contenga gli indirizzi IP di ciascun nodo (per esempio hostfile) e passare il percorso del file di configurazione come parametro. + +```shell script + cat hostfile + xxx.xxx.xxx.xxx #node0 ip + xxx.xxx.xxx.xxx #node1 ip +``` + +A questo punto, esegui il seguente comando nel nodo0 e **4DDP** sarà abilitato in node0 e node1 con BF16 auto mixed precision: + +```shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip + mpirun -f hostfile -n 4 -ppn 2 \ + -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex \ + --bf16 +``` diff --git a/docs/source/it/perf_train_special.md b/docs/source/it/perf_train_special.md new file mode 100644 index 000000000000..afe05d801d66 --- /dev/null +++ b/docs/source/it/perf_train_special.md @@ -0,0 +1,24 @@ + + +# Addestramento su Hardware Specializzato + + + + Nota: Molte delle strategie introdotte nella [sezione sulla GPU singola](perf_train_gpu_one) (come mixed precision training o gradient accumulation) e [sezione multi-GPU](perf_train_gpu_many) sono generiche e applicabili all'addestramento di modelli in generale quindi assicurati di dargli un'occhiata prima di immergerti in questa sezione. + + + +Questo documento sarà presto completato con informazioni su come effettuare la formazione su hardware specializzato. 
diff --git a/docs/source/it/perf_train_tpu.md b/docs/source/it/perf_train_tpu.md new file mode 100644 index 000000000000..663f83c499cb --- /dev/null +++ b/docs/source/it/perf_train_tpu.md @@ -0,0 +1,24 @@ + + +# Addestramento su TPU + + + + Nota: Molte delle strategie introdotte nella [sezione sulla GPU singola](perf_train_gpu_one) (come mixed precision training o gradient accumulation) e [sezione multi-GPU](perf_train_gpu_many) sono generiche e applicabili all'addestramento di modelli in generale quindi assicurati di dargli un'occhiata prima di immergerti in questa sezione. + + + +Questo documento sarà presto completato con informazioni su come effettuare la formazione su TPU. diff --git a/docs/source/it/pipeline_tutorial.md b/docs/source/it/pipeline_tutorial.md new file mode 100644 index 000000000000..056282b164ed --- /dev/null +++ b/docs/source/it/pipeline_tutorial.md @@ -0,0 +1,152 @@ + + +# Pipeline per l'inferenza + +La [`pipeline`] rende semplice usare qualsiasi modello dal [Model Hub](https://huggingface.co/models) per fare inferenza su diversi compiti come generazione del testo, segmentazione di immagini e classificazione di audio. Anche se non hai esperienza con una modalità specifica o non comprendi bene il codice che alimenta i modelli, è comunque possibile utilizzarli con l'opzione [`pipeline`]! Questa esercitazione ti insegnerà a: + +* Usare una [`pipeline`] per fare inferenza. +* Usare uno specifico tokenizer o modello. +* Usare una [`pipeline`] per compiti che riguardano audio e video. + + + +Dai un'occhiata alla documentazione di [`pipeline`] per una lista completa dei compiti supportati. + + + +## Utilizzo della Pipeline + +Nonostante ogni compito abbia una [`pipeline`] associata, è più semplice utilizzare l'astrazione generica della [`pipeline`] che contiene tutte quelle specifiche per ogni mansione. La [`pipeline`] carica automaticamente un modello predefinito e un tokenizer in grado di fare inferenza per il tuo compito. + +1. Inizia creando una [`pipeline`] e specificando il compito su cui fare inferenza: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation") +``` + +2. Inserisci il testo in input nella [`pipeline`]: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] +``` + +Se hai più di un input, inseriscilo in una lista: + +```py +>>> generator( +... [ +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", +... ] +... ) # doctest: +SKIP +``` + +Qualsiasi parametro addizionale per il tuo compito può essere incluso nella [`pipeline`]. La mansione `text-generation` ha un metodo [`~generation.GenerationMixin.generate`] con diversi parametri per controllare l'output. Ad esempio, se desideri generare più di un output, utilizza il parametro `num_return_sequences`: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", +... num_return_sequences=2, +... 
) # doctest: +SKIP +``` + +### Scegliere modello e tokenizer + +La [`pipeline`] accetta qualsiasi modello dal [Model Hub](https://huggingface.co/models). Ci sono tag nel Model Hub che consentono di filtrare i modelli per attività. Una volta che avrai scelto il modello appropriato, caricalo usando la corrispondente classe `AutoModelFor` e [`AutoTokenizer`]. Ad esempio, carica la classe [`AutoModelForCausalLM`] per un compito di causal language modeling: + +```py +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +``` + +Crea una [`pipeline`] per il tuo compito, specificando il modello e il tokenizer che hai caricato: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) +``` + +Inserisci il testo di input nella [`pipeline`] per generare del testo: + +```py +>>> generator( +... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" +... ) # doctest: +SKIP +[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] +``` + +## Audio pipeline + +La flessibilità della [`pipeline`] fa si che possa essere estesa ad attività sugli audio. + +Per esempio, classifichiamo le emozioni in questo clip audio: + +```py +>>> from datasets import load_dataset +>>> import torch + +>>> torch.manual_seed(42) # doctest: +IGNORE_RESULT +>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +>>> audio_file = ds[0]["audio"]["path"] +``` + +Trova un modello per la [classificazione audio](https://huggingface.co/models?pipeline_tag=audio-classification) sul Model Hub per eseguire un compito di riconoscimento automatico delle emozioni e caricalo nella [`pipeline`]: + +```py +>>> from transformers import pipeline + +>>> audio_classifier = pipeline( +... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +Inserisci il file audio nella [`pipeline`]: + +```py +>>> preds = audio_classifier(audio_file) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] +``` + +## Vision pipeline + +Infine, usare la [`pipeline`] per le attività sulle immagini è praticamente la stessa cosa. + +Specifica la tua attività e inserisci l'immagine nel classificatore. L'immagine può essere sia un link che un percorso sul tuo pc in locale. Per esempio, quale specie di gatto è raffigurata qui sotto? + +![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) + +```py +>>> from transformers import pipeline + +>>> vision_classifier = pipeline(task="image-classification") +>>> preds = vision_classifier( +... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... 
) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +``` diff --git a/docs/source/it/pipeline_tutorial.mdx b/docs/source/it/pipeline_tutorial.mdx deleted file mode 100644 index 64347164505f..000000000000 --- a/docs/source/it/pipeline_tutorial.mdx +++ /dev/null @@ -1,148 +0,0 @@ - - -# Pipeline per l'inferenza - -La [`pipeline`] rende semplice usare qualsiasi modello dal [Model Hub](https://huggingface.co/models) per fare inferenza su diversi compiti come generazione del testo, segmentazione di immagini e classificazione di audio. Anche se non hai esperienza con una modalità specifica o non comprendi bene il codice che alimenta i modelli, è comunque possibile utilizzarli con l'opzione [`pipeline`]! Questa esercitazione ti insegnerà a: - -* Usare una [`pipeline`] per fare inferenza. -* Usare uno specifico tokenizer o modello. -* Usare una [`pipeline`] per compiti che riguardano audio e video. - - - -Dai un'occhiata alla documentazione di [`pipeline`] per una lista completa dei compiti supportati. - - - -## Utilizzo della Pipeline - -Nonostante ogni compito abbia una [`pipeline`] associata, è più semplice utilizzare l'astrazione generica della [`pipeline`] che contiene tutte quelle specifiche per ogni mansione. La [`pipeline`] carica automaticamente un modello predefinito e un tokenizer in grado di fare inferenza per il tuo compito. - -1. Inizia creando una [`pipeline`] e specificando il compito su cui fare inferenza: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation") -``` - -2. Inserisci il testo in input nella [`pipeline`]: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}] -``` - -Se hai più di un input, inseriscilo in una lista: - -```py ->>> generator( -... [ -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne", -... ] -... ) # doctest: +SKIP -``` - -Qualsiasi parametro addizionale per il tuo compito può essere incluso nella [`pipeline`]. La mansione `text-generation` ha un metodo [`~generation.GenerationMixin.generate`] con diversi parametri per controllare l'output. Ad esempio, se desideri generare più di un output, utilizza il parametro `num_return_sequences`: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone", -... num_return_sequences=2, -... ) # doctest: +SKIP -``` - -### Scegliere modello e tokenizer - -La [`pipeline`] accetta qualsiasi modello dal [Model Hub](https://huggingface.co/models). Ci sono tag nel Model Hub che consentono di filtrare i modelli per attività. Una volta che avrai scelto il modello appropriato, caricalo usando la corrispondente classe `AutoModelFor` e [`AutoTokenizer`]. 
Ad esempio, carica la classe [`AutoModelForCausalLM`] per un compito di causal language modeling: - -```py ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") -``` - -Crea una [`pipeline`] per il tuo compito, specificando il modello e il tokenizer che hai caricato: - -```py ->>> from transformers import pipeline - ->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer) -``` - -Inserisci il testo di input nella [`pipeline`] per generare del testo: - -```py ->>> generator( -... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone" -... ) # doctest: +SKIP -[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}] -``` - -## Audio pipeline - -La flessibilità della [`pipeline`] fa si che possa essere estesa ad attività sugli audio. - -Per esempio, classifichiamo le emozioni in questo clip audio: - -```py ->>> from datasets import load_dataset ->>> import torch - ->>> torch.manual_seed(42) # doctest: +IGNORE_RESULT ->>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") ->>> audio_file = ds[0]["audio"]["path"] -``` - -Trova un modello per la [classificazione audio](https://huggingface.co/models?pipeline_tag=audio-classification) sul Model Hub per eseguire un compito di riconoscimento automatico delle emozioni e caricalo nella [`pipeline`]: - -```py ->>> from transformers import pipeline - ->>> audio_classifier = pipeline( -... task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -Inserisci il file audio nella [`pipeline`]: - -```py ->>> preds = audio_classifier(audio_file) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}] -``` - -## Vision pipeline - -Infine, usare la [`pipeline`] per le attività sulle immagini è praticamente la stessa cosa. - -Specifica la tua attività e inserisci l'immagine nel classificatore. L'immagine può essere sia un link che un percorso sul tuo pc in locale. Per esempio, quale specie di gatto è raffigurata qui sotto? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) - -```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(task="image-classification") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... 
) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] -``` diff --git a/docs/source/it/pr_checks.md b/docs/source/it/pr_checks.md new file mode 100644 index 000000000000..caa5fe32965b --- /dev/null +++ b/docs/source/it/pr_checks.md @@ -0,0 +1,135 @@ + + +# Controlli su una Pull Request + +Quando apri una pull request sui 🤗 Transformers, vengono eseguiti un discreto numero di controlli per assicurarsi che la patch che stai aggiungendo non stia rompendo qualcosa di esistente. Questi controlli sono di quattro tipi: +- test regolari +- costruzione della documentazione +- stile del codice e della documentazione +- coerenza generale del repository + +In questo documento, cercheremo di spiegare quali sono i vari controlli e le loro ragioni, oltre a spiegare come eseguire il debug locale se uno di essi fallisce sulla tua PR. + +Nota che tutti richiedono un'installazione dev: + +```bash +pip install transformers[dev] +``` + +o un'installazione modificabile: + +```bash +pip install -e .[dev] +``` + +all'interno del repo Transformers. + +## Tests + +Tutti i job che iniziano con `ci/circleci: run_tests_` eseguono parti della suite di test dei Transformers. Ognuno di questi job si concentra su una parte della libreria in un determinato ambiente: per esempio `ci/circleci: run_tests_pipelines_tf` esegue il test delle pipeline in un ambiente in cui è installato solo TensorFlow. + +Nota che per evitare di eseguire i test quando non ci sono cambiamenti reali nei moduli che si stanno testando, ogni volta viene eseguita solo una parte della suite di test: viene eseguita una utility per determinare le differenze nella libreria tra prima e dopo la PR (ciò che GitHub mostra nella scheda "Files changes") e sceglie i test che sono stati impattati dalla diff. Questa utility può essere eseguita localmente con: + +```bash +python utils/tests_fetcher.py +``` + +dalla root del repo Transformers. Di seguito ciò che farà: + +1. Controlla per ogni file nel diff se le modifiche sono nel codice o solo nei commenti o nelle docstrings. Vengono mantenuti solo i file con modifiche reali al codice. +2. Costruisce una mappa interna che fornisce per ogni file del codice sorgente della libreria tutti i file su cui ha un impatto ricorsivo. Si dice che il modulo A ha un impatto sul modulo B se il modulo B importa il modulo A. Per l'impatto ricorsivo, abbiamo bisogno di una catena di moduli che va dal modulo A al modulo B in cui ogni modulo importa il precedente. +3. Applica questa mappa ai file raccolti nel passaggio 1, si ottiene l'elenco dei file del modello interessati dalla PR. +4. Mappa ciascuno di questi file con i corrispondenti file di test e ottiene l'elenco dei test da eseguire. + +Quando esegui lo script in locale, dovresti ottenere la stampa dei risultati dei passi 1, 3 e 4 e quindi sapere quali test sono stati eseguiti. Lo script creerà anche un file chiamato `test_list.txt` che contiene l'elenco dei test da eseguire e che puoi eseguire localmente con il seguente comando: + +```bash +python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt) +``` + +Nel caso in cui qualcosa sia sfuggito, l'intera suite di test viene eseguita quotidianamente. 
+ +## Build della documentazione + +Il job `ci/circleci: build_doc` esegue una build della documentazione per assicurarsi che tutto sia a posto una volta che la PR è stata unita. Se questo passaggio fallisce, puoi controllare localmente entrando nella cartella `docs` del repo Transformers e digitare + +```bash +make html +``` + +Sphinx non è noto per i suoi messaggi di errore chiari, quindi potrebbe essere necessario che provi alcune cose per trovare davvero la fonte dell'errore. + +## Stile del codice e della documentazione + +La formattazione del codice viene applicata a tutti i file sorgenti, agli esempi e ai test usando `black` e `isort`. Abbiamo anche uno strumento personalizzato che si occupa della formattazione delle docstring e dei file `rst` (`utils/style_doc.py`), così come dell'ordine dei lazy imports eseguiti nei file `__init__.py` dei Transformers (`utils/custom_init_isort.py`). Tutto questo può essere lanciato eseguendo + +```bash +make style +``` + +I controlli della CI sono applicati all'interno del controllo `ci/circleci: check_code_quality`. Esegue anche `flake8`, che dà un'occhiata di base al codice e si lamenta se trova una variabile non definita o non utilizzata. Per eseguire questo controllo localmente, usare + +```bash +make quality +``` + +Questa operazione può richiedere molto tempo, quindi per eseguire la stessa operazione solo sui file modificati nel branch corrente, eseguire + +```bash +make fixup +``` + +Quest'ultimo comando eseguirà anche tutti i controlli aggiuntivi per la consistenza del repository. Diamogli un'occhiata. + +## Coerenza del repository + +All'interno sono raggruppati tutti i test per assicurarsi che la tua PR lasci il repository in un buono stato ed è eseguito dal controllo `ci/circleci: check_repository_consistency`. Puoi eseguire localmente questo controllo eseguendo quanto segue: + +```bash +make repo-consistency +``` + +Questo verifica che: + +- Tutti gli oggetti aggiunti all'init sono documentati (eseguito da `utils/check_repo.py`) +- Tutti i file `__init__.py` hanno lo stesso contenuto nelle loro due sezioni (eseguito da `utils/check_inits.py`) +- Tutto il codice identificato come copia da un altro modulo è coerente con l'originale (eseguito da `utils/check_copies.py`) +- Le traduzioni dei README e l'indice della documentazione hanno lo stesso elenco di modelli del README principale (eseguito da `utils/check_copies.py`) +- Le tabelle autogenerate nella documentazione sono aggiornate (eseguito da `utils/check_table.py`) +- La libreria ha tutti gli oggetti disponibili anche se non tutte le dipendenze opzionali sono installate (eseguito da `utils/check_dummies.py`) + +Se questo controllo fallisce, le prime due voci richiedono una correzione manuale, mentre le ultime quattro possono essere corrette automaticamente per te eseguendo il comando + +```bash +make fix-copies +``` + +Ulteriori controlli riguardano le PR che aggiungono nuovi modelli, principalmente che: + +- Tutti i modelli aggiunti sono in un Auto-mapping (eseguita da `utils/check_repo.py`) + +- Tutti i modelli sono testati correttamente (eseguito da `utils/check_repo.py`) + + \ No newline at end of file diff --git a/docs/source/it/preprocessing.md b/docs/source/it/preprocessing.md new file mode 100644 index 000000000000..94578dfe166b --- /dev/null +++ b/docs/source/it/preprocessing.md @@ -0,0 +1,491 @@ + + +# Preprocess + +[[open-in-colab]] + +Prima di poter usare i dati in un modello, bisogna processarli in un formato accettabile per quest'ultimo. 
Un modello non comprende il testo grezzo, le immagini o l'audio. Bisogna convertire questi input in numeri e assemblarli all'interno di tensori. In questa esercitazione, tu potrai: + +* Preprocessare dati testuali con un tokenizer. +* Preprocessare immagini o dati audio con un estrattore di caratteristiche. +* Preprocessare dati per attività multimodali mediante un processore. + +## NLP + + + +Lo strumento principale per processare dati testuali è un [tokenizer](main_classes/tokenizer). Un tokenizer inizia separando il testo in *tokens* secondo una serie di regole. I tokens sono convertiti in numeri, questi vengono utilizzati per costruire i tensori di input del modello. Anche altri input addizionali se richiesti dal modello vengono aggiunti dal tokenizer. + + + +Se stai pensando si utilizzare un modello preaddestrato, è importante utilizzare il tokenizer preaddestrato associato. Questo assicura che il testo sia separato allo stesso modo che nel corpus usato per l'addestramento, e venga usata la stessa mappatura tokens-to-index (solitamente indicato come il *vocabolario*) come nel preaddestramento. + + + +Iniziamo subito caricando un tokenizer preaddestrato con la classe [`AutoTokenizer`]. Questo scarica il *vocabolario* usato quando il modello è stato preaddestrato. + +### Tokenize + +Carica un tokenizer preaddestrato con [`AutoTokenizer.from_pretrained`]: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +``` + +Poi inserisci le tue frasi nel tokenizer: + +```py +>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") +>>> print(encoded_input) +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Il tokenizer restituisce un dizionario contenente tre oggetti importanti: + +* [input_ids](glossary#input-ids) sono gli indici che corrispondono ad ogni token nella frase. +* [attention_mask](glossary#attention-mask) indicata se un token deve essere elaborato o no. +* [token_type_ids](glossary#token-type-ids) identifica a quale sequenza appartiene un token se è presente più di una sequenza. + +Si possono decodificare gli `input_ids` per farsi restituire l'input originale: + +```py +>>> tokenizer.decode(encoded_input["input_ids"]) +'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' +``` + +Come si può vedere, il tokenizer aggiunge due token speciali - `CLS` e `SEP` (classificatore e separatore) - alla frase. Non tutti i modelli hanno bisogno dei token speciali, ma se servono, il tokenizer li aggiungerà automaticamente. + +Se ci sono più frasi che vuoi processare, passale come una lista al tokenizer: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_inputs = tokenizer(batch_sentences) +>>> print(encoded_inputs) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]]} +``` + +### Pad + +Questo è un argomento importante. Quando processi un insieme di frasi potrebbero non avere tutte la stessa lunghezza. Questo è un problema perchè i tensori, in input del modello, devono avere dimensioni uniformi. Il padding è una strategia per assicurarsi che i tensori siano rettangolari aggiungendo uno speciale *padding token* alle frasi più corte. + +Imposta il parametro `padding` a `True` per imbottire le frasi più corte nel gruppo in modo che combacino con la massima lunghezza presente: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +Nota che il tokenizer aggiunge alle sequenze degli `0` perchè sono troppo corte! + +### Truncation + +L'altra faccia della medaglia è che avolte le sequenze possono essere troppo lunghe per essere gestite dal modello. In questo caso, avrai bisogno di troncare la sequenza per avere una lunghezza minore. + +Imposta il parametro `truncation` a `True` per troncare una sequenza alla massima lunghezza accettata dal modello: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +>>> print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} +``` + +### Costruire i tensori + +Infine, vuoi che il tokenizer restituisca i tensori prodotti dal modello. + +Imposta il parametro `return_tensors` su `pt` per PyTorch, o `tf` per TensorFlow: + +```py +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... 
] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +>>> print(encoded_input) +{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], + [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 0]])} +===PT-TF-SPLIT=== +>>> batch_sentences = [ +... "But what about second breakfast?", +... "Don't think he knows about second breakfast, Pip.", +... "What about elevensies?", +... ] +>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +>>> print(encoded_input) +{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy= +array([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], + [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]], dtype=int32)>, + 'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy= +array([[0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, + 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy= +array([[1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 0]], dtype=int32)>} +``` + +## Audio + +Gli input audio sono processati in modo differente rispetto al testo, ma l'obiettivo rimane lo stesso: creare sequenze numeriche che il modello può capire. Un [estrattore di caratteristiche](main_classes/feature_extractor) è progettato con lo scopo preciso di estrarre caratteristiche da immagini o dati audio grezzi e convertirli in tensori. Prima di iniziare, installa 🤗 Datasets per caricare un dataset audio e sperimentare: + +```bash +pip install datasets +``` + +Carica il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (vedi il 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) per avere maggiori dettagli su come caricare un dataset): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +``` + +Accedi al primo elemento della colonna `audio` per dare uno sguardo all'input. Richiamando la colonna `audio`, il file audio sarà caricato e ricampionato automaticamente: + +```py +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +Questo restituisce tre oggetti: + +* `array` è il segnale vocale caricato - e potenzialmente ricampionato - come vettore 1D. +* `path` è il percorso del file audio. +* `sampling_rate` si riferisce al numero di campioni del segnale vocale misurati al secondo. + +### Ricampionamento + +Per questo tutorial, puoi usare il modello [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Come puoi vedere dalla model card, il modello Wav2Vec2 è preaddestrato su un campionamento vocale a 16kHz. È importante che la frequenza di campionamento dei tuoi dati audio combaci con la frequenza di campionamento del dataset usato per preaddestrare il modello. Se la frequenza di campionamento dei tuoi dati non è uguale dovrai ricampionare i tuoi dati audio. + +Per esempio, il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) ha una frequenza di campionamento di 8000Hz (8kHz). Per utilizzare il modello Wav2Vec2 su questo dataset, alzala a 16kHz: + +```py +>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +>>> dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} +``` + +1.
Usa il metodo di 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) per alzare la frequenza di campionamento a 16kHz: + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +2. Carica il file audio: + +```py +>>> dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} +``` + +Come puoi notare, la `sampling_rate` adesso è 16kHz! + +### Feature extractor + +Il prossimo passo è caricare un estrattore di caratteristiche per normalizzare e fare padding sull'input. Quando applichiamo il padding sui dati testuali, uno `0` è aggiunto alle sequenze più brevi. La stessa idea si applica ai dati audio, l'estrattore di caratteristiche per gli audio aggiungerà uno `0` - interpretato come silenzio - agli `array`. + +Carica l'estrattore delle caratteristiche con [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Inserisci l' `array` audio nell'estrattore delle caratteristiche. Noi raccomandiamo sempre di aggiungere il parametro `sampling_rate` nell'estrattore delle caratteristiche per correggere meglio qualche errore, dovuto ai silenzi, che potrebbe verificarsi. + +```py +>>> audio_input = [dataset[0]["audio"]["array"]] +>>> feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} +``` + +### Pad e truncate + +Come per il tokenizer, puoi applicare le operazioni padding o truncation per manipolare sequenze di variabili a lotti. Dai uno sguaro alla lunghezza delle sequenze di questi due campioni audio: + +```py +>>> dataset[0]["audio"]["array"].shape +(173398,) + +>>> dataset[1]["audio"]["array"].shape +(106496,) +``` + +Come puoi vedere, il primo campione ha una sequenza più lunga del secondo. Crea una funzione che preprocesserà il dataset. Specifica una lunghezza massima del campione, e l'estrattore di features si occuperà di riempire o troncare la sequenza per coincidervi: + +```py +>>> def preprocess_function(examples): +... audio_arrays = [x["array"] for x in examples["audio"]] +... inputs = feature_extractor( +... audio_arrays, +... sampling_rate=16000, +... padding=True, +... max_length=100000, +... truncation=True, +... ) +... return inputs +``` + +Applica la funzione ai primi esempi nel dataset: + +```py +>>> processed_dataset = preprocess_function(dataset[:5]) +``` + +Adesso guarda la lunghezza dei campioni elaborati: + +```py +>>> processed_dataset["input_values"][0].shape +(100000,) + +>>> processed_dataset["input_values"][1].shape +(100000,) +``` + +La lunghezza dei campioni adesso coincide con la massima lunghezza impostata nelle funzione. + +## Vision + +Un estrattore di caratteristiche si può usare anche per processare immagini e per compiti di visione. Ancora una volta, l'obiettivo è convertire l'immagine grezza in un lotto di tensori come input. + +Carica il dataset [food101](https://huggingface.co/datasets/food101) per questa esercitazione. 
Usa il parametro `split` di 🤗 Datasets per caricare solo un piccolo campione dal dataset di addestramento poichè il set di dati è molto grande: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("food101", split="train[:100]") +``` + +Secondo passo, dai uno sguardo alle immagini usando la caratteristica [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) di 🤗 Datasets: + +```py +>>> dataset[0]["image"] +``` + +![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) + +### Feature extractor + +Carica l'estrattore di caratteristiche [`AutoFeatureExtractor.from_pretrained`]: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") +``` + +### Data augmentation + +Per le attività di visione, è usuale aggiungere alcuni tipi di data augmentation alle immagini come parte del preprocessing. Puoi aggiungere augmentations con qualsiasi libreria che preferisci, ma in questa esercitazione, userai il modulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) di torchvision. + +1. Normalizza l'immagine e usa [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) per concatenare alcune trasformazioni - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) e [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - insieme: + +```py +>>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor + +>>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) +>>> _transforms = Compose( +... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] +... ) +``` + +2. Il modello accetta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) come input. Questo valore è generato dall'estrattore di caratteristiche. Crea una funzione che genera `pixel_values` dai transforms: + +```py +>>> def transforms(examples): +... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] +... return examples +``` + +3. Poi utilizza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)per applicare al volo la trasformazione: + +```py +>>> dataset.set_transform(transforms) +``` + +4. 
Adesso quando accedi all'immagine, puoi notare che l'estrattore di caratteristiche ha aggiunto `pixel_values` allo schema di input: + +```py +>>> dataset[0]["image"] +{'image': , + 'label': 6, + 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], + [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], + [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], + ..., + [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], + [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], + [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], + + [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], + [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], + [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], + ..., + [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], + [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], + [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], + + [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], + [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], + [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], + ..., + [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], + [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], + [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} +``` + +Di seguito come si vede l'immagine dopo la fase di preprocessing. Come ci si aspetterebbe dalle trasformazioni applicate, l'immagine è stata ritagliata in modo casuale e le proprietà del colore sono diverse. + +```py +>>> import numpy as np +>>> import matplotlib.pyplot as plt + +>>> img = dataset[0]["pixel_values"] +>>> plt.imshow(img.permute(1, 2, 0)) +``` + +![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) + +## Multimodal + +Per attività multimodali userai una combinazione di tutto quello che hai imparato poco fa e applicherai le tue competenze alla comprensione automatica del parlato (Automatic Speech Recognition - ASR). Questo significa che avrai bisogno di: + +* Un estrattore delle caratteristiche per processare i dati audio. +* Il Tokenizer per processare i testi. 
+ +Ritorna sul datasere [LJ Speech](https://huggingface.co/datasets/lj_speech): + +```py +>>> from datasets import load_dataset + +>>> lj_speech = load_dataset("lj_speech", split="train") +``` + +Visto che sei interessato solo alle colonne `audio` e `text`, elimina tutte le altre: + +```py +>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +``` + +Adesso guarda le colonne `audio` e `text`: + +```py +>>> lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +>>> lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Ricorda dalla sezione precedente sull'elaborazione dei dati audio, tu dovresti sempre [ricampionare](preprocessing#audio) la frequenza di campionamento dei tuoi dati audio per farla coincidere con quella del dataset usato dal modello preaddestrato: + +```py +>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) +``` + +### Processor + +Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un processor con [`AutoProcessor.from_pretrained]: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") +``` + +1. Crea una funzione che processi i dati audio in `input_values`, e tokenizza il testo in `labels`. Questi sono i tuoi input per il modello: + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + +... return example +``` + +2. Applica la funzione `prepare_dataset` ad un campione: + +```py +>>> prepare_dataset(lj_speech[0]) +``` + +Nota che il processor ha aggiunto `input_values` e `labels`. La frequenza di campionamento è stata corretta riducendola a 16kHz. + +Fantastico, ora dovresti essere in grado di preelaborare i dati per qualsiasi modalità e persino di combinare modalità diverse! Nella prossima esercitazione, impareremo a mettere a punto un modello sui dati appena pre-elaborati. \ No newline at end of file diff --git a/docs/source/it/preprocessing.mdx b/docs/source/it/preprocessing.mdx deleted file mode 100644 index a57ff9df9151..000000000000 --- a/docs/source/it/preprocessing.mdx +++ /dev/null @@ -1,487 +0,0 @@ - - -# Preprocess - -[[open-in-colab]] - -Prima di poter usare i dati in un modello, bisogna processarli in un formato accettabile per quest'ultimo. Un modello non comprende il testo grezzo, le immagini o l'audio. Bisogna convertire questi input in numeri e assemblarli all'interno di tensori. In questa esercitazione, tu potrai: - -* Preprocessare dati testuali con un tokenizer. -* Preprocessare immagini o dati audio con un estrattore di caratteristiche. -* Preprocessare dati per attività multimodali mediante un processore. - -## NLP - - - -Lo strumento principale per processare dati testuali è un [tokenizer](main_classes/tokenizer). Un tokenizer inizia separando il testo in *tokens* secondo una serie di regole. I tokens sono convertiti in numeri, questi vengono utilizzati per costruire i tensori di input del modello. 
Anche altri input addizionali se richiesti dal modello vengono aggiunti dal tokenizer. - - - -Se stai pensando si utilizzare un modello preaddestrato, è importante utilizzare il tokenizer preaddestrato associato. Questo assicura che il testo sia separato allo stesso modo che nel corpus usato per l'addestramento, e venga usata la stessa mappatura tokens-to-index (solitamente indicato come il *vocabolario*) come nel preaddestramento. - - - -Iniziamo subito caricando un tokenizer preaddestrato con la classe [`AutoTokenizer`]. Questo scarica il *vocabolario* usato quando il modello è stato preaddestrato. - -### Tokenize - -Carica un tokenizer preaddestrato con [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") -``` - -Poi inserisci le tue frasi nel tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Il tokenizer restituisce un dizionario contenente tre oggetti importanti: - -* [input_ids](glossary#input-ids) sono gli indici che corrispondono ad ogni token nella frase. -* [attention_mask](glossary#attention-mask) indicata se un token deve essere elaborato o no. -* [token_type_ids](glossary#token-type-ids) identifica a quale sequenza appartiene un token se è presente più di una sequenza. - -Si possono decodificare gli `input_ids` per farsi restituire l'input originale: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]' -``` - -Come si può vedere, il tokenizer aggiunge due token speciali - `CLS` e `SEP` (classificatore e separatore) - alla frase. Non tutti i modelli hanno bisogno dei token speciali, ma se servono, il tokenizer li aggiungerà automaticamente. - -Se ci sono più frasi che vuoi processare, passale come una lista al tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Questo è un argomento importante. Quando processi un insieme di frasi potrebbero non avere tutte la stessa lunghezza. Questo è un problema perchè i tensori, in input del modello, devono avere dimensioni uniformi. Il padding è una strategia per assicurarsi che i tensori siano rettangolari aggiungendo uno speciale *padding token* alle frasi più corte. - -Imposta il parametro `padding` a `True` per imbottire le frasi più corte nel gruppo in modo che combacino con la massima lunghezza presente: - -```py ->>> batch_sentences = [ -... 
"But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -Nota che il tokenizer aggiunge alle sequenze degli `0` perchè sono troppo corte! - -### Truncation - -L'altra faccia della medaglia è che avolte le sequenze possono essere troppo lunghe per essere gestite dal modello. In questo caso, avrai bisogno di troncare la sequenza per avere una lunghezza minore. - -Imposta il parametro `truncation` a `True` per troncare una sequenza alla massima lunghezza accettata dal modello: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -### Costruire i tensori - -Infine, vuoi che il tokenizer restituisca i tensori prodotti dal modello. - -Imposta il parametro `return_tensors` su `pt` per PyTorch, o `tf` per TensorFlow: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102], - [ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 0]])} -===PT-TF-SPLIT=== ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - -## Audio - -Gli input audio sono processati in modo differente rispetto al testo, ma l'obiettivo rimane lo stesso: creare sequenze numeriche che il modello può capire. 
Un [estrattore di caratteristiche](main_classes/feature_extractor) è progettato con lo scopo preciso di estrarre caratteristiche da immagini o dati audio grezzi e convertirli in tensori. Prima di iniziare, installa 🤗 Datasets per caricare un dataset audio e sperimentare: - -```bash -pip install datasets -``` - -Carica il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (vedi il 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) per avere maggiori dettagli su come caricare un dataset): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Accedi al primo elemento della colonna `audio` per dare uno sguardo all'input. Richiamando la colonna `audio` sarà caricato automaticamente e ricampionato il file audio: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -Questo restituisce tre oggetti: - -* `array` è il segnale vocale caricato - e potenzialmente ricampionato - come vettore 1D. -* `path` il percorso del file audio. -* `sampling_rate` si riferisce al numero di campioni del segnale vocale misurati al secondo. - -### Ricampionamento - -Per questo tutorial, puoi usare il modello [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). Come puoi vedere dalla model card, il modello Wav2Vec2 è preaddestrato su un campionamento vocale a 16kHz.È importante che la frequenza di campionamento dei tuoi dati audio combaci con la frequenza di campionamento del dataset usato per preaddestrare il modello. Se la frequenza di campionamento dei tuoi dati non è uguale dovrai ricampionare i tuoi dati audio. - -Per esempio, il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) ha una frequenza di campionamento di 8000kHz. Utilizzando il modello Wav2Vec2 su questo dataset, alzala a 16kHz: - -```py ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") ->>> dataset[0]["audio"] -{'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -1. Usa il metodo di 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) per alzare la frequenza di campionamento a 16kHz: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Carica il file audio: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Come puoi notare, la `sampling_rate` adesso è 16kHz! - -### Feature extractor - -Il prossimo passo è caricare un estrattore di caratteristiche per normalizzare e fare padding sull'input. Quando applichiamo il padding sui dati testuali, uno `0` è aggiunto alle sequenze più brevi. 
La stessa idea si applica ai dati audio, l'estrattore di caratteristiche per gli audio aggiungerà uno `0` - interpretato come silenzio - agli `array`. - -Carica l'estrattore delle caratteristiche con [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Inserisci l' `array` audio nell'estrattore delle caratteristiche. Noi raccomandiamo sempre di aggiungere il parametro `sampling_rate` nell'estrattore delle caratteristiche per correggere meglio qualche errore, dovuto ai silenzi, che potrebbe verificarsi. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -### Pad e truncate - -Come per il tokenizer, puoi applicare le operazioni padding o truncation per manipolare sequenze di variabili a lotti. Dai uno sguaro alla lunghezza delle sequenze di questi due campioni audio: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Come puoi vedere, il primo campione ha una sequenza più lunga del secondo. Crea una funzione che preprocesserà il dataset. Specifica una lunghezza massima del campione, e l'estrattore di features si occuperà di riempire o troncare la sequenza per coincidervi: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... return inputs -``` - -Applica la funzione ai primi esempi nel dataset: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -Adesso guarda la lunghezza dei campioni elaborati: - -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -La lunghezza dei campioni adesso coincide con la massima lunghezza impostata nelle funzione. - -## Vision - -Un estrattore di caratteristiche si può usare anche per processare immagini e per compiti di visione. Ancora una volta, l'obiettivo è convertire l'immagine grezza in un lotto di tensori come input. - -Carica il dataset [food101](https://huggingface.co/datasets/food101) per questa esercitazione. 
Usa il parametro `split` di 🤗 Datasets per caricare solo un piccolo campione dal dataset di addestramento poichè il set di dati è molto grande: - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Secondo passo, dai uno sguardo alle immagini usando la caratteristica [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) di 🤗 Datasets: - -```py ->>> dataset[0]["image"] -``` - -![vision-preprocess-tutorial.png](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png) - -### Feature extractor - -Carica l'estrattore di caratteristiche [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224") -``` - -### Data augmentation - -Per le attività di visione, è usuale aggiungere alcuni tipi di data augmentation alle immagini come parte del preprocessing. Puoi aggiungere augmentations con qualsiasi libreria che preferisci, ma in questa esercitazione, userai il modulo [`transforms`](https://pytorch.org/vision/stable/transforms.html) di torchvision. - -1. Normalizza l'immagine e usa [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) per concatenare alcune trasformazioni - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) e [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) - insieme: - -```py ->>> from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor - ->>> normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) ->>> _transforms = Compose( -... [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize] -... ) -``` - -2. Il modello accetta [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) come input. Questo valore è generato dall'estrattore di caratteristiche. Crea una funzione che genera `pixel_values` dai transforms: - -```py ->>> def transforms(examples): -... examples["pixel_values"] = [_transforms(image.convert("RGB")) for image in examples["image"]] -... return examples -``` - -3. Poi utilizza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)per applicare al volo la trasformazione: - -```py ->>> dataset.set_transform(transforms) -``` - -4. 
Adesso quando accedi all'immagine, puoi notare che l'estrattore di caratteristiche ha aggiunto `pixel_values` allo schema di input: - -```py ->>> dataset[0]["image"] -{'image': , - 'label': 6, - 'pixel_values': tensor([[[ 0.0353, 0.0745, 0.1216, ..., -0.9922, -0.9922, -0.9922], - [-0.0196, 0.0667, 0.1294, ..., -0.9765, -0.9843, -0.9922], - [ 0.0196, 0.0824, 0.1137, ..., -0.9765, -0.9686, -0.8667], - ..., - [ 0.0275, 0.0745, 0.0510, ..., -0.1137, -0.1216, -0.0824], - [ 0.0667, 0.0824, 0.0667, ..., -0.0588, -0.0745, -0.0980], - [ 0.0353, 0.0353, 0.0431, ..., -0.0039, -0.0039, -0.0588]], - - [[ 0.2078, 0.2471, 0.2863, ..., -0.9451, -0.9373, -0.9451], - [ 0.1608, 0.2471, 0.3098, ..., -0.9373, -0.9451, -0.9373], - [ 0.2078, 0.2706, 0.3020, ..., -0.9608, -0.9373, -0.8275], - ..., - [-0.0353, 0.0118, -0.0039, ..., -0.2392, -0.2471, -0.2078], - [ 0.0196, 0.0353, 0.0196, ..., -0.1843, -0.2000, -0.2235], - [-0.0118, -0.0039, -0.0039, ..., -0.0980, -0.0980, -0.1529]], - - [[ 0.3961, 0.4431, 0.4980, ..., -0.9216, -0.9137, -0.9216], - [ 0.3569, 0.4510, 0.5216, ..., -0.9059, -0.9137, -0.9137], - [ 0.4118, 0.4745, 0.5216, ..., -0.9137, -0.8902, -0.7804], - ..., - [-0.2314, -0.1922, -0.2078, ..., -0.4196, -0.4275, -0.3882], - [-0.1843, -0.1686, -0.2000, ..., -0.3647, -0.3804, -0.4039], - [-0.1922, -0.1922, -0.1922, ..., -0.2941, -0.2863, -0.3412]]])} -``` - -Di seguito come si vede l'immagine dopo la fase di preprocessing. Come ci si aspetterebbe dalle trasformazioni applicate, l'immagine è stata ritagliata in modo casuale e le proprietà del colore sono diverse. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -![preprocessed_image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png) - -## Multimodal - -Per attività multimodali userai una combinazione di tutto quello che hai imparato poco fa e applicherai le tue competenze alla comprensione automatica del parlato (Automatic Speech Recognition - ASR). Questo significa che avrai bisogno di: - -* Un estrattore delle caratteristiche per processare i dati audio. -* Il Tokenizer per processare i testi. 
- -Ritorna sul datasere [LJ Speech](https://huggingface.co/datasets/lj_speech): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -Visto che sei interessato solo alle colonne `audio` e `text`, elimina tutte le altre: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Adesso guarda le colonne `audio` e `text`: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Ricorda dalla sezione precedente sull'elaborazione dei dati audio, tu dovresti sempre [ricampionare](preprocessing#audio) la frequenza di campionamento dei tuoi dati audio per farla coincidere con quella del dataset usato dal modello preaddestrato: - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -### Processor - -Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un processor con [`AutoProcessor.from_pretrained]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Crea una funzione che processi i dati audio in `input_values`, e tokenizza il testo in `labels`. Questi sono i tuoi input per il modello: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Applica la funzione `prepare_dataset` ad un campione: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -Nota che il processor ha aggiunto `input_values` e `labels`. La frequenza di campionamento è stata corretta riducendola a 16kHz. - -Fantastico, ora dovresti essere in grado di preelaborare i dati per qualsiasi modalità e persino di combinare modalità diverse! Nella prossima esercitazione, impareremo a mettere a punto un modello sui dati appena pre-elaborati. \ No newline at end of file diff --git a/docs/source/it/quicktour.md b/docs/source/it/quicktour.md new file mode 100644 index 000000000000..f0e981d18eb7 --- /dev/null +++ b/docs/source/it/quicktour.md @@ -0,0 +1,401 @@ + + +# Quick tour + +[[open-in-colab]] + +Entra in azione con 🤗 Transformers! Inizia utilizzando [`pipeline`] per un'inferenza veloce, carica un modello pre-allenato e un tokenizer con una [AutoClass](./model_doc/auto) per risolvere i tuoi compiti legati a testo, immagini o audio. + + + +Tutti gli esempi di codice presenti in questa documentazione hanno un pulsante in alto a sinistra che permette di selezionare tra PyTorch e TensorFlow. Se +questo non è presente, ci si aspetta che il codice funzioni per entrambi i backend senza alcun cambiamento. + + + +## Pipeline + +[`pipeline`] è il modo più semplice per utilizzare un modello pre-allenato per un dato compito. + + + +La [`pipeline`] supporta molti compiti comuni: + +**Testo**: +* Analisi del Sentimento (Sentiment Analysis, in inglese): classifica la polarità di un testo dato. 
+* Generazione del Testo (Text Generation, in inglese): genera del testo a partire da un dato input. +* Riconoscimento di Entità (Name Entity Recognition o NER, in inglese): etichetta ogni parola con l'entità che questa rappresenta (persona, data, luogo, ecc.). +* Rispondere a Domande (Question answering, in inglese): estrae la risposta da un contesto, dato del contesto e una domanda. +* Riempimento di Maschere (Fill-mask, in inglese): riempie gli spazi mancanti in un testo che ha parole mascherate. +* Riassumere (Summarization, in inglese): genera una sintesi di una lunga sequenza di testo o di un documento. +* Traduzione (Translation, in inglese): traduce un testo in un'altra lingua. +* Estrazione di Caratteristiche (Feature Extraction, in inglese): crea un tensore che rappresenta un testo. + +**Immagini**: +* Classificazione di Immagini (Image Classification, in inglese): classifica un'immagine. +* Segmentazione di Immagini (Image Segmentation, in inglese): classifica ogni pixel di un'immagine. +* Rilevazione di Oggetti (Object Detection, in inglese): rileva oggetti all'interno di un'immagine. + +**Audio**: +* Classificazione di Audio (Audio Classification, in inglese): assegna un'etichetta ad un segmento di audio dato. +* Riconoscimento Vocale Automatico (Automatic Speech Recognition o ASR, in inglese): trascrive il contenuto di un audio dato in un testo. + + + +Per maggiori dettagli legati alla [`pipeline`] e ai compiti ad essa associati, fai riferimento alla documentazione [qui](./main_classes/pipelines). + + + +### Utilizzo della Pipeline + +Nel seguente esempio, utilizzerai la [`pipeline`] per l'analisi del sentimento. + +Installa le seguenti dipendenze se non lo hai già fatto: + + + + +```bash +pip install torch +``` + + + +```bash +pip install tensorflow +``` + + + +Importa [`pipeline`] e specifica il compito che vuoi completare: + +```py +>>> from transformers import pipeline + +>>> classificatore = pipeline("sentiment-analysis", model="MilaNLProc/feel-it-italian-sentiment") +``` + +La pipeline scarica e salva il [modello pre-allenato](https://huggingface.co/MilaNLProc/feel-it-italian-sentiment) e il tokenizer per l'analisi del sentimento. Se non avessimo scelto un modello, la pipeline ne avrebbe scelto uno di default. Ora puoi utilizzare il `classifier` sul tuo testo obiettivo: + +```py +>>> classificatore("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") +[{'label': 'positive', 'score': 0.9997}] +``` + +Per più di una frase, passa una lista di frasi alla [`pipeline`] la quale restituirà una lista di dizionari: + +```py +>>> risultati = classificatore( +... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."] +... ) +>>> for risultato in risultati: +... print(f"etichetta: {risultato['label']}, con punteggio: {round(risultato['score'], 4)}") +etichetta: positive, con punteggio: 0.9998 +etichetta: negative, con punteggio: 0.9998 +``` + +La [`pipeline`] può anche iterare su un dataset intero. Inizia installando la libreria [🤗 Datasets](https://huggingface.co/docs/datasets/): + +```bash +pip install datasets +``` + +Crea una [`pipeline`] con il compito che vuoi risolvere e con il modello che vuoi utilizzare. + +```py +>>> import torch +>>> from transformers import pipeline + +>>> riconoscitore_vocale = pipeline( +... "automatic-speech-recognition", model="radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram" +... 
) +``` + +Poi, carica un dataset (vedi 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) per maggiori dettagli) sul quale vuoi iterare. Per esempio, carichiamo il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("PolyAI/minds14", name="it-IT", split="train") # doctest: +IGNORE_RESULT +``` + +Dobbiamo assicurarci che la frequenza di campionamento del set di dati corrisponda alla frequenza di campionamento con cui è stato addestrato `radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram`. + +```py +>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=riconoscitore_vocale.feature_extractor.sampling_rate)) +``` + +I file audio vengono caricati automaticamente e ri-campionati quando chiamiamo la colonna "audio". +Estraiamo i vettori delle forme d'onda grezze delle prime 4 osservazioni e passiamoli come lista alla pipeline: + +```py +>>> risultato = riconoscitore_vocale(dataset[:4]["audio"]) +>>> print([d["text"] for d in risultato]) +['dovrei caricare dei soldi sul mio conto corrente', 'buongiorno e senza vorrei depositare denaro sul mio conto corrente come devo fare per cortesia', 'sì salve vorrei depositare del denaro sul mio conto', 'e buon pomeriggio vorrei depositare dei soldi sul mio conto bancario volleo sapere come posso fare se e posso farlo online ed un altro conto o andandoo tramite bancomut'] +``` + +Per un dataset più grande dove gli input sono di dimensione maggiore (come nel parlato/audio o nella visione), dovrai passare un generatore al posto di una lista che carica tutti gli input in memoria. Guarda la [documentazione della pipeline](./main_classes/pipelines) per maggiori informazioni. + +### Utilizzare un altro modello e tokenizer nella pipeline + +La [`pipeline`] può ospitare qualsiasi modello del [Model Hub](https://huggingface.co/models), rendendo semplice l'adattamento della [`pipeline`] per altri casi d'uso. Per esempio, se si vuole un modello capace di trattare testo in francese, usa i tag presenti nel Model Hub in modo da filtrare per ottenere un modello appropriato. Il miglior risultato filtrato restituisce un modello multi-lingua [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) fine-tuned per l'analisi del sentimento. Ottimo, utilizziamo questo modello! 
+ +```py +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +``` + + + +Usa [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `AutoClass` in seguito): + +```py +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + +Usa [`TFAutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `TFAutoClass` in seguito): + +```py +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +>>> tokenizer = AutoTokenizer.from_pretrained(model_name) +``` + + + +Poi puoi specificare il modello e il tokenizer nella [`pipeline`], e applicare il `classifier` sul tuo testo obiettivo: + +```py +>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) +>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") +[{'label': '5 stars', 'score': 0.7273}] +``` + +Se non riesci a trovare un modello per il tuo caso d'uso, dovrai fare fine-tuning di un modello pre-allenato sui tuoi dati. Dai un'occhiata al nostro tutorial [fine-tuning tutorial](./training) per imparare come. Infine, dopo che hai completato il fine-tuning del tuo modello pre-allenato, considera per favore di condividerlo (vedi il tutorial [qui](./model_sharing)) con la comunità sul Model Hub per democratizzare l'NLP! 🤗 + +## AutoClass + + + +Al suo interno, le classi [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] lavorano assieme per dare potere alla [`pipeline`]. Una [AutoClass](./model_doc/auto) è una scorciatoia che automaticamente recupera l'architettura di un modello pre-allenato a partire dal suo nome o path. Hai solo bisogno di selezionare la `AutoClass` appropriata per il tuo compito e il suo tokenizer associato con [`AutoTokenizer`]. + +Ritorniamo al nostro esempio e vediamo come puoi utilizzare la `AutoClass` per replicare i risultati della [`pipeline`]. + +### AutoTokenizer + +Un tokenizer è responsabile dell'elaborazione del testo in modo da trasformarlo in un formato comprensibile dal modello. Per prima cosa, il tokenizer dividerà il testo in parole chiamate *token*. Ci sono diverse regole che governano il processo di tokenizzazione, tra cui come dividere una parola e a quale livello (impara di più sulla tokenizzazione [qui](./tokenizer_summary)). La cosa più importante da ricordare comunque è che hai bisogno di inizializzare il tokenizer con lo stesso nome del modello in modo da assicurarti che stai utilizzando le stesse regole di tokenizzazione con cui il modello è stato pre-allenato. + +Carica un tokenizer con [`AutoTokenizer`]: + +```py +>>> from transformers import AutoTokenizer + +>>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tokenizer = AutoTokenizer.from_pretrained(nome_del_modello) +``` + +Dopodiché, il tokenizer converte i token in numeri in modo da costruire un tensore come input del modello. Questo è conosciuto come il *vocabolario* del modello. 
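Per vedere questa mappatura in pratica, ecco uno schizzo puramente illustrativo (ipotizzando il `tokenizer` appena caricato; `tokenize` e `convert_tokens_to_ids` mostrano solo i passaggi intermedi, e l'output esatto dipende dal vocabolario del checkpoint scelto):

```py
>>> # Solo a scopo illustrativo: suddividi il testo in token e osserva l'id di ciascun token nel vocabolario
>>> tokens = tokenizer.tokenize("Siamo molto felici di mostrarti la libreria 🤗 Transformers.")
>>> ids = tokenizer.convert_tokens_to_ids(tokens)
>>> print(list(zip(tokens, ids)))
```

Nella pratica non serve chiamare questi metodi a mano: basta passare il testo direttamente al tokenizer, come mostrato di seguito.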
+ +Passa il tuo testo al tokenizer: + +```py +>>> encoding = tokenizer("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") +>>> print(encoding) +{'input_ids': [101, 56821, 10132, 14407, 13019, 13007, 10120, 47201, 10330, 10106, 91686, 100, 58263, 119, 102], +'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Il tokenizer restituirà un dizionario contenente: + +* [input_ids](./glossary#input-ids): rappresentazioni numeriche dei tuoi token. +* [attention_mask](.glossary#attention-mask): indica quali token devono essere presi in considerazione. + +Come con la [`pipeline`], il tokenizer accetterà una lista di input. In più, il tokenizer può anche completare (pad, in inglese) e troncare il testo in modo da restituire un lotto (batch, in inglese) di lunghezza uniforme: + + + +```py +>>> pt_batch = tokenizer( +... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="pt", +... ) +``` + + +```py +>>> tf_batch = tokenizer( +... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], +... padding=True, +... truncation=True, +... max_length=512, +... return_tensors="tf", +... ) +``` + + + +Leggi il tutorial sul [preprocessing](./preprocessing) per maggiori dettagli sulla tokenizzazione. + +### AutoModel + + + +🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. Questo significa che puoi caricare un [`AutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare l'[`AutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`AutoModelForSequenceClassification`]: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +``` + + + +Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. + + + +Ora puoi passare il tuo lotto di input pre-processati direttamente al modello. Devi solo spacchettare il dizionario aggiungendo `**`: + +```py +>>> pt_outputs = pt_model(**pt_batch) +``` + +Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: + +```py +>>> from torch import nn + +>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +>>> print(pt_predictions) +tensor([[0.0041, 0.0037, 0.0203, 0.2005, 0.7713], + [0.3766, 0.3292, 0.1832, 0.0558, 0.0552]], grad_fn=) +``` + + +🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. Questo significa che puoi caricare un [`TFAutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare il [`TFAutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`TFAutoModelForSequenceClassification`]: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(nome_del_modello) +``` + + + +Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. 
+ + + +Ora puoi passare il tuo lotto di input pre-processati direttamente al modello passando le chiavi del dizionario al tensore: + +```py +>>> tf_outputs = tf_model(tf_batch) +``` + +Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: +```py +>>> import tensorflow as tf + +>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +>>> tf_predictions # doctest: +IGNORE_RESULT +``` + + + + + +Tutti i modelli di 🤗 Transformers (PyTorch e TensorFlow) restituiscono i tensori *prima* della funzione finale +di attivazione (come la softmax) perché la funzione di attivazione finale viene spesso unita a quella di perdita. + + + +I modelli sono [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) standard così puoi utilizzarli all'interno del tuo training loop usuale. Tuttavia, per rendere le cose più semplici, 🤗 Transformers fornisce una classe [`Trainer`] per PyTorch che aggiunge delle funzionalità per l'allenamento distribuito, precisione mista, e altro ancora. Per TensorFlow, puoi utilizzare il metodo `fit` di [Keras](https://keras.io/). Fai riferimento al [tutorial per il training](./training) per maggiori dettagli. + + + +Gli output del modello di 🤗 Transformers sono delle dataclasses speciali in modo che i loro attributi vengano auto-completati all'interno di un IDE. +Gli output del modello si comportano anche come una tupla o un dizionario (ad esempio, puoi indicizzare con un intero, una slice o una stringa) nel qual caso gli attributi che sono `None` vengono ignorati. + + + +### Salva un modello + + + +Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`PreTrainedModel.save_pretrained`]: + +```py +>>> pt_save_directory = "./pt_save_pretrained" +>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +>>> pt_model.save_pretrained(pt_save_directory) +``` + +Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`PreTrainedModel.from_pretrained`]: + +```py +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +``` + + +Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`TFPreTrainedModel.save_pretrained`]: + +```py +>>> tf_save_directory = "./tf_save_pretrained" +>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +>>> tf_model.save_pretrained(tf_save_directory) +``` + +Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +``` + + + +Una caratteristica particolarmente interessante di 🤗 Transformers è la sua abilità di salvare un modello e ri-caricarlo sia come modello di PyTorch che di TensorFlow. 
I parametri `from_pt` o `from_tf` possono convertire un modello da un framework all'altro: + + + + +```py +>>> from transformers import AutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +``` + + + +```py +>>> from transformers import TFAutoModel + +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +``` + + diff --git a/docs/source/it/quicktour.mdx b/docs/source/it/quicktour.mdx deleted file mode 100644 index 2378edd2c2a1..000000000000 --- a/docs/source/it/quicktour.mdx +++ /dev/null @@ -1,393 +0,0 @@ - - -# Quick tour - -[[open-in-colab]] - -Entra in azione con 🤗 Transformers! Inizia utilizzando [`pipeline`] per un'inferenza veloce, carica un modello pre-allenato e un tokenizer con una [AutoClass](./model_doc/auto) per risolvere i tuoi compiti legati a testo, immagini o audio. - - - -Tutti gli esempi di codice presenti in questa documentazione hanno un pulsante in alto a sinistra che permette di selezionare tra PyTorch e TensorFlow. Se -questo non è presente, ci si aspetta che il codice funzioni per entrambi i backend senza alcun cambiamento. - - - -## Pipeline - -[`pipeline`] è il modo più semplice per utilizzare un modello pre-allenato per un dato compito. - - - -La [`pipeline`] supporta molti compiti comuni: - -**Testo**: -* Analisi del Sentimento (Sentiment Analysis, in inglese): classifica la polarità di un testo dato. -* Generazione del Testo (Text Generation, in inglese): genera del testo a partire da un dato input. -* Riconoscimento di Entità (Name Entity Recognition o NER, in inglese): etichetta ogni parola con l'entità che questa rappresenta (persona, data, luogo, ecc.). -* Rispondere a Domande (Question answering, in inglese): estrae la risposta da un contesto, dato del contesto e una domanda. -* Riempimento di Maschere (Fill-mask, in inglese): riempie gli spazi mancanti in un testo che ha parole mascherate. -* Riassumere (Summarization, in inglese): genera una sintesi di una lunga sequenza di testo o di un documento. -* Traduzione (Translation, in inglese): traduce un testo in un'altra lingua. -* Estrazione di Caratteristiche (Feature Extraction, in inglese): crea un tensore che rappresenta un testo. - -**Immagini**: -* Classificazione di Immagini (Image Classification, in inglese): classifica un'immagine. -* Segmentazione di Immagini (Image Segmentation, in inglese): classifica ogni pixel di un'immagine. -* Rilevazione di Oggetti (Object Detection, in inglese): rileva oggetti all'interno di un'immagine. - -**Audio**: -* Classificazione di Audio (Audio Classification, in inglese): assegna un'etichetta ad un segmento di audio dato. -* Riconoscimento Vocale Automatico (Automatic Speech Recognition o ASR, in inglese): trascrive il contenuto di un audio dato in un testo. - - - -Per maggiori dettagli legati alla [`pipeline`] e ai compiti ad essa associati, fai riferimento alla documentazione [qui](./main_classes/pipelines). - - - -### Utilizzo della Pipeline - -Nel seguente esempio, utilizzerai la [`pipeline`] per l'analisi del sentimento. 
- -Installa le seguenti dipendenze se non lo hai già fatto: - - - -```bash -pip install torch -``` - - -```bash -pip install tensorflow -``` - - - -Importa [`pipeline`] e specifica il compito che vuoi completare: - -```py ->>> from transformers import pipeline - ->>> classificatore = pipeline("sentiment-analysis", model="MilaNLProc/feel-it-italian-sentiment") -``` - -La pipeline scarica e salva il [modello pre-allenato](https://huggingface.co/MilaNLProc/feel-it-italian-sentiment) e il tokenizer per l'analisi del sentimento. Se non avessimo scelto un modello, la pipeline ne avrebbe scelto uno di default. Ora puoi utilizzare il `classifier` sul tuo testo obiettivo: - -```py ->>> classificatore("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") -[{'label': 'positive', 'score': 0.9997}] -``` - -Per più di una frase, passa una lista di frasi alla [`pipeline`] la quale restituirà una lista di dizionari: - -```py ->>> risultati = classificatore( -... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."] -... ) ->>> for risultato in risultati: -... print(f"etichetta: {risultato['label']}, con punteggio: {round(risultato['score'], 4)}") -etichetta: positive, con punteggio: 0.9998 -etichetta: negative, con punteggio: 0.9998 -``` - -La [`pipeline`] può anche iterare su un dataset intero. Inizia installando la libreria [🤗 Datasets](https://huggingface.co/docs/datasets/): - -```bash -pip install datasets -``` - -Crea una [`pipeline`] con il compito che vuoi risolvere e con il modello che vuoi utilizzare. - -```py ->>> import torch ->>> from transformers import pipeline - ->>> riconoscitore_vocale = pipeline( -... "automatic-speech-recognition", model="radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram" -... ) -``` - -Poi, carica un dataset (vedi 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) per maggiori dettagli) sul quale vuoi iterare. Per esempio, carichiamo il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14): - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="it-IT", split="train") # doctest: +IGNORE_RESULT -``` - -Dobbiamo assicurarci che la frequenza di campionamento del set di dati corrisponda alla frequenza di campionamento con cui è stato addestrato `radiogroup-crits/wav2vec2-xls-r-1b-italian-doc4lm-5gram`. - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=riconoscitore_vocale.feature_extractor.sampling_rate)) -``` - -I file audio vengono caricati automaticamente e ri-campionati quando chiamiamo la colonna "audio". -Estraiamo i vettori delle forme d'onda grezze delle prime 4 osservazioni e passiamoli come lista alla pipeline: - -```py ->>> risultato = riconoscitore_vocale(dataset[:4]["audio"]) ->>> print([d["text"] for d in risultato]) -['dovrei caricare dei soldi sul mio conto corrente', 'buongiorno e senza vorrei depositare denaro sul mio conto corrente come devo fare per cortesia', 'sì salve vorrei depositare del denaro sul mio conto', 'e buon pomeriggio vorrei depositare dei soldi sul mio conto bancario volleo sapere come posso fare se e posso farlo online ed un altro conto o andandoo tramite bancomut'] -``` - -Per un dataset più grande dove gli input sono di dimensione maggiore (come nel parlato/audio o nella visione), dovrai passare un generatore al posto di una lista che carica tutti gli input in memoria. Guarda la [documentazione della pipeline](./main_classes/pipelines) per maggiori informazioni. 
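A puro titolo illustrativo, uno schizzo minimale di questo approccio (ipotizzando il `riconoscitore_vocale` e il `dataset` MInDS-14 creati sopra; `genera_audio` è solo un nome di comodo): un generatore Python produce un campione alla volta, così la pipeline non deve tenere tutti gli input in memoria.

```py
>>> def genera_audio(dataset):
...     # produce un campione audio alla volta, invece di materializzare una lista completa
...     for esempio in dataset:
...         yield esempio["audio"]

>>> for output in riconoscitore_vocale(genera_audio(dataset)):
...     print(output["text"])
```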
- -### Utilizzare un altro modello e tokenizer nella pipeline - -La [`pipeline`] può ospitare qualsiasi modello del [Model Hub](https://huggingface.co/models), rendendo semplice l'adattamento della [`pipeline`] per altri casi d'uso. Per esempio, se si vuole un modello capace di trattare testo in francese, usa i tag presenti nel Model Hub in modo da filtrare per ottenere un modello appropriato. Il miglior risultato filtrato restituisce un modello multi-lingua [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) fine-tuned per l'analisi del sentimento. Ottimo, utilizziamo questo modello! - -```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" -``` - - - -Usa [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `AutoClass` in seguito): - -```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - -Usa [`TFAutoModelForSequenceClassification`] e [`AutoTokenizer`] per caricare il modello pre-allenato e il suo tokenizer associato (maggiori informazioni su una `TFAutoClass` in seguito): - -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - - -Poi puoi specificare il modello e il tokenizer nella [`pipeline`], e applicare il `classifier` sul tuo testo obiettivo: - -```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] -``` - -Se non riesci a trovare un modello per il tuo caso d'uso, dovrai fare fine-tuning di un modello pre-allenato sui tuoi dati. Dai un'occhiata al nostro tutorial [fine-tuning tutorial](./training) per imparare come. Infine, dopo che hai completato il fine-tuning del tuo modello pre-allenato, considera per favore di condividerlo (vedi il tutorial [qui](./model_sharing)) con la comunità sul Model Hub per democratizzare l'NLP! 🤗 - -## AutoClass - - - -Al suo interno, le classi [`AutoModelForSequenceClassification`] e [`AutoTokenizer`] lavorano assieme per dare potere alla [`pipeline`]. Una [AutoClass](./model_doc/auto) è una scorciatoia che automaticamente recupera l'architettura di un modello pre-allenato a partire dal suo nome o path. Hai solo bisogno di selezionare la `AutoClass` appropriata per il tuo compito e il suo tokenizer associato con [`AutoTokenizer`]. - -Ritorniamo al nostro esempio e vediamo come puoi utilizzare la `AutoClass` per replicare i risultati della [`pipeline`]. - -### AutoTokenizer - -Un tokenizer è responsabile dell'elaborazione del testo in modo da trasformarlo in un formato comprensibile dal modello. Per prima cosa, il tokenizer dividerà il testo in parole chiamate *token*. Ci sono diverse regole che governano il processo di tokenizzazione, tra cui come dividere una parola e a quale livello (impara di più sulla tokenizzazione [qui](./tokenizer_summary)). 
La cosa più importante da ricordare comunque è che hai bisogno di inizializzare il tokenizer con lo stesso nome del modello in modo da assicurarti che stai utilizzando le stesse regole di tokenizzazione con cui il modello è stato pre-allenato. - -Carica un tokenizer con [`AutoTokenizer`]: - -```py ->>> from transformers import AutoTokenizer - ->>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(nome_del_modello) -``` - -Dopodiché, il tokenizer converte i token in numeri in modo da costruire un tensore come input del modello. Questo è conosciuto come il *vocabolario* del modello. - -Passa il tuo testo al tokenizer: - -```py ->>> encoding = tokenizer("Siamo molto felici di mostrarti la libreria 🤗 Transformers.") ->>> print(encoding) -{'input_ids': [101, 56821, 10132, 14407, 13019, 13007, 10120, 47201, 10330, 10106, 91686, 100, 58263, 119, 102], -'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], -'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -Il tokenizer restituirà un dizionario contenente: - -* [input_ids](./glossary#input-ids): rappresentazioni numeriche dei tuoi token. -* [attention_mask](.glossary#attention-mask): indica quali token devono essere presi in considerazione. - -Come con la [`pipeline`], il tokenizer accetterà una lista di input. In più, il tokenizer può anche completare (pad, in inglese) e troncare il testo in modo da restituire un lotto (batch, in inglese) di lunghezza uniforme: - - - -```py ->>> pt_batch = tokenizer( -... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) -``` - - -```py ->>> tf_batch = tokenizer( -... ["Siamo molto felici di mostrarti la libreria 🤗 Transformers.", "Speriamo te non la odierai."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="tf", -... ) -``` - - - -Leggi il tutorial sul [preprocessing](./preprocessing) per maggiori dettagli sulla tokenizzazione. - -### AutoModel - - - -🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. Questo significa che puoi caricare un [`AutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare l'[`AutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`AutoModelForSequenceClassification`]: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) -``` - - - -Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. - - - -Ora puoi passare il tuo lotto di input pre-processati direttamente al modello. Devi solo spacchettare il dizionario aggiungendo `**`: - -```py ->>> pt_outputs = pt_model(**pt_batch) -``` - -Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: - -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0041, 0.0037, 0.0203, 0.2005, 0.7713], - [0.3766, 0.3292, 0.1832, 0.0558, 0.0552]], grad_fn=) -``` - - -🤗 Transformers fornisce un metodo semplice e unificato per caricare istanze pre-allenate. 
Questo significa che puoi caricare un [`TFAutoModel`] come caricheresti un [`AutoTokenizer`]. L'unica differenza è selezionare il [`TFAutoModel`] corretto per il compito di interesse. Dato che stai facendo classificazione di testi, o sequenze, carica [`TFAutoModelForSequenceClassification`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> nome_del_modello = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(nome_del_modello) -``` - - - -Guarda il [task summary](./task_summary) per sapere quale classe di [`AutoModel`] utilizzare per quale compito. - - - -Ora puoi passare il tuo lotto di input pre-processati direttamente al modello passando le chiavi del dizionario al tensore: - -```py ->>> tf_outputs = tf_model(tf_batch) -``` - -Il modello produrrà le attivazioni finali nell'attributo `logits`. Applica la funzione softmax a `logits` per ottenere le probabilità: -```py ->>> import tensorflow as tf - ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT -``` - - - - - -Tutti i modelli di 🤗 Transformers (PyTorch e TensorFlow) restituiscono i tensori *prima* della funzione finale -di attivazione (come la softmax) perché la funzione di attivazione finale viene spesso unita a quella di perdita. - - - -I modelli sono [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) o [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) standard così puoi utilizzarli all'interno del tuo training loop usuale. Tuttavia, per rendere le cose più semplici, 🤗 Transformers fornisce una classe [`Trainer`] per PyTorch che aggiunge delle funzionalità per l'allenamento distribuito, precisione mista, e altro ancora. Per TensorFlow, puoi utilizzare il metodo `fit` di [Keras](https://keras.io/). Fai riferimento al [tutorial per il training](./training) per maggiori dettagli. - - - -Gli output del modello di 🤗 Transformers sono delle dataclasses speciali in modo che i loro attributi vengano auto-completati all'interno di un IDE. -Gli output del modello si comportano anche come una tupla o un dizionario (ad esempio, puoi indicizzare con un intero, una slice o una stringa) nel qual caso gli attributi che sono `None` vengono ignorati. 
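Ad esempio, uno schizzo puramente indicativo con i `pt_outputs` calcolati sopra (i tre accessi restituiscono lo stesso tensore di logits quando `loss` è `None`):

```py
>>> pt_outputs.logits     # accesso come attributo
>>> pt_outputs["logits"]  # accesso come in un dizionario
>>> pt_outputs[0]         # accesso come in una tupla: gli attributi a None vengono saltati
```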
- - - -### Salva un modello - - - -Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`PreTrainedModel.save_pretrained`]: - -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` - -Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`PreTrainedModel.from_pretrained`]: - -```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Una volta completato il fine-tuning del tuo modello, puoi salvarlo con il suo tokenizer utilizzando [`TFPreTrainedModel.save_pretrained`]: - -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) -``` - -Quando desideri utilizzare il tuo modello nuovamente, puoi ri-caricarlo con [`TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") -``` - - - -Una caratteristica particolarmente interessante di 🤗 Transformers è la sua abilità di salvare un modello e ri-caricarlo sia come modello di PyTorch che di TensorFlow. I parametri `from_pt` o `from_tf` possono convertire un modello da un framework all'altro: - - - -```py ->>> from transformers import AutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) -``` - - -```py ->>> from transformers import TFAutoModel - ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - diff --git a/docs/source/it/run_scripts.md b/docs/source/it/run_scripts.md new file mode 100644 index 000000000000..327eb9374d38 --- /dev/null +++ b/docs/source/it/run_scripts.md @@ -0,0 +1,351 @@ + + +# Addestramento con script + +Insieme ai [notebooks](./noteboks/README) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). + +Troverai anche script che abbiamo usato nei nostri [progetti di ricerca](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [precedenti esempi](https://github.com/huggingface/transformers/tree/main/examples/legacy) a cui contribuisce per lo più la comunità. Questi script non sono attivamente mantenuti e richiedono una specifica versione di 🤗 Transformers che sarà molto probabilmente incompatibile con l'ultima versione della libreria. + +Non è dato per scontato che gli script di esempio funzionino senza apportare modifiche per ogni problema, bensì potrebbe essere necessario adattare lo script al tuo caso specifico. Per aiutarti in ciò, la maggioranza degli script espone le modalità di pre-processamento dei dati, consentendoti di modificare lo script come preferisci. + +Per qualsiasi feature che vorresti implementare in uno script d'esempio, per favore discutine nel [forum](https://discuss.huggingface.co/) o in un'[issue](https://github.com/huggingface/transformers/issues) prima di inviare una Pull Request. 
Mentre accogliamo con piacere la correzione di bug, è più improbabile che faremo la stessa con una PR che aggiunge funzionalità sacrificando la leggibilità. + +Questa guida ti mostrerà come eseguire uno script di esempio relativo al task di summarization in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) e [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Tutti gli esempi funzioneranno con entrambi i framework a meno che non sia specificato altrimenti. + +## Installazione + +Per eseguire con successo l'ultima versione degli script di esempio, devi **installare 🤗 Transformers dalla fonte** in un nuovo ambiente virtuale: + +```bash +git clone https://github.com/huggingface/transformers +cd transformers +pip install . +``` +Per le precedenti versioni degli script di esempio, clicca sul pulsante di seguito: + +
+ Esempi per versioni precedenti di 🤗 Transformers + +
+ +Successivamente, cambia la tua attuale copia di 🤗 Transformers specificandone la versione, ad esempio v3.5.1: + +```bash +git checkout tags/v3.5.1 +``` + + Dopo aver configurato correttamente la versione della libreria, naviga nella cartella degli esempi di tua scelta e installa i requisiti: + +```bash +pip install -r requirements.txt +``` + +## Esegui uno script + + + + +Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. + +```bash +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + + +Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando Keras su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. + +```bash +python examples/tensorflow/summarization/run_summarization.py \ + --model_name_or_path t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval +``` + + + +## Addestramento distribuito e precisione mista + +Il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supporta l'addestramento distribuito e la precisione mista, che significa che puoi anche usarla in uno script. Per abilitare entrambe le funzionalità: + +- Aggiunto l'argomento `fp16` per abilitare la precisione mista. +- Imposta un numero di GPU da usare con l'argomento `nproc_per_node`. + +```bash +python -m torch.distributed.launch \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` + +Gli script TensorFlow utilizzano una [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) per il training distribuito e non devi aggiungere alcun argomento addizionale allo script di training. 
+
+## Esegui uno script su TPU
+
+
+
+Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. PyTorch supporta le TPU con il compilatore per deep learning [XLA](https://www.tensorflow.org/xla) (guarda [questo link](https://github.com/pytorch/xla/blob/master/README.md) per maggiori dettagli). Per usare una TPU, avvia lo script `xla_spawn.py` e usa l'argomento `num_cores` per impostare il numero di core TPU che intendi usare.
+
+```bash
+python xla_spawn.py --num_cores 8 \
+    summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. Gli script TensorFlow utilizzano una [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) per eseguire l'addestramento su TPU. Per usare una TPU, passa il nome della risorsa TPU all'argomento `tpu`.
+
+```bash
+python run_summarization.py \
+    --tpu name_of_tpu_resource \
+    --model_name_or_path t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 16 \
+    --num_train_epochs 3 \
+    --do_train \
+    --do_eval
+```
+
+
+
+## Esegui uno script con 🤗 Accelerate
+
+🤗 [Accelerate](https://huggingface.co/docs/accelerate) è una libreria compatibile solo con PyTorch che offre un metodo unificato per addestrare modelli su diverse tipologie di configurazioni (CPU, più GPU, TPU) mantenendo una completa visibilità sul ciclo di training di PyTorch. Assicurati di aver effettuato l'installazione di 🤗 Accelerate, nel caso non lo avessi già fatto:
+
+> Nota: dato che Accelerate è in rapido sviluppo, è necessario installare la versione proveniente da git per eseguire gli script:
+```bash
+pip install git+https://github.com/huggingface/accelerate
+```
+
+Invece di usare lo script `run_summarization.py`, devi usare lo script `run_summarization_no_trainer.py`. Gli script supportati da 🤗 Accelerate hanno un file chiamato `task_no_trainer.py` nella rispettiva cartella. Per iniziare, esegui il seguente comando per creare e salvare un file di configurazione:
+
+```bash
+accelerate config
+```
+
+Testa la tua configurazione per assicurarti della sua correttezza:
+
+```bash
+accelerate test
+```
+
+Ora sei pronto per avviare l'addestramento:
+
+```bash
+accelerate launch run_summarization_no_trainer.py \
+    --model_name_or_path t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir ~/tmp/tst-summarization
+```
+
+## Uso di un dataset personalizzato
+
+Lo script di summarization supporta dataset personalizzati purché siano file CSV o JSON Lines. Quando usi il tuo dataset, devi specificare diversi argomenti aggiuntivi:
+
+- `train_file` e `validation_file` specificano dove si trovano i file di addestramento e di validazione.
+- `text_column` è la colonna che contiene il testo da riassumere.
+- `summary_column` è la colonna che contiene il testo di riepilogo atteso in output.
+
+Uno script di summarization che usa un dataset personalizzato sarebbe simile a questo:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --train_file path_to_csv_or_jsonlines_file \
+    --validation_file path_to_csv_or_jsonlines_file \
+    --text_column text_column_name \
+    --summary_column summary_column_name \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --overwrite_output_dir \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --predict_with_generate
+```
+
+## Testare uno script
+
+È spesso una buona idea avviare il tuo script su un numero ridotto di esempi del dataset, per assicurarti che tutto funzioni come previsto prima di eseguirlo sull'intero dataset, cosa che potrebbe richiedere ore. Usa i seguenti argomenti per limitare il dataset a un numero massimo di esempi:
+
+- `max_train_samples`
+- `max_eval_samples`
+- `max_predict_samples`
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --max_train_samples 50 \
+    --max_eval_samples 50 \
+    --max_predict_samples 50 \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+Non tutti gli script di esempio supportano l'argomento `max_predict_samples`. Se non sei sicuro che il tuo script lo supporti, aggiungi l'argomento `-h` per controllare:
+
+```bash
+examples/pytorch/summarization/run_summarization.py -h
+```
+
+## Riavviare l'addestramento da un checkpoint
+
+Un'altra opzione utile è riavviare l'addestramento da un checkpoint precedente. In questo modo, se l'addestramento viene interrotto, potrai riprendere da dove ti eri fermato senza ricominciare da capo. Ci sono due metodi per riavviare l'addestramento da un checkpoint:
+
+Il primo metodo usa l'argomento `output_dir previous_output_dir` per riavviare l'addestramento dall'ultimo checkpoint contenuto in `output_dir`. In questo caso, dovresti rimuovere `overwrite_output_dir`:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --output_dir previous_output_dir \
+    --predict_with_generate
+```
+
+Il secondo metodo usa l'argomento `resume_from_checkpoint path_to_specific_checkpoint` per riavviare l'addestramento da una specifica cartella di checkpoint.
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --resume_from_checkpoint path_to_specific_checkpoint \
+    --predict_with_generate
+```
+
+## Condividi il tuo modello
+
+Tutti gli script possono caricare il tuo modello finale sul [Model Hub](https://huggingface.co/models).
Prima di iniziare, assicurati di aver effettuato l'accesso su Hugging Face: + +```bash +huggingface-cli login +``` + +Poi, aggiungi l'argomento `push_to_hub` allo script. Questo argomento consentirà di creare un repository con il tuo username Hugging Face e la cartella specificata in `output_dir`. + +Per dare uno specifico nome al repository, usa l'argomento `push_to_hub_model_id`. Il repository verrà automaticamente elencata sotto al tuo namespace. + +Il seguente esempio mostra come caricare un modello specificando il nome del repository: + +```bash +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +``` diff --git a/docs/source/it/run_scripts.mdx b/docs/source/it/run_scripts.mdx deleted file mode 100644 index 3ffd58a62830..000000000000 --- a/docs/source/it/run_scripts.mdx +++ /dev/null @@ -1,347 +0,0 @@ - - -# Addestramento con script - -Insieme ai [notebooks](./noteboks/README) 🤗 Transformers, ci sono anche esempi di script che dimostrano come addestrare un modello per un task con [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), o [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). - -Troverai anche script che abbiamo usato nei nostri [progetti di ricerca](https://github.com/huggingface/transformers/tree/main/examples/research_projects) e [precedenti esempi](https://github.com/huggingface/transformers/tree/main/examples/legacy) a cui contribuisce per lo più la comunità. Questi script non sono attivamente mantenuti e richiedono una specifica versione di 🤗 Transformers che sarà molto probabilmente incompatibile con l'ultima versione della libreria. - -Non è dato per scontato che gli script di esempio funzionino senza apportare modifiche per ogni problema, bensì potrebbe essere necessario adattare lo script al tuo caso specifico. Per aiutarti in ciò, la maggioranza degli script espone le modalità di pre-processamento dei dati, consentendoti di modificare lo script come preferisci. - -Per qualsiasi feature che vorresti implementare in uno script d'esempio, per favore discutine nel [forum](https://discuss.huggingface.co/) o in un'[issue](https://github.com/huggingface/transformers/issues) prima di inviare una Pull Request. Mentre accogliamo con piacere la correzione di bug, è più improbabile che faremo la stessa con una PR che aggiunge funzionalità sacrificando la leggibilità. - -Questa guida ti mostrerà come eseguire uno script di esempio relativo al task di summarization in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) e [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). Tutti gli esempi funzioneranno con entrambi i framework a meno che non sia specificato altrimenti. - -## Installazione - -Per eseguire con successo l'ultima versione degli script di esempio, devi **installare 🤗 Transformers dalla fonte** in un nuovo ambiente virtuale: - -```bash -git clone https://github.com/huggingface/transformers -cd transformers -pip install . 
-``` -Per le precedenti versioni degli script di esempio, clicca sul pulsante di seguito: - -
- Esempi per versioni precedenti di 🤗 Transformers - -
- -Successivamente, cambia la tua attuale copia di 🤗 Transformers specificandone la versione, ad esempio v3.5.1: - -```bash -git checkout tags/v3.5.1 -``` - - Dopo aver configurato correttamente la versione della libreria, naviga nella cartella degli esempi di tua scelta e installa i requisiti: - -```bash -pip install -r requirements.txt -``` - -## Esegui uno script - - - - -Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando Keras su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Addestramento distribuito e precisione mista - -Il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supporta l'addestramento distribuito e la precisione mista, che significa che puoi anche usarla in uno script. Per abilitare entrambe le funzionalità: - -- Aggiunto l'argomento `fp16` per abilitare la precisione mista. -- Imposta un numero di GPU da usare con l'argomento `nproc_per_node`. - -```bash -python -m torch.distributed.launch \ - --nproc_per_node 8 pytorch/summarization/run_summarization.py \ - --fp16 \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Gli script TensorFlow utilizzano una [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) per il training distribuito e non devi aggiungere alcun argomento addizionale allo script di training. 
Lo script TensorFlow userà multiple GPU in modo predefinito se quest'ultime sono disponibili: - -## Esegui uno script su TPU - - - -Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. PyTorch supporta le TPU con il compilatore per deep learning [XLA](https://www.tensorflow.org/xla) (guarda [questo link](https://github.com/pytorch/xla/blob/master/README.md) per maggiori dettagli). Per usare una TPU, avvia lo script `xla_spawn.py` e usa l'argomento `num_cores` per impostare il numero di core TPU che intendi usare. - -```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestazioni. Gli script TensorFlow utilizzano una [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) per eseguire l'addestramento su TPU. Per usare una TPU, passa il nome della risorsa TPU all'argomento `tpu`. - -```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval -``` - - - -## Esegui uno script con 🤗 Accelerate - -🤗 [Accelerate](https://huggingface.co/docs/accelerate) è una libreria compatibile solo con PyTorch che offre un metodo unificato per addestrare modelli su diverse tipologie di configurazioni (CPU, multiple GPU, TPU) mantenendo una completa visibilità rispetto al ciclo di training di PyTorch. Assicurati di aver effettuato l'installazione di 🤗 Accelerate, nel caso non lo avessi fatto: - -> Nota: dato che Accelerate è in rapido sviluppo, è necessario installare la versione proveniente da git per eseguire gli script: -```bash -pip install git+https://github.com/huggingface/accelerate -``` - -Invece che usare lo script `run_summarization.py`, devi usare lo script `run_summarization_no_trainer.py`. Gli script supportati in 🤗 Accelerate avranno un file chiamato `task_no_trainer.py` nella rispettiva cartella. Per iniziare, esegui il seguente comando per creare e salvare un file di configurazione: - -```bash -accelerate config -``` - -Testa la tua configurazione per assicurarti della sua correttezza: - -```bash -accelerate test -``` - -Ora sei pronto per avviare l'addestramento: - -```bash -accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization -``` - -## Uso di un dataset personalizzato - -Lo script di summarization supporta dataset personalizzati purché siano file CSV o JSON Line. Quando usi il tuo dataset, devi specificare diversi argomenti aggiuntivi: - -- `train_file` e `validation_file` specificano dove si trovano i file di addestramento e validazione. -- `text_column` è il file di input da riassumere. -- `summary_column` è il file di destinazione per l'output. 
- -Uno script di summarization usando un dataset personalizzato sarebbe simile a questo: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --text_column text_column_name \ - --summary_column summary_column_name \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --predict_with_generate -``` - -## Testare uno script - -È spesso una buona idea avviare il tuo script su un numero inferiore di esempi tratti dal dataset, per assicurarti che tutto funzioni come previsto prima di eseguire lo script sull'intero dataset, che potrebbe necessitare di ore. Usa i seguenti argomenti per limitare il dataset ad un massimo numero di esempi: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Non tutti gli esempi di script supportano l'argomento `max_predict_samples`. Se non sei sicuro circa il supporto di questo argomento da parte del tuo script, aggiungi l'argomento `-h` per controllare: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Riavviare addestramento da un checkpoint - -Un'altra utile opzione è riavviare un addestramento da un checkpoint precedente. Questo garantirà che tu possa riprendere da dove hai interrotto senza ricominciare se l'addestramento viene interrotto. Ci sono due metodi per riavviare l'addestramento da un checkpoint: - -Il primo metodo usa l'argomento `output_dir previous_output_dir` per riavviare l'addestramento dall'ultima versione del checkpoint contenuto in `output_dir`. In questo caso, dovresti rimuovere `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -Il secondo metodo usa l'argomento `resume_from_checkpoint path_to_specific_checkpoint` per riavviare un addestramento da una specifica cartella di checkpoint. - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Condividi il tuo modello - -Tutti gli script possono caricare il tuo modello finale al [Model Hub](https://huggingface.co/models). 
Prima di iniziare, assicurati di aver effettuato l'accesso su Hugging Face: - -```bash -huggingface-cli login -``` - -Poi, aggiungi l'argomento `push_to_hub` allo script. Questo argomento consentirà di creare un repository con il tuo username Hugging Face e la cartella specificata in `output_dir`. - -Per dare uno specifico nome al repository, usa l'argomento `push_to_hub_model_id`. Il repository verrà automaticamente elencata sotto al tuo namespace. - -Il seguente esempio mostra come caricare un modello specificando il nome del repository: - -```bash -python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` diff --git a/docs/source/it/serialization.md b/docs/source/it/serialization.md new file mode 100644 index 000000000000..0067f1a3c52e --- /dev/null +++ b/docs/source/it/serialization.md @@ -0,0 +1,677 @@ + + +# Esporta modelli 🤗 Transformers + +Se devi implementare 🤗 modelli Transformers in ambienti di produzione, noi +consigliamo di esportarli in un formato serializzato che può essere caricato ed eseguito +su runtime e hardware specializzati. In questa guida ti mostreremo come farlo +esporta 🤗 Modelli Transformers in due formati ampiamente utilizzati: ONNX e TorchScript. + +Una volta esportato, un modello può essere ottimizato per l'inferenza tramite tecniche come +la quantizzazione e soppressione. Se sei interessato a ottimizzare i tuoi modelli per l'esecuzione +con la massima efficienza, dai un'occhiata a [🤗 Optimum +library](https://github.com/huggingface/optimum). + +## ONNX + +Il progetto [ONNX (Open Neural Network eXchange)](http://onnx.ai) Il progetto onnx è un open +standard che definisce un insieme comune di operatori e un formato di file comune a +rappresentano modelli di deep learning in un'ampia varietà di framework, tra cui +PyTorch e TensorFlow. Quando un modello viene esportato nel formato ONNX, questi +operatori sono usati per costruire un grafico computazionale (often called an +_intermediate representation_) che rappresenta il flusso di dati attraverso la +rete neurale. + +Esponendo un grafico con operatori e tipi di dati standardizzati, ONNX rende +più facile passare da un framework all'altro. Ad esempio, un modello allenato in PyTorch può +essere esportato in formato ONNX e quindi importato in TensorFlow (e viceversa). + +🤗 Transformers fornisce un pacchetto `transformers.onnx` che ti consente di +convertire i checkpoint del modello in un grafico ONNX sfruttando gli oggetti di configurazione. +Questi oggetti di configurazione sono già pronti per una serie di architetture di modelli, +e sono progettati per essere facilmente estensibili ad altre architetture. 
+ +Le configurazioni pronte includono le seguenti architetture: + + + +- ALBERT +- BART +- BEiT +- BERT +- BigBird +- BigBird-Pegasus +- Blenderbot +- BlenderbotSmall +- CamemBERT +- ConvBERT +- Data2VecText +- Data2VecVision +- DeiT +- DistilBERT +- ELECTRA +- FlauBERT +- GPT Neo +- GPT-J +- I-BERT +- LayoutLM +- M2M100 +- Marian +- mBART +- MobileBERT +- OpenAI GPT-2 +- Perceiver +- PLBart +- RoBERTa +- RoFormer +- SqueezeBERT +- T5 +- ViT +- XLM +- XLM-RoBERTa +- XLM-RoBERTa-XL + +Nelle prossime due sezioni, ti mostreremo come: + +* Esporta un modello supportato usando il pacchetto `transformers.onnx`. +* Esporta un modello personalizzato per un'architettura non supportata. + +### Esportazione di un modello in ONNX + +Per esportare un modello 🤗 Transformers in ONNX, dovrai prima installarne alcune +dipendenze extra: + +```bash +pip install transformers[onnx] +``` + +Il pacchetto `transformers.onnx` può essere usato come modulo Python: + +```bash +python -m transformers.onnx --help + +usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output + +positional arguments: + output Path indicating where to store generated ONNX model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model ID on huggingface.co or path on disk to load model from. + --feature {causal-lm, ...} + The type of features to export the model with. + --opset OPSET ONNX opset version to export the model with. + --atol ATOL Absolute difference tolerance when validating the model. +``` + +L'esportazione di un checkpoint utilizzando una configurazione già pronta può essere eseguita come segue: + +```bash +python -m transformers.onnx --model=distilbert-base-uncased onnx/ +``` + +che dovrebbe mostrare i seguenti log: + +```bash +Validating ONNX model... + -[✓] ONNX model output names match reference model ({'last_hidden_state'}) + - Validating ONNX Model output "last_hidden_state": + -[✓] (2, 8, 768) matches (2, 8, 768) + -[✓] all values close (atol: 1e-05) +All good, model saved at: onnx/model.onnx +``` + +Questo esporta un grafico ONNX del checkpoint definito dall'argomento `--model`. +In questo esempio è `distilbert-base-uncased`, ma può essere qualsiasi checkpoint +Hugging Face Hub o uno memorizzato localmente. + +Il file risultante `model.onnx` può quindi essere eseguito su uno dei [tanti +acceleratori](https://onnx.ai/supported-tools.html#deployModel) che supportano il +lo standard ONNX. Ad esempio, possiamo caricare ed eseguire il modello con [ONNX +Runtime](https://onnxruntime.ai/) come segue: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # ONNX Runtime expects NumPy arrays as input +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +I nomi di output richiesti (cioè `["last_hidden_state"]`) possono essere ottenuti +dando un'occhiata alla configurazione ONNX di ogni modello. 
Ad esempio, per +DistilBERT abbiamo: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"] +``` + +Il processo è identico per i checkpoint TensorFlow sull'hub. Ad esempio, noi +possiamo esportare un checkpoint TensorFlow puro da [Keras +organizzazione](https://huggingface.co/keras-io) come segue: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +Per esportare un modello memorizzato localmente, devi disporre dei pesi del modello +e file tokenizer memorizzati in una directory. Ad esempio, possiamo caricare e salvare un +checkpoint come segue: + + + +```python +>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification + +>>> # Load tokenizer and PyTorch weights form the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-pt-checkpoint") +>>> pt_model.save_pretrained("local-pt-checkpoint") +``` + +Una volta salvato il checkpoint, possiamo esportarlo su ONNX puntando l'argomento `--model` +del pacchetto `transformers.onnx` nella directory desiderata: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` + + +```python +>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +>>> # Load tokenizer and TensorFlow weights from the Hub +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> # Save to disk +>>> tokenizer.save_pretrained("local-tf-checkpoint") +>>> tf_model.save_pretrained("local-tf-checkpoint") +``` + +Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` +argument of the `transformers.onnx` package to the desired directory: + +```bash +python -m transformers.onnx --model=local-tf-checkpoint onnx/ +``` + + + +### Selezione delle caratteristiche per diverse topologie di modello + +Ogni configurazione già pronta viene fornita con una serie di _caratteristiche_ che ti consentono di +esportare modelli per diversi tipi di topologie o attività. Come mostrato nella tabella +di seguito, ogni caratteristica è associata a una diversa Auto Class: + +| Caratteristica | Auto Class | +| ------------------------------------ | ------------------------------------ | +| `causal-lm`, `causal-lm-with-past` | `AutoModelForCausalLM` | +| `default`, `default-with-past` | `AutoModel` | +| `masked-lm` | `AutoModelForMaskedLM` | +| `question-answering` | `AutoModelForQuestionAnswering` | +| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM` | +| `sequence-classification` | `AutoModelForSequenceClassification` | +| `token-classification` | `AutoModelForTokenClassification` | + +Per ciascuna configurazione, puoi trovare l'elenco delle funzionalità supportate tramite il +`FeaturesManager`. 
Ad esempio, per DistilBERT abbiamo:
+
+```python
+>>> from transformers.onnx.features import FeaturesManager
+
+>>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys())
+>>> print(distilbert_features)
+["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"]
+```
+
+Puoi quindi passare una di queste funzionalità all'argomento `--feature` del pacchetto `transformers.onnx`. Ad esempio, per esportare un modello di classificazione del testo possiamo scegliere un modello già ottimizzato con fine-tuning dall'Hub ed eseguire:
+
+```bash
+python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
+    --feature=sequence-classification onnx/
+```
+
+che visualizzerà i seguenti log:
+
+```bash
+Validating ONNX model...
+  -[✓] ONNX model output names match reference model ({'logits'})
+  - Validating ONNX Model output "logits":
+    -[✓] (2, 2) matches (2, 2)
+    -[✓] all values close (atol: 1e-05)
+All good, model saved at: onnx/model.onnx
+```
+
+Puoi notare che in questo caso i nomi di output del modello ottimizzato sono `logits` invece di `last_hidden_state`, che avevamo visto con il checkpoint `distilbert-base-uncased` precedente. Questo è quanto ci si aspetta dal modello ottimizzato con fine-tuning, visto che ha una testa di classificazione di sequenze.
+
+
+
+Le caratteristiche che hanno un suffisso `with-past` (ad es. `causal-lm-with-past`) corrispondono a topologie di modello con stati nascosti precalcolati (chiavi e valori nei blocchi di attenzione) che possono essere utilizzati per una decodifica autoregressiva veloce.
+
+
+
+
+### Esportazione di un modello per un'architettura non supportata
+
+Se desideri esportare un modello la cui architettura non è nativamente supportata dalla libreria, ci sono tre passaggi principali da seguire:
+
+1. Implementare una configurazione ONNX personalizzata.
+2. Esportare il modello in ONNX.
+3. Convalidare gli output di PyTorch e del modello esportato.
+
+In questa sezione, vedremo come DistilBERT è stato implementato per mostrare cosa è coinvolto in ogni passaggio.
+
+#### Implementazione di una configurazione ONNX personalizzata
+
+Iniziamo con l'oggetto di configurazione ONNX. Forniamo tre classi astratte da cui ereditare, a seconda del tipo di architettura del modello che desideri esportare:
+
+* I modelli basati su encoder ereditano da [`~onnx.config.OnnxConfig`]
+* I modelli basati su decoder ereditano da [`~onnx.config.OnnxConfigWithPast`]
+* I modelli encoder-decoder ereditano da [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
+
+
+
+Un buon modo per implementare una configurazione ONNX personalizzata è guardare l'implementazione esistente nel file `configuration_<model_name>.py` di un'architettura simile.
+
+
+
+Poiché DistilBERT è un modello basato su encoder, la sua configurazione eredita da `OnnxConfig`:
+
+```python
+>>> from typing import Mapping, OrderedDict
+>>> from transformers.onnx import OnnxConfig
+
+
+>>> class DistilBertOnnxConfig(OnnxConfig):
+...     @property
+...     def inputs(self) -> Mapping[str, Mapping[int, str]]:
+...         return OrderedDict(
+...             [
+...                 ("input_ids", {0: "batch", 1: "sequence"}),
+...                 ("attention_mask", {0: "batch", 1: "sequence"}),
+...             ]
+...         )
+```
+
+Ogni oggetto di configurazione deve implementare la proprietà `inputs` e restituire una mappatura, dove ogni chiave corrisponde a un input previsto e ogni valore indica l'asse di quell'input. Per DistilBERT, possiamo vedere che sono richiesti due input: `input_ids` e `attention_mask`.
+Questi input hanno la stessa forma, `(batch_size, sequence_length)`, ed è per questo motivo che nella configurazione compaiono gli stessi assi.
+
+
+
+Puoi notare che la proprietà `inputs` per `DistilBertOnnxConfig` restituisce un `OrderedDict`. Ciò garantisce che gli input corrispondano alla loro posizione relativa all'interno del metodo `PreTrainedModel.forward()` durante il tracciamento del grafico. Raccomandiamo di usare un `OrderedDict` per le proprietà `inputs` e `outputs` quando si implementano configurazioni ONNX personalizzate.
+
+
+
+Dopo aver implementato una configurazione ONNX, è possibile istanziarla fornendole la configurazione del modello di base, come segue:
+
+```python
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> onnx_config = DistilBertOnnxConfig(config)
+```
+
+L'oggetto risultante ha diverse proprietà utili. Ad esempio, è possibile visualizzare l'insieme di operatori (opset) ONNX che verrà utilizzato durante l'esportazione:
+
+```python
+>>> print(onnx_config.default_onnx_opset)
+11
+```
+
+È inoltre possibile visualizzare gli output associati al modello come segue:
+
+```python
+>>> print(onnx_config.outputs)
+OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})])
+```
+
+Puoi notare che la proprietà degli output segue la stessa struttura degli input: restituisce un `OrderedDict` di output con i rispettivi nomi e le loro forme. La struttura degli output è legata alla scelta della funzionalità con cui viene inizializzata la configurazione. Per impostazione predefinita, la configurazione ONNX viene inizializzata con la funzionalità `default`, che corrisponde all'esportazione di un modello caricato con la classe `AutoModel`. Se desideri esportare una topologia di modello diversa, è sufficiente fornire una funzionalità diversa all'argomento `task` quando inizializzi la configurazione ONNX. Ad esempio, se volessimo esportare DistilBERT con una testa di classificazione per sequenze, potremmo usare:
+
+```python
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification")
+>>> print(onnx_config_for_seq_clf.outputs)
+OrderedDict([('logits', {0: 'batch'})])
+```
+
+
+
+Tutte le proprietà e i metodi di base associati a [`~onnx.config.OnnxConfig`] e alle altre classi di configurazione possono essere sovrascritti se necessario. Guarda [`BartOnnxConfig`] per un esempio avanzato.
+
+
+
+#### Esportazione del modello
+
+Una volta implementata la configurazione ONNX, il passaggio successivo consiste nell'esportare il modello. Qui possiamo usare la funzione `export()` fornita dal pacchetto `transformers.onnx`. Questa funzione si aspetta la configurazione ONNX, insieme al modello di base, al tokenizer e al percorso in cui salvare il file esportato:
+
+```python
+>>> from pathlib import Path
+>>> from transformers.onnx import export
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> onnx_path = Path("model.onnx")
+>>> model_ckpt = "distilbert-base-uncased"
+>>> base_model = AutoModel.from_pretrained(model_ckpt)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+
+>>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path)
+```
+
+Gli `onnx_inputs` e `onnx_outputs` restituiti dalla funzione `export()` sono liste delle chiavi definite nelle proprietà `inputs` e `outputs` della configurazione.
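+
+A titolo puramente indicativo (si tratta di uno sketch, non di un output garantito dalla libreria), per il checkpoint DistilBERT esportato qui sopra con la funzionalità predefinita ci si aspetta liste simili a queste:
+
+```python
+>>> print(onnx_inputs)
+['input_ids', 'attention_mask']
+>>> print(onnx_outputs)
+['last_hidden_state']
+```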
+
+Una volta esportato il modello, puoi verificare che sia ben formato come segue:
+
+```python
+>>> import onnx
+
+>>> onnx_model = onnx.load("model.onnx")
+>>> onnx.checker.check_model(onnx_model)
+```
+
+
+
+Se il tuo modello è più grande di 2 GB, vedrai che durante l'esportazione vengono creati molti file aggiuntivi. Questo è _previsto_, perché ONNX utilizza i [Protocol Buffer](https://developers.google.com/protocol-buffers/) per memorizzare il modello e questi hanno un limite di dimensione di 2 GB. Vedi la [documentazione ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) per istruzioni su come caricare modelli con dati esterni.
+
+
+
+#### Convalida degli output del modello
+
+Il passaggio finale consiste nel verificare che gli output del modello di base e di quello esportato corrispondano entro una soglia di tolleranza assoluta. Qui possiamo usare la funzione `validate_model_outputs()` fornita dal pacchetto `transformers.onnx` come segue:
+
+```python
+>>> from transformers.onnx import validate_model_outputs
+
+>>> validate_model_outputs(
+...     onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation
+... )
+```
+
+Questa funzione usa il metodo `OnnxConfig.generate_dummy_inputs()` per generare gli input per il modello di base e per quello esportato; la tolleranza assoluta può essere definita nella configurazione. In genere la corrispondenza numerica è nell'intervallo da 1e-6 a 1e-4, anche se è probabile che qualsiasi valore inferiore a 1e-3 vada bene.
+
+### Contribuire con una nuova configurazione a 🤗 Transformers
+
+Stiamo cercando di espandere l'insieme di configurazioni già pronte e di accettare contributi della community! Se vuoi contribuire con la tua aggiunta alla libreria, dovrai:
+
+* Implementare la configurazione ONNX nel file `configuration_<model_name>.py` corrispondente
+* Includere l'architettura del modello e le funzionalità corrispondenti in [`~onnx.features.FeaturesManager`]
+* Aggiungere la tua architettura del modello ai test in `test_onnx_v2.py`
+
+Scopri com'è stata aggiunta la configurazione per [IBERT](https://github.com/huggingface/transformers/pull/14868/files) per avere un'idea di cosa comporta.
+
+## TorchScript
+
+
+
+Questo è l'inizio dei nostri esperimenti con TorchScript e stiamo ancora esplorando le sue capacità con modelli con input di dimensione variabile. È una nostra priorità e approfondiremo le nostre analisi nelle prossime versioni, con più esempi di codice, un'implementazione più flessibile e benchmark che confrontano il codice basato su Python con quello compilato con TorchScript.
+
+
+
+Secondo la documentazione di PyTorch: "TorchScript è un modo per creare modelli serializzabili e ottimizzabili a partire da codice PyTorch". I due moduli di PyTorch [JIT e TRACE](https://pytorch.org/docs/stable/jit.html) consentono allo sviluppatore di esportare il proprio modello per riutilizzarlo in altri programmi, come i programmi C++ orientati all'efficienza.
+
+Abbiamo fornito un'interfaccia che consente l'esportazione di modelli 🤗 Transformers in TorchScript, in modo che possano essere riutilizzati in un ambiente diverso da un programma Python basato su PyTorch. Qui spieghiamo come esportare e utilizzare i nostri modelli con TorchScript.
+
+Esportare un modello richiede due cose:
+
+- Un passaggio in avanti con input fittizi.
+- L'istanziazione del modello con il flag `torchscript`.
+
+Questi requisiti implicano diverse cose a cui gli sviluppatori dovrebbero prestare attenzione.
+Questi dettagli sono illustrati di seguito.
+
+### Flag TorchScript e pesi legati
+
+Questo flag è necessario perché la maggior parte dei modelli linguistici in questo repository ha pesi legati tra lo strato "Embedding" e lo strato "Decoding". TorchScript non consente l'esportazione di modelli che hanno pesi legati, quindi è necessario prima slegare e clonare i pesi.
+
+Ciò implica che i modelli istanziati con il flag `torchscript` hanno lo strato `Embedding` e lo strato `Decoding` separati, il che significa che non dovrebbero essere addestrati ulteriormente. L'addestramento desincronizzerebbe i due strati, portando a risultati inaspettati.
+
+Questo non è il caso dei modelli che non hanno una testa di modello linguistico (LM head), poiché questi non hanno pesi legati. Questi modelli possono essere esportati in sicurezza senza il flag `torchscript`.
+
+### Input fittizi e lunghezze standard
+
+Gli input fittizi sono usati per eseguire un passaggio in avanti del modello. Mentre i valori degli input si propagano attraverso gli strati, PyTorch tiene traccia delle diverse operazioni eseguite su ciascun tensore. Queste operazioni registrate vengono quindi utilizzate per creare la "traccia" del modello.
+
+La traccia viene creata in funzione delle dimensioni degli input. È quindi vincolata alle dimensioni dell'input fittizio e non funzionerà per altre lunghezze di sequenza o dimensioni di batch. Quando si prova con una dimensione diversa, viene sollevato un errore come:
+
+`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
+
+Si consiglia pertanto di tracciare il modello con una dimensione di input fittizio grande almeno quanto il più grande input che verrà fornito al modello durante l'inferenza. È possibile eseguire il padding per riempire i valori mancanti. Il modello sarà però tracciato con una dimensione di input grande e, di conseguenza, anche le dimensioni delle diverse matrici saranno grandi, con un maggior numero di calcoli.
+
+Si raccomanda di prestare attenzione al numero totale di operazioni eseguite su ciascun input e di seguire da vicino le prestazioni durante l'esportazione di modelli con lunghezza di sequenza variabile.
+
+### Usare TorchScript in Python
+
+Di seguito è riportato un esempio che mostra come salvare e caricare i modelli e come utilizzare la traccia per l'inferenza.
+
+#### Salvare un modello
+
+Questo frammento di codice mostra come usare TorchScript per esportare un `BertModel`. Qui il `BertModel` è istanziato a partire da una classe `BertConfig` e quindi salvato su disco con il nome file `traced_bert.pt`:
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+
+enc = BertTokenizer.from_pretrained("bert-base-uncased")
+
+# Tokenizing input text
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+tokenized_text = enc.tokenize(text)
+
+# Masking one of the input tokens
+masked_index = 8
+tokenized_text[masked_index] = "[MASK]"
+indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+# Creating a dummy input
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+dummy_input = [tokens_tensor, segments_tensors]
+
+# Initializing the model with the torchscript flag
+# Flag set to True even though it is not necessary as this model does not have an LM Head.
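+# Note: torchscript=True marks the configuration for tracing; with this flag any weights shared
+# between the embedding and decoding layers would be untied and cloned (see the tied-weights section above).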
+config = BertConfig(
+    vocab_size_or_config_json_file=32000,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    torchscript=True,
+)
+
+# Instantiating the model
+model = BertModel(config)
+
+# The model needs to be in evaluation mode
+model.eval()
+
+# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
+model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+
+# Creating the trace
+traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+torch.jit.save(traced_model, "traced_bert.pt")
+```
+
+#### Caricare un modello
+
+Questo frammento di codice mostra come caricare il `BertModel` che era stato precedentemente salvato su disco con il nome `traced_bert.pt`. Stiamo riutilizzando il `dummy_input` precedentemente inizializzato.
+
+```python
+loaded_model = torch.jit.load("traced_bert.pt")
+loaded_model.eval()
+
+all_encoder_layers, pooled_output = loaded_model(*dummy_input)
+```
+
+#### Utilizzare un modello tracciato per l'inferenza
+
+Usare il modello tracciato per l'inferenza è semplice come usare il suo metodo dunder `__call__`:
+
+```python
+traced_model(tokens_tensor, segments_tensors)
+```
+
+### Implementare modelli HuggingFace TorchScript su AWS utilizzando Neuron SDK
+
+AWS ha introdotto la famiglia di istanze [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) per l'inferenza di machine learning a basso costo e ad alte prestazioni nel cloud. Le istanze Inf1 sono alimentate dal chip AWS Inferentia, un acceleratore hardware personalizzato, specializzato in carichi di lavoro di inferenza di deep learning. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) è l'SDK per Inferentia che supporta il tracciamento e l'ottimizzazione dei modelli transformers per la distribuzione su Inf1. L'SDK Neuron fornisce:
+
+
+1. API di facile utilizzo, con una sola riga di codice da modificare, per tracciare e ottimizzare un modello TorchScript per l'inferenza nel cloud.
+2. Ottimizzazioni delle prestazioni pronte all'uso per un [miglior rapporto costi-prestazioni](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/).
+3. Supporto per i modelli transformer di HuggingFace costruiti con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
+
+#### Implicazioni
+
+I modelli Transformers basati sull'architettura [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), o le sue varianti come [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) e [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), funzioneranno al meglio su Inf1 per attività non generative come il question answering estrattivo, la classificazione di sequenze e la classificazione di token. In alternativa, le attività di generazione di testo possono essere adattate per essere eseguite su Inf1, secondo questo [tutorial AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html).
+Ulteriori informazioni sui modelli che possono essere convertiti fuori dagli schemi su Inferentia possono essere +trovati nella [sezione Model Architecture Fit della documentazione Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). + +#### Dipendenze + +L'utilizzo di AWS Neuron per convertire i modelli richiede le seguenti dipendenze e l'ambiente: + +* A [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), + which comes pre-configured on [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +#### Convertire un modello per AWS Neuron + +Usando lo stesso script come in [Usando TorchScipt in Python](https://huggingface.co/docs/transformers/main/en/serialization#using-torchscript-in-python) +per tracciare un "BertModel", importi l'estensione del framework `torch.neuron` per accedere +i componenti di Neuron SDK tramite un'API Python. + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` +E modificare solo la riga di codice di traccia + +Da: + +```python +torch.jit.trace(model, [tokens_tensor, segments_tensors]) +``` + +A: + +```python +torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +Questa modifica consente a Neuron SDK di tracciare il modello e ottimizzarlo per l'esecuzione nelle istanze Inf1. + +Per ulteriori informazioni sulle funzionalità, gli strumenti, i tutorial di esempi e gli ultimi aggiornamenti di AWS Neuron SDK, +consultare la [documentazione AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). \ No newline at end of file diff --git a/docs/source/it/serialization.mdx b/docs/source/it/serialization.mdx deleted file mode 100644 index 1dde00f429bd..000000000000 --- a/docs/source/it/serialization.mdx +++ /dev/null @@ -1,673 +0,0 @@ - - -# Esporta modelli 🤗 Transformers - -Se devi implementare 🤗 modelli Transformers in ambienti di produzione, noi -consigliamo di esportarli in un formato serializzato che può essere caricato ed eseguito -su runtime e hardware specializzati. In questa guida ti mostreremo come farlo -esporta 🤗 Modelli Transformers in due formati ampiamente utilizzati: ONNX e TorchScript. - -Una volta esportato, un modello può essere ottimizato per l'inferenza tramite tecniche come -la quantizzazione e soppressione. Se sei interessato a ottimizzare i tuoi modelli per l'esecuzione -con la massima efficienza, dai un'occhiata a [🤗 Optimum -library](https://github.com/huggingface/optimum). - -## ONNX - -Il progetto [ONNX (Open Neural Network eXchange)](http://onnx.ai) Il progetto onnx è un open -standard che definisce un insieme comune di operatori e un formato di file comune a -rappresentano modelli di deep learning in un'ampia varietà di framework, tra cui -PyTorch e TensorFlow. Quando un modello viene esportato nel formato ONNX, questi -operatori sono usati per costruire un grafico computazionale (often called an -_intermediate representation_) che rappresenta il flusso di dati attraverso la -rete neurale. - -Esponendo un grafico con operatori e tipi di dati standardizzati, ONNX rende -più facile passare da un framework all'altro. Ad esempio, un modello allenato in PyTorch può -essere esportato in formato ONNX e quindi importato in TensorFlow (e viceversa). 
- -🤗 Transformers fornisce un pacchetto `transformers.onnx` che ti consente di -convertire i checkpoint del modello in un grafico ONNX sfruttando gli oggetti di configurazione. -Questi oggetti di configurazione sono già pronti per una serie di architetture di modelli, -e sono progettati per essere facilmente estensibili ad altre architetture. - -Le configurazioni pronte includono le seguenti architetture: - - - -- ALBERT -- BART -- BEiT -- BERT -- BigBird -- BigBird-Pegasus -- Blenderbot -- BlenderbotSmall -- CamemBERT -- ConvBERT -- Data2VecText -- Data2VecVision -- DeiT -- DistilBERT -- ELECTRA -- FlauBERT -- GPT Neo -- GPT-J -- I-BERT -- LayoutLM -- M2M100 -- Marian -- mBART -- MobileBERT -- OpenAI GPT-2 -- Perceiver -- PLBart -- RoBERTa -- RoFormer -- SqueezeBERT -- T5 -- ViT -- XLM -- XLM-RoBERTa -- XLM-RoBERTa-XL - -Nelle prossime due sezioni, ti mostreremo come: - -* Esporta un modello supportato usando il pacchetto `transformers.onnx`. -* Esporta un modello personalizzato per un'architettura non supportata. - -### Esportazione di un modello in ONNX - -Per esportare un modello 🤗 Transformers in ONNX, dovrai prima installarne alcune -dipendenze extra: - -```bash -pip install transformers[onnx] -``` - -Il pacchetto `transformers.onnx` può essere usato come modulo Python: - -```bash -python -m transformers.onnx --help - -usage: Hugging Face Transformers ONNX exporter [-h] -m MODEL [--feature {causal-lm, ...}] [--opset OPSET] [--atol ATOL] output - -positional arguments: - output Path indicating where to store generated ONNX model. - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load model from. - --feature {causal-lm, ...} - The type of features to export the model with. - --opset OPSET ONNX opset version to export the model with. - --atol ATOL Absolute difference tolerance when validating the model. -``` - -L'esportazione di un checkpoint utilizzando una configurazione già pronta può essere eseguita come segue: - -```bash -python -m transformers.onnx --model=distilbert-base-uncased onnx/ -``` - -che dovrebbe mostrare i seguenti log: - -```bash -Validating ONNX model... - -[✓] ONNX model output names match reference model ({'last_hidden_state'}) - - Validating ONNX Model output "last_hidden_state": - -[✓] (2, 8, 768) matches (2, 8, 768) - -[✓] all values close (atol: 1e-05) -All good, model saved at: onnx/model.onnx -``` - -Questo esporta un grafico ONNX del checkpoint definito dall'argomento `--model`. -In questo esempio è `distilbert-base-uncased`, ma può essere qualsiasi checkpoint -Hugging Face Hub o uno memorizzato localmente. - -Il file risultante `model.onnx` può quindi essere eseguito su uno dei [tanti -acceleratori](https://onnx.ai/supported-tools.html#deployModel) che supportano il -lo standard ONNX. 
Ad esempio, possiamo caricare ed eseguire il modello con [ONNX -Runtime](https://onnxruntime.ai/) come segue: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -I nomi di output richiesti (cioè `["last_hidden_state"]`) possono essere ottenuti -dando un'occhiata alla configurazione ONNX di ogni modello. Ad esempio, per -DistilBERT abbiamo: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"] -``` - -Il processo è identico per i checkpoint TensorFlow sull'hub. Ad esempio, noi -possiamo esportare un checkpoint TensorFlow puro da [Keras -organizzazione](https://huggingface.co/keras-io) come segue: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -Per esportare un modello memorizzato localmente, devi disporre dei pesi del modello -e file tokenizer memorizzati in una directory. Ad esempio, possiamo caricare e salvare un -checkpoint come segue: - - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> # Load tokenizer and PyTorch weights form the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-pt-checkpoint") ->>> pt_model.save_pretrained("local-pt-checkpoint") -``` - -Una volta salvato il checkpoint, possiamo esportarlo su ONNX puntando l'argomento `--model` -del pacchetto `transformers.onnx` nella directory desiderata: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` - - -```python ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> # Load tokenizer and TensorFlow weights from the Hub ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") ->>> # Save to disk ->>> tokenizer.save_pretrained("local-tf-checkpoint") ->>> tf_model.save_pretrained("local-tf-checkpoint") -``` - -Once the checkpoint is saved, we can export it to ONNX by pointing the `--model` -argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-tf-checkpoint onnx/ -``` - - - -### Selezione delle caratteristiche per diverse topologie di modello - -Ogni configurazione già pronta viene fornita con una serie di _caratteristiche_ che ti consentono di -esportare modelli per diversi tipi di topologie o attività. 
Come mostrato nella tabella
-di seguito, ogni caratteristica è associata a una diversa Auto Class:
-
-| Caratteristica                       | Auto Class                           |
-| ------------------------------------ | ------------------------------------ |
-| `causal-lm`, `causal-lm-with-past`   | `AutoModelForCausalLM`               |
-| `default`, `default-with-past`       | `AutoModel`                          |
-| `masked-lm`                          | `AutoModelForMaskedLM`               |
-| `question-answering`                 | `AutoModelForQuestionAnswering`      |
-| `seq2seq-lm`, `seq2seq-lm-with-past` | `AutoModelForSeq2SeqLM`              |
-| `sequence-classification`            | `AutoModelForSequenceClassification` |
-| `token-classification`               | `AutoModelForTokenClassification`    |
-
-Per ciascuna configurazione, puoi trovare l'elenco delle funzionalità supportate tramite il
-`FeaturesManager`. Ad esempio, per DistilBERT abbiamo:
-
-```python
->>> from transformers.onnx.features import FeaturesManager
-
->>> distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("distilbert").keys())
->>> print(distilbert_features)
-["default", "masked-lm", "causal-lm", "sequence-classification", "token-classification", "question-answering"]
-```
-
-Puoi quindi passare una di queste funzionalità all'argomento `--feature` nel
-pacchetto `transformers.onnx`. Ad esempio, per esportare un modello di classificazione del testo
-possiamo scegliere un modello ottimizzato dall'Hub ed eseguire:
-
-```bash
-python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
-                            --feature=sequence-classification onnx/
-```
-
-che visualizzerà i seguenti log:
-
-```bash
-Validating ONNX model...
-
-[✓] ONNX model output names match reference model ({'logits'})
-
-	- Validating ONNX Model output "logits":
-
-[✓] (2, 2) matches (2, 2)
-
-[✓] all values close (atol: 1e-05)
-All good, model saved at: onnx/model.onnx
-```
-
-Puoi notare che in questo caso i nomi di output del modello ottimizzato sono
-`logits` invece di `last_hidden_state` che abbiamo visto con il
-checkpoint `distilbert-base-uncased` precedente. Questo è quanto ci si aspetta dal
-modello ottimizzato, visto che ha una testa di classificazione delle sequenze.
-
-
-
-Le caratteristiche che hanno un suffisso `with-past` (ad es. `causal-lm-with-past`)
-corrispondono a topologie di modello con stati nascosti precalcolati (chiave e valori
-nei blocchi di attenzione) che possono essere utilizzati per la decodifica autoregressiva veloce.
-
-
-
-
-### Esportazione di un modello per un'architettura non supportata
-
-Se desideri esportare un modello la cui architettura non è nativamente supportata dalla
-libreria, ci sono tre passaggi principali da seguire:
-
-1. Implementare una configurazione ONNX personalizzata.
-2. Esportare il modello in ONNX.
-3. Convalidare che gli output del modello PyTorch e di quello esportato corrispondano.
-
-In questa sezione, vedremo come DistilBERT è stato implementato per mostrare cosa è
-coinvolto in ogni passaggio.
-
-#### Implementazione di una configurazione ONNX personalizzata
-
-Iniziamo con l'oggetto di configurazione ONNX. Forniamo tre classi
-astratte da cui ereditare, a seconda del tipo di architettura
-del modello che desideri esportare:
-
-* I modelli basati su encoder ereditano da [`~onnx.config.OnnxConfig`]
-* I modelli basati su decoder ereditano da [`~onnx.config.OnnxConfigWithPast`]
-* I modelli encoder-decoder ereditano da [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
-
-
-
-Un buon modo per implementare una configurazione ONNX personalizzata è guardare l'implementazione
-esistente nel file `configuration_.py` di un'architettura simile.
- - - -Poiché DistilBERT è un modello basato su encoder, la sua configurazione eredita da -`OnnxConfig`: - -```python ->>> from typing import Mapping, OrderedDict ->>> from transformers.onnx import OnnxConfig - - ->>> class DistilBertOnnxConfig(OnnxConfig): -... @property -... def inputs(self) -> Mapping[str, Mapping[int, str]]: -... return OrderedDict( -... [ -... ("input_ids", {0: "batch", 1: "sequence"}), -... ("attention_mask", {0: "batch", 1: "sequence"}), -... ] -... ) -``` - -Ogni oggetto di configurazione deve implementare la proprietà `inputs` e restituire una -mappatura, dove ogni chiave corrisponde a un input previsto e ogni valore -indica l'asse di quell'input. Per DistilBERT, possiamo vedere che sono richiesti -due input: `input_ids` e `attention_mask`. Questi inputs hanno la stessa forma di -`(batch_size, sequence_length)` per questo motivo vediamo gli stessi assi usati nella -configurazione. - - - -Puoi notare che la proprietà `inputs` per `DistilBertOnnxConfig` restituisce un -`OrdinatoDict`. Ciò garantisce che gli input corrispondano alla loro posizione -relativa all'interno del metodo `PreTrainedModel.forward()` durante il tracciamento del grafico. -Raccomandiamo di usare un `OrderedDict` per le proprietà `inputs` e `outputs` -quando si implementano configurazioni ONNX personalizzate. - - - -Dopo aver implementato una configurazione ONNX, è possibile istanziarla -fornendo alla configurazione del modello base come segue: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config = DistilBertOnnxConfig(config) -``` - -L'oggetto risultante ha diverse proprietà utili. Ad esempio è possibile visualizzare il -Set operatore ONNX che verrà utilizzato durante l'esportazione: - -```python ->>> print(onnx_config.default_onnx_opset) -11 -``` - -È inoltre possibile visualizzare gli output associati al modello come segue: - -```python ->>> print(onnx_config.outputs) -OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"})]) -``` - -Puoi notare che la proprietà degli output segue la stessa struttura degli input; esso -restituisce un `OrderedDict` di output con nome e le loro forme. La struttura di output -è legato alla scelta della funzione con cui viene inizializzata la configurazione. -Per impostazione predefinita, la configurazione ONNX viene inizializzata con la funzione 'predefinita' -che corrisponde all'esportazione di un modello caricato con la classe `AutoModel`. Se tu -desideri esportare una topologia di modello diversa, è sufficiente fornire una funzionalità diversa a -l'argomento `task` quando inizializzi la configurazione ONNX. Ad esempio, se -volevamo esportare DistilBERT con una testa di classificazione per sequenze, potremmo -usare: - -```python ->>> from transformers import AutoConfig - ->>> config = AutoConfig.from_pretrained("distilbert-base-uncased") ->>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification") ->>> print(onnx_config_for_seq_clf.outputs) -OrderedDict([('logits', {0: 'batch'})]) -``` - - - -Tutte le proprietà e i metodi di base associati a [`~onnx.config.OnnxConfig`] e le -altre classi di configurazione possono essere sovrascritte se necessario. Guarda -[`BartOnnxConfig`] per un esempio avanzato. - - - -#### Esportazione del modello - -Una volta implementata la configurazione ONNX, il passaggio successivo consiste nell'esportare il -modello. Qui possiamo usare la funzione `export()` fornita dal -pacchetto `transformers.onnx`. 
Questa funzione prevede la configurazione ONNX, insieme -con il modello base e il tokenizer e il percorso per salvare il file esportato: - -```python ->>> from pathlib import Path ->>> from transformers.onnx import export ->>> from transformers import AutoTokenizer, AutoModel - ->>> onnx_path = Path("model.onnx") ->>> model_ckpt = "distilbert-base-uncased" ->>> base_model = AutoModel.from_pretrained(model_ckpt) ->>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt) - ->>> onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path) -``` - -Gli `onnx_inputs` e `onnx_outputs` restituiti dalla funzione `export()` sono -liste di chiavi definite nelle proprietà di `input` e `output` della -configurazione. Una volta esportato il modello, puoi verificare che il modello sia ben -formato come segue: - -```python ->>> import onnx - ->>> onnx_model = onnx.load("model.onnx") ->>> onnx.checker.check_model(onnx_model) -``` - - - -Se il tuo modello è più largo di 2 GB, vedrai che molti file aggiuntivi sono -creati durante l'esportazione. Questo è _previsto_ perché ONNX utilizza [Protocol -Buffer](https://developers.google.com/protocol-buffers/) per memorizzare il modello e -questi hanno un limite di dimensione 2 GB. Vedi la [Documentazione -ONNX](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md) -per istruzioni su come caricare modelli con dati esterni. - - - -#### Convalida degli output del modello - -Il passaggio finale consiste nel convalidare gli output dal modello di base e quello esportato -corrispondere entro una soglia di tolleranza assoluta. Qui possiamo usare la -Funzione `validate_model_outputs()` fornita dal pacchetto `transformers.onnx` -come segue: - -```python ->>> from transformers.onnx import validate_model_outputs - ->>> validate_model_outputs( -... onnx_config, tokenizer, base_model, onnx_path, onnx_outputs, onnx_config.atol_for_validation -... ) -``` - -Questa funzione usa il metodo `OnnxConfig.generate_dummy_inputs()` per generare -input per il modello di base e quello esportato e la tolleranza assoluta può essere -definita nella configurazione. Generalmente troviamo una corrispondenza numerica nell'intervallo da 1e-6 -a 1e-4, anche se è probabile che qualsiasi cosa inferiore a 1e-3 vada bene. - -### Contribuire con una nuova configurazione a 🤗 Transformers - -Stiamo cercando di espandere l'insieme di configurazioni già pronte e di accettare -contributi della community! Se vuoi contribuire con la tua aggiunta -nella libreria, dovrai: - -* Implementare la configurazione ONNX nella corrispondente `configuration file -_.py` -* Includere l'architettura del modello e le funzioni corrispondenti in [`~onnx.features.FeatureManager`] -* Aggiungere la tua architettura del modello ai test in `test_onnx_v2.py` - -Scopri come stato contribuito la configurazione per [IBERT] -(https://github.com/huggingface/transformers/pull/14868/files) per -avere un'idea di cosa è coinvolto. - -## TorchScript - - - -Questo è l'inizio dei nostri esperimenti con TorchScript e stiamo ancora esplorando le sue capacità con -modelli con variable-input-size. È una nostra priorità e approfondiremo le nostre analisi nelle prossime versioni, -con più esempi di codici, un'implementazione più flessibile e benchmark che confrontano i codici basati su Python con quelli compilati con -TorchScript. - - - -Secondo la documentazione di Pytorch: "TorchScript è un modo per creare modelli serializzabili e ottimizzabili da codice -Pytorch". 
I due moduli di Pytorch [JIT e TRACE](https://pytorch.org/docs/stable/jit.html) consentono allo sviluppatore di esportare -il loro modello da riutilizzare in altri programmi, come i programmi C++ orientati all'efficienza. - -Abbiamo fornito un'interfaccia che consente l'esportazione di modelli 🤗 Transformers in TorchScript in modo che possano essere riutilizzati -in un ambiente diverso rispetto a un programma Python basato su Pytorch. Qui spieghiamo come esportare e utilizzare i nostri modelli utilizzando -TorchScript. - -Esportare un modello richiede due cose: - -- Un passaggio in avanti con input fittizzi. -- Istanziazione del modello con flag `torchscript`. - -Queste necessità implicano diverse cose a cui gli sviluppatori dovrebbero prestare attenzione. Questi dettagli mostrati sotto. - -### Flag TorchScript e pesi legati - -Questo flag è necessario perché la maggior parte dei modelli linguistici in questo repository hanno pesi legati tra il loro -strato "Embedding" e lo strato "Decoding". TorchScript non consente l'esportazione di modelli che hanno pesi -legati, quindi è necessario prima slegare e clonare i pesi. - -Ciò implica che i modelli istanziati con il flag `torchscript` hanno il loro strato `Embedding` e strato `Decoding` -separato, il che significa che non dovrebbero essere addestrati in futuro. L'allenamento de-sincronizza i due -strati, portando a risultati inaspettati. - -Questo non è il caso per i modelli che non hanno una testa del modello linguistico, poiché quelli non hanno pesi legati. Questi modelli -può essere esportato in sicurezza senza il flag `torchscript`. - -### Input fittizi e standard lengths - -Gli input fittizzi sono usati per fare un modello passaggio in avanti . Mentre i valori degli input si propagano attraverso i strati, -Pytorch tiene traccia delle diverse operazioni eseguite su ciascun tensore. Queste operazioni registrate vengono quindi utilizzate per -creare la "traccia" del modello. - -La traccia viene creata relativamente alle dimensioni degli input. È quindi vincolato dalle dimensioni dell'input -fittizio e non funzionerà per altre lunghezze di sequenza o dimensioni batch. Quando si proverà con una dimensione diversa, ci sarà errore -come: - -`La dimensione espansa del tensore (3) deve corrispondere alla dimensione esistente (7) nella dimensione non singleton 2` - -will be raised. Si consiglia pertanto di tracciare il modello con una dimensione di input fittizia grande almeno quanto il più grande -input che verrà fornito al modello durante l'inferenza. È possibile eseguire il padding per riempire i valori mancanti. Il modello -sarà tracciato con una grande dimensione di input, tuttavia, anche le dimensioni della diverse matrici saranno grandi, -risultando in più calcoli. - -Si raccomanda di prestare attenzione al numero totale di operazioni eseguite su ciascun input e di seguire da vicino le prestazioni -durante l'esportazione di modelli di sequenza-lunghezza variabili. - -### Usare TorchSscript in Python - -Di seguito è riportato un esempio, che mostra come salvare, caricare modelli e come utilizzare la traccia per l'inferenza. - -#### Salvare un modello - -Questo frammento di codice mostra come usare TorchScript per esportare un `BertModel`. 
Qui il `BertModel` è istanziato secondo -una classe `BertConfig` e quindi salvato su disco con il nome del file `traced_bert.pt` - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch - -enc = BertTokenizer.from_pretrained("bert-base-uncased") - -# Tokenizing input text -text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) - -# Masking one of the input tokens -masked_index = 8 -tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - -# Creating a dummy input -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) -dummy_input = [tokens_tensor, segments_tensors] - -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. -config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, -) - -# Instantiating the model -model = BertModel(config) - -# The model needs to be in evaluation mode -model.eval() - -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag -model = BertModel.from_pretrained("bert-base-uncased", torchscript=True) - -# Creating the trace -traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) -torch.jit.save(traced_model, "traced_bert.pt") -``` - -#### Caricare un modello - -Questo frammento di codice mostra come caricare il `BertModel` che era stato precedentemente salvato su disco con il nome `traced_bert.pt`. -Stiamo riutilizzando il `dummy_input` precedentemente inizializzato. - -```python -loaded_model = torch.jit.load("traced_bert.pt") -loaded_model.eval() - -all_encoder_layers, pooled_output = loaded_model(*dummy_input) -``` - -#### Utilizzare un modello tracciato per l'inferenza - -Usare il modello tracciato per l'inferenza è semplice come usare il suo metodo dunder `__call__`: - -```python -traced_model(tokens_tensor, segments_tensors) -``` - -###Implementare modelli HuggingFace TorchScript su AWS utilizzando Neuron SDK - -AWS ha introdotto [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) -famiglia di istanze per l'inferenza di machine learning a basso costo e ad alte prestazioni nel cloud. -Le istanze Inf1 sono alimentate dal chip AWS Inferentia, un acceleratore hardware personalizzato, -specializzato in carichi di lavoro di inferenza di deep learning. -[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) -è l'SDK per Inferentia che supporta il tracciamento e l'ottimizzazione dei modelli transformers per -distribuzione su Inf1. L'SDK Neuron fornisce: - - -1. API di facile utilizzo con una riga di modifica del codice per tracciare e ottimizzare un modello TorchScript per l'inferenza nel cloud. -2. Ottimizzazioni delle prestazioni pronte all'uso per [miglioramento dei costi-prestazioni](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>) -3. Supporto per i modelli di trasformatori HuggingFace costruiti con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) - o [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). 
- -#### Implicazioni - -Modelli Transformers basati su architettura [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), -o sue varianti come [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) -e [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) -funzioneranno meglio su Inf1 per attività non generative come la question answering estrattive, -Classificazione della sequenza, Classificazione dei token. In alternativa, generazione di testo -le attività possono essere adattate per essere eseguite su Inf1, secondo questo [tutorial AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -Ulteriori informazioni sui modelli che possono essere convertiti fuori dagli schemi su Inferentia possono essere -trovati nella [sezione Model Architecture Fit della documentazione Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). - -#### Dipendenze - -L'utilizzo di AWS Neuron per convertire i modelli richiede le seguenti dipendenze e l'ambiente: - -* A [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide), - which comes pre-configured on [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). - -#### Convertire un modello per AWS Neuron - -Usando lo stesso script come in [Usando TorchScipt in Python](https://huggingface.co/docs/transformers/main/en/serialization#using-torchscript-in-python) -per tracciare un "BertModel", importi l'estensione del framework `torch.neuron` per accedere -i componenti di Neuron SDK tramite un'API Python. - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch -import torch.neuron -``` -E modificare solo la riga di codice di traccia - -Da: - -```python -torch.jit.trace(model, [tokens_tensor, segments_tensors]) -``` - -A: - -```python -torch.neuron.trace(model, [token_tensor, segments_tensors]) -``` - -Questa modifica consente a Neuron SDK di tracciare il modello e ottimizzarlo per l'esecuzione nelle istanze Inf1. - -Per ulteriori informazioni sulle funzionalità, gli strumenti, i tutorial di esempi e gli ultimi aggiornamenti di AWS Neuron SDK, -consultare la [documentazione AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). \ No newline at end of file diff --git a/docs/source/it/training.md b/docs/source/it/training.md new file mode 100644 index 000000000000..be0883f07b77 --- /dev/null +++ b/docs/source/it/training.md @@ -0,0 +1,376 @@ + + +# Fine-tuning di un modello pre-addestrato + +[[open-in-colab]] + +Ci sono benefici significativi nell'usare un modello pre-addestrato. Si riducono i costi computazionali, l'impronta di carbonio e ti consente di usare modelli stato dell'arte senza doverli addestrare da zero. 🤗 Transformers consente l'accesso a migliaia di modelli pre-addestrati per un'ampia gamma di compiti. Quando usi un modello pre-addestrato, lo alleni su un dataset specifico per il tuo compito. Questo è conosciuto come fine-tuning, una tecnica di addestramento incredibilmente potente. 
In questa esercitazione, potrai fare il fine-tuning di un modello pre-addestrato, con un framework di deep learning a tua scelta: + +* Fine-tuning di un modello pre-addestrato con 🤗 Transformers [`Trainer`]. +* Fine-tuning di un modello pre-addestrato in TensorFlow con Keras. +* Fine-tuning di un modello pre-addestrato con PyTorch. + + + +## Preparare un dataset + + + +Prima di poter fare il fine-tuning di un modello pre-addestrato, scarica un dataset e preparalo per l'addestramento. La precedente esercitazione ti ha mostrato come processare i dati per l'addestramento e adesso hai l'opportunità di metterti alla prova! + +Inizia caricando il dataset [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("yelp_review_full") +>>> dataset["train"][100] +{'label': 0, + 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +``` + +Come già sai, hai bisogno di un tokenizer per processare il testo e includere una strategia di padding e truncation per gestire sequenze di lunghezza variabile. Per processare il dataset in un unico passo, usa il metodo [`map`](https://huggingface.co/docs/datasets/process.html#map) di 🤗 Datasets che applica la funzione di preprocessing all'intero dataset: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + + +>>> def tokenize_function(examples): +... return tokenizer(examples["text"], padding="max_length", truncation=True) + + +>>> tokenized_datasets = dataset.map(tokenize_function, batched=True) +``` + +Se vuoi, puoi creare un sottoinsieme più piccolo del dataset per il fine-tuning così da ridurre il tempo necessario: + +```py +>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +``` + + + +## Addestramento + + + + + +🤗 Transformers mette a disposizione la classe [`Trainer`] ottimizzata per addestrare modelli 🤗 Transformers, rendendo semplice iniziare l'addestramento senza scrivere manualmente il tuo ciclo di addestramento. 
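A titolo puramente indicativo, lo schizzo che segue mostra come alcune opzioni comuni (logging periodico, gradient accumulation, mixed precision) si possano abilitare tramite [`TrainingArguments`], introdotta più avanti in questa sezione; i valori riportati sono soltanto esempi ipotetici, non impostazioni consigliate:
+
+```py
+from transformers import TrainingArguments
+
+# Schizzo indicativo: i parametri esistono in TrainingArguments, ma i valori sono solo ipotesi di esempio
+training_args = TrainingArguments(
+    output_dir="test_trainer",
+    logging_steps=100,  # registra le metriche di addestramento ogni 100 step
+    gradient_accumulation_steps=4,  # accumula i gradienti su 4 batch prima di aggiornare i pesi
+    fp16=True,  # abilita la mixed precision (richiede una GPU compatibile)
+)
+```
+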
L'API [`Trainer`] supporta infatti un'ampia gamma di opzioni e funzionalità di addestramento, tra cui logging, gradient accumulation e mixed precision.
+
+Inizia caricando il tuo modello e specificando il numero di etichette (labels) attese. Dalla [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields) del dataset Yelp Review, sai che ci sono cinque etichette:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+```
+
+
+
+Potresti vedere un warning dato che alcuni dei pesi pre-addestrati non sono stati utilizzati e altri pesi sono stati inizializzati casualmente. Non preoccuparti, è completamente normale! L'head pre-addestrata del modello BERT viene scartata e rimpiazzata da una classification head inizializzata casualmente. Farai il fine-tuning di questa nuova head del modello sul tuo compito di classificazione, trasferendogli la conoscenza del modello pre-addestrato.
+
+
+
+### Iperparametri per il training
+
+Successivamente, crea una classe [`TrainingArguments`] contenente tutti gli iperparametri che si possono regolare nonché le variabili per attivare le differenti opzioni di addestramento. Per questa esercitazione puoi iniziare con gli [iperparametri](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) di addestramento predefiniti, ma sentiti libero di sperimentare per trovare la configurazione ottimale per te.
+
+Specifica dove salvare i checkpoint del tuo addestramento:
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> training_args = TrainingArguments(output_dir="test_trainer")
+```
+
+### Metriche
+
+[`Trainer`] non valuta automaticamente le performance del modello durante l'addestramento. Dovrai passare a [`Trainer`] una funzione che calcola e restituisce le metriche. La libreria 🤗 Datasets mette a disposizione una semplice funzione [`accuracy`](https://huggingface.co/metrics/accuracy) che puoi caricare con la funzione `load_metric` (guarda questa [esercitazione](https://huggingface.co/docs/datasets/metrics.html) per maggiori informazioni):
+
+```py
+>>> import numpy as np
+>>> from datasets import load_metric
+
+>>> metric = load_metric("accuracy")
+```
+
+Richiama `compute` su `metric` per calcolare l'accuratezza delle tue previsioni. Prima di passare le tue previsioni a `compute`, hai bisogno di convertirle in logits (ricorda che tutti i modelli 🤗 Transformers restituiscono logits):
+
+```py
+>>> def compute_metrics(eval_pred):
+...     logits, labels = eval_pred
+...     predictions = np.argmax(logits, axis=-1)
+...     return metric.compute(predictions=predictions, references=labels)
+```
+
+Se preferisci monitorare le tue metriche di valutazione durante il fine-tuning, specifica il parametro `evaluation_strategy` nei tuoi training arguments per restituire le metriche di valutazione ad ogni epoca di addestramento:
+
+```py
+>>> from transformers import TrainingArguments, Trainer
+
+>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+```
+
+### Trainer
+
+Crea un oggetto [`Trainer`] col tuo modello, training arguments, dataset di training e test, e funzione di valutazione:
+
+```py
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=small_train_dataset,
+...     eval_dataset=small_eval_dataset,
+...     compute_metrics=compute_metrics,
+... 
) +``` + +Poi metti a punto il modello richiamando [`~transformers.Trainer.train`]: + +```py +>>> trainer.train() +``` + + + + + + +I modelli 🤗 Transformers supportano anche l'addestramento in TensorFlow usando l'API di Keras. + +### Convertire dataset nel formato per TensorFlow + +Il [`DefaultDataCollator`] assembla tensori in lotti su cui il modello si addestrerà. Assicurati di specificare di restituire tensori per TensorFlow in `return_tensors`: + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +[`Trainer`] usa [`DataCollatorWithPadding`] in maniera predefinita in modo da non dover specificare esplicitamente un collettore di dati. + + + +Successivamente, converti i datasets tokenizzati in TensorFlow datasets con il metodo [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specifica il tuo input in `columns` e le tue etichette in `label_cols`: + +```py +>>> tf_train_dataset = small_train_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols=["labels"], +... shuffle=True, +... collate_fn=data_collator, +... batch_size=8, +... ) + +>>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( +... columns=["attention_mask", "input_ids", "token_type_ids"], +... label_cols=["labels"], +... shuffle=False, +... collate_fn=data_collator, +... batch_size=8, +... ) +``` + +### Compilazione e addestramento + +Carica un modello TensorFlow col numero atteso di etichette: + +```py +>>> import tensorflow as tf +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +``` + +Poi compila e fai il fine-tuning del tuo modello usando [`fit`](https://keras.io/api/models/model_training_apis/) come faresti con qualsiasi altro modello di Keras: + +```py +>>> model.compile( +... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), +... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), +... metrics=tf.metrics.SparseCategoricalAccuracy(), +... ) + +>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) +``` + + + + + +## Addestramento in PyTorch nativo + + + + + +[`Trainer`] si occupa del ciclo di addestramento e ti consente di mettere a punto un modello con una sola riga di codice. Per chi preferisse scrivere un proprio ciclo di addestramento personale, puoi anche fare il fine-tuning di un modello 🤗 Transformers in PyTorch nativo. + +A questo punto, potresti avere bisogno di riavviare il tuo notebook o eseguire il seguente codice per liberare un po' di memoria: + +```py +del model +del pytorch_model +del trainer +torch.cuda.empty_cache() +``` + +Successivamente, postprocessa manualmente il `tokenized_dataset` per prepararlo ad essere allenato. + +1. Rimuovi la colonna `text` perché il modello non accetta testo grezzo come input: + + ```py + >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + ``` + +2. Rinomina la colonna `label` in `labels` perché il modello si aspetta che questo argomento si chiami `labels`: + + ```py + >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + ``` + +3. 
Imposta il formato del dataset in modo che restituisca tensori di PyTorch invece di liste:
+
+    ```py
+    >>> tokenized_datasets.set_format("torch")
+    ```
+
+Poi crea un piccolo sottocampione del dataset come visto precedentemente per velocizzare il fine-tuning:
+
+```py
+>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+```
+
+### DataLoader
+
+Crea un `DataLoader` per i tuoi dataset di train e test così puoi iterare sui lotti di dati:
+
+```py
+>>> from torch.utils.data import DataLoader
+
+>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
+>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
+```
+
+Carica il tuo modello con il numero atteso di etichette:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+```
+
+### Ottimizzatore e learning rate scheduler
+
+Crea un ottimizzatore e il learning rate scheduler per fare il fine-tuning del modello. Usa l'ottimizzatore [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) di PyTorch:
+
+```py
+>>> from torch.optim import AdamW
+
+>>> optimizer = AdamW(model.parameters(), lr=5e-5)
+```
+
+Crea il learning rate scheduler predefinito da [`Trainer`]:
+
+```py
+>>> from transformers import get_scheduler
+
+>>> num_epochs = 3
+>>> num_training_steps = num_epochs * len(train_dataloader)
+>>> lr_scheduler = get_scheduler(
+...     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+... )
+```
+
+Infine, specifica di usare una GPU come `device`, se ne hai una a disposizione. Altrimenti, l'addestramento su una CPU può richiedere diverse ore invece di un paio di minuti.
+
+```py
+>>> import torch
+
+>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+>>> model.to(device)
+```
+
+
+
+Ottieni l'accesso gratuito a una GPU sul cloud se non ne possiedi una usando un notebook sul web come [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/).
+
+
+
+Ottimo, adesso possiamo addestrare! 🥳
+
+### Training loop
+
+Per tenere traccia dei tuoi progressi durante l'addestramento, usa la libreria [tqdm](https://tqdm.github.io/) per aggiungere una progress bar sopra il numero dei passi di addestramento:
+
+```py
+>>> from tqdm.auto import tqdm
+
+>>> progress_bar = tqdm(range(num_training_steps))
+
+>>> model.train()
+>>> for epoch in range(num_epochs):
+...     for batch in train_dataloader:
+...         batch = {k: v.to(device) for k, v in batch.items()}
+...         outputs = model(**batch)
+...         loss = outputs.loss
+...         loss.backward()
+
+...         optimizer.step()
+...         lr_scheduler.step()
+...         optimizer.zero_grad()
+...         progress_bar.update(1)
+```
+
+### Metriche
+
+Proprio come è necessario aggiungere una funzione di valutazione al [`Trainer`], è necessario fare lo stesso quando si scrive il proprio ciclo di addestramento. Ma invece di calcolare e riportare la metrica alla fine di ogni epoca, questa volta accumulerai tutti i batch con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) e calcolerai la metrica alla fine.
+
+```py
+>>> metric = load_metric("accuracy")
+>>> model.eval()
+>>> for batch in eval_dataloader:
+... 
batch = {k: v.to(device) for k, v in batch.items()} +... with torch.no_grad(): +... outputs = model(**batch) + +... logits = outputs.logits +... predictions = torch.argmax(logits, dim=-1) +... metric.add_batch(predictions=predictions, references=batch["labels"]) + +>>> metric.compute() +``` + + + + + +## Altre risorse + +Per altri esempi sul fine-tuning, fai riferimento a: + +- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) include scripts per addestrare compiti comuni di NLP in PyTorch e TensorFlow. + +- [🤗 Transformers Notebooks](notebooks) contiene diversi notebooks su come mettere a punto un modello per compiti specifici in PyTorch e TensorFlow. diff --git a/docs/source/it/training.mdx b/docs/source/it/training.mdx deleted file mode 100644 index 68f6434bbb5a..000000000000 --- a/docs/source/it/training.mdx +++ /dev/null @@ -1,372 +0,0 @@ - - -# Fine-tuning di un modello pre-addestrato - -[[open-in-colab]] - -Ci sono benefici significativi nell'usare un modello pre-addestrato. Si riducono i costi computazionali, l'impronta di carbonio e ti consente di usare modelli stato dell'arte senza doverli addestrare da zero. 🤗 Transformers consente l'accesso a migliaia di modelli pre-addestrati per un'ampia gamma di compiti. Quando usi un modello pre-addestrato, lo alleni su un dataset specifico per il tuo compito. Questo è conosciuto come fine-tuning, una tecnica di addestramento incredibilmente potente. In questa esercitazione, potrai fare il fine-tuning di un modello pre-addestrato, con un framework di deep learning a tua scelta: - -* Fine-tuning di un modello pre-addestrato con 🤗 Transformers [`Trainer`]. -* Fine-tuning di un modello pre-addestrato in TensorFlow con Keras. -* Fine-tuning di un modello pre-addestrato con PyTorch. - - - -## Preparare un dataset - - - -Prima di poter fare il fine-tuning di un modello pre-addestrato, scarica un dataset e preparalo per l'addestramento. La precedente esercitazione ti ha mostrato come processare i dati per l'addestramento e adesso hai l'opportunità di metterti alla prova! - -Inizia caricando il dataset [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full): - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. 
It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} -``` - -Come già sai, hai bisogno di un tokenizer per processare il testo e includere una strategia di padding e truncation per gestire sequenze di lunghezza variabile. Per processare il dataset in un unico passo, usa il metodo [`map`](https://huggingface.co/docs/datasets/process.html#map) di 🤗 Datasets che applica la funzione di preprocessing all'intero dataset: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - - ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` - -Se vuoi, puoi creare un sottoinsieme più piccolo del dataset per il fine-tuning così da ridurre il tempo necessario: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - - - -## Addestramento - - - - - -🤗 Transformers mette a disposizione la classe [`Trainer`] ottimizzata per addestrare modelli 🤗 Transformers, rendendo semplice iniziare l'addestramento senza scrivere manualmente il tuo ciclo di addestramento. L'API [`Trainer`] supporta un'ampia gamma di opzioni e funzionalità di addestramento come logging, gradient accumulation e mixed precision. - -Inizia caricando il tuo modello e specificando il numero di etichette (labels) attese. Nel dataset Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), sai che ci sono cinque etichette: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - - - -Potresti vedere un warning dato che alcuni dei pesi pre-addestrati non sono stati utilizzati e altri pesi sono stati inizializzati casualmente. Non preoccuparti, è completamente normale! L'head pre-addestrata del modello BERT viene scartata e rimpiazzata da una classification head inizializzata casualmente. Farai il fine-tuning di questa nuova head del modello sul tuo compito di classificazione, trasferendogli la conoscenza del modello pre-addestrato. - - - -### Iperparametri per il training - -Successivamente, crea una classe [`TrainingArguments`] contenente tutti gli iperparametri che si possono regore nonché le variabili per attivare le differenti opzioni di addestramento. Per questa esercitazione puoi iniziare con gli [iperparametri](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) di ddestramento predefiniti, ma sentiti libero di sperimentare per trovare la configurazione ottimale per te. - -Specifica dove salvare i checkpoints del tuo addestramento: - -```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Metriche - -[`Trainer`] non valuta automaticamente le performance del modello durante l'addestramento. Dovrai passare a [`Trainer`] una funzione che calcola e restituisce le metriche. 
La libreria 🤗 Datasets mette a disposizione una semplice funzione [`accuracy`](https://huggingface.co/metrics/accuracy) che puoi caricare con la funzione `load_metric` (guarda questa [esercitazione](https://huggingface.co/docs/datasets/metrics.html) per maggiori informazioni): - -```py ->>> import numpy as np ->>> from datasets import load_metric - ->>> metric = load_metric("accuracy") -``` - -Richiama `compute` su `metric` per calcolare l'accuratezza delle tue previsioni. Prima di passare le tue previsioni a `compute`, hai bisogno di convertirle in logits (ricorda che tutti i modelli 🤗 Transformers restituiscono logits): - -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) -``` - -Se preferisci monitorare le tue metriche di valutazione durante il fine-tuning, specifica il parametro `evaluation_strategy` nei tuoi training arguments per restituire le metriche di valutazione ad ogni epoca di addestramento: - -```py ->>> from transformers import TrainingArguments, Trainer - ->>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") -``` - -### Trainer - -Crea un oggetto [`Trainer`] col tuo modello, training arguments, dataset di training e test, e funzione di valutazione: - -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) -``` - -Poi metti a punto il modello richiamando [`~transformers.Trainer.train`]: - -```py ->>> trainer.train() -``` - - - - - - -I modelli 🤗 Transformers supportano anche l'addestramento in TensorFlow usando l'API di Keras. - -### Convertire dataset nel formato per TensorFlow - -Il [`DefaultDataCollator`] assembla tensori in lotti su cui il modello si addestrerà. Assicurati di specificare di restituire tensori per TensorFlow in `return_tensors`: - -```py ->>> from transformers import DefaultDataCollator - ->>> data_collator = DefaultDataCollator(return_tensors="tf") -``` - - - -[`Trainer`] usa [`DataCollatorWithPadding`] in maniera predefinita in modo da non dover specificare esplicitamente un collettore di dati. - - - -Successivamente, converti i datasets tokenizzati in TensorFlow datasets con il metodo [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Specifica il tuo input in `columns` e le tue etichette in `label_cols`: - -```py ->>> tf_train_dataset = small_train_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=True, -... collate_fn=data_collator, -... batch_size=8, -... ) - ->>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( -... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], -... shuffle=False, -... collate_fn=data_collator, -... batch_size=8, -... 
) -``` - -### Compilazione e addestramento - -Carica un modello TensorFlow col numero atteso di etichette: - -```py ->>> import tensorflow as tf ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -Poi compila e fai il fine-tuning del tuo modello usando [`fit`](https://keras.io/api/models/model_training_apis/) come faresti con qualsiasi altro modello di Keras: - -```py ->>> model.compile( -... optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), -... loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), -... metrics=tf.metrics.SparseCategoricalAccuracy(), -... ) - ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3) -``` - - - - - -## Addestramento in PyTorch nativo - - - - - -[`Trainer`] si occupa del ciclo di addestramento e ti consente di mettere a punto un modello con una sola riga di codice. Per chi preferisse scrivere un proprio ciclo di addestramento personale, puoi anche fare il fine-tuning di un modello 🤗 Transformers in PyTorch nativo. - -A questo punto, potresti avere bisogno di riavviare il tuo notebook o eseguire il seguente codice per liberare un po' di memoria: - -```py -del model -del pytorch_model -del trainer -torch.cuda.empty_cache() -``` - -Successivamente, postprocessa manualmente il `tokenized_dataset` per prepararlo ad essere allenato. - -1. Rimuovi la colonna `text` perché il modello non accetta testo grezzo come input: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Rinomina la colonna `label` in `labels` perché il modello si aspetta che questo argomento si chiami `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. Imposta il formato del dataset per farti restituire tensori di PyTorch all'interno delle liste: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Poi crea un piccolo sottocampione del dataset come visto precedentemente per velocizzare il fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Crea un `DataLoader` per i tuoi datasets di train e test così puoi iterare sui lotti di dati: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Carica il tuo modello con il numero atteso di etichette: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) -``` - -### Ottimizzatore e learning rate scheduler - -Crea un ottimizzatore e il learning rate scheduler per fare il fine-tuning del modello. Usa l'ottimizzatore [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) di PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Crea il learning rate scheduler predefinito da [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... 
) -``` - -Infine specifica come `device` da usare una GPU se ne hai una. Altrimenti, l'addestramento su una CPU può richiedere diverse ore invece di un paio di minuti. - -```py ->>> import torch - ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ->>> model.to(device) -``` - - - -Ottieni l'accesso gratuito a una GPU sul cloud se non ne possiedi una usando un notebook sul web come [Colaboratory](https://colab.research.google.com/) o [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Ottimo, adesso possiamo addestrare! 🥳 - -### Training loop - -Per tenere traccia dei tuoi progressi durante l'addestramento, usa la libreria [tqdm](https://tqdm.github.io/) per aggiungere una progress bar sopra il numero dei passi di addestramento: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Metriche - -Proprio come è necessario aggiungere una funzione di valutazione del [`Trainer`], è necessario fare lo stesso quando si scrive il proprio ciclo di addestramento. Ma invece di calcolare e riportare la metrica alla fine di ogni epoca, questa volta accumulerai tutti i batch con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) e calcolerai la metrica alla fine. - -```py ->>> metric = load_metric("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Altre risorse - -Per altri esempi sul fine-tuning, fai riferimento a: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) include scripts per addestrare compiti comuni di NLP in PyTorch e TensorFlow. - -- [🤗 Transformers Notebooks](notebooks) contiene diversi notebooks su come mettere a punto un modello per compiti specifici in PyTorch e TensorFlow. 
diff --git a/docs/source/ja/_toctree.yml b/docs/source/ja/_toctree.yml new file mode 100644 index 000000000000..8ac8b1e3183f --- /dev/null +++ b/docs/source/ja/_toctree.yml @@ -0,0 +1,14 @@ +- sections: + - local: index + title: 🤗 Transformers + - local: installation + title: インストール + title: はじめに +- sections: + - local: accelerate + title: 🤗 Accelerate を用いた分散学習 + title: チュートリアル +- sections: + - sections: + - local: multilingual + title: 推論のための多言語モデル \ No newline at end of file diff --git a/docs/source/ja/accelerate.md b/docs/source/ja/accelerate.md new file mode 100644 index 000000000000..73e45b9cd3c5 --- /dev/null +++ b/docs/source/ja/accelerate.md @@ -0,0 +1,136 @@ + + +# 🤗 Accelerate を用いた分散学習 + +モデルが大きくなるにつれて、限られたハードウェアでより大きなモデルを訓練し、訓練速度を大幅に上昇させるための方法として並列処理が浮上してきました。1台のマシンに複数のGPUがあっても、複数のマシンにまたがる複数のGPUがあっても、あらゆるタイプの分散処理セットアップ上でユーザーが簡単に 🤗 Transformers モデルを訓練できるように、 Hugging Face では [🤗 Accelerate](https://huggingface.co/docs/accelerate) ライブラリを作成しました。このチュートリアルでは、PyTorch の訓練ループをカスタマイズして、分散処理環境での訓練を可能にする方法について学びます。 + +## セットアップ + +はじめに 🤗 Accelerate をインストールしましょう: + +```bash +pip install accelerate +``` + +そしたらインポートして [`~accelerate.Accelerator`] オブジェクトを作成しましょう。[`~accelerate.Accelerator`] は分散処理セットアップを自動的に検出し、訓練のために必要な全てのコンポーネントを初期化します。モデルをデバイスに明示的に配置する必要はありません。 + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Accelerate する準備をしましょう + +次に、関連する全ての訓練オブジェクトを [`~accelerate.Accelerator.prepare`] メソッドに渡します。これには、訓練と評価それぞれのDataloader、モデル、optimizer が含まれます: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +最後に訓練ループ内の `loss.backward()` を 🤗 Accelerate の [`~accelerate.Accelerator.backward`] メソッドで置き換えます: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +以下のコードで確認できる通り、訓練ループに4行のコードを追加するだけで分散学習が可能です! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## 訓練する + +関連するコードを追加したら、スクリプトまたは Colaboratory などのノートブックで訓練を開始します。 + +### スクリプトで訓練する + +スクリプトから訓練をしている場合は、設定ファイルを作成・保存するために以下のコマンドを実行してください: + +```bash +accelerate config +``` + +そして次のようにして訓練を開始します: + +```bash +accelerate launch train.py +``` + +### ノートブックで訓練する + +Colaboratory の TPU の利用をお考えの場合、🤗 Accelerate はノートブック上で実行することもできます。訓練に必要な全てのコードを関数に含め、[`~accelerate.notebook_launcher`] に渡してください: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate と豊富な機能についてもっと知りたい方は[ドキュメント](https://huggingface.co/docs/accelerate)を参照してください。 diff --git a/docs/source/ja/index.md b/docs/source/ja/index.md new file mode 100644 index 000000000000..364a3b34caba --- /dev/null +++ b/docs/source/ja/index.md @@ -0,0 +1,399 @@ + + +# 🤗 Transformers + +[PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), [JAX](https://jax.readthedocs.io/en/latest/)のための最先端機械学習。 + +🤗 Transformers は最先端の学習済みモデルを簡単にダウンロードして学習するAPIとツールを提供します。学習済みモデルを使用することで計算コストと二酸化炭素の排出量を削減でき、またゼロからモデルを学習するために要求される時間とリソースを節約することができます。 これらのモデルは以下のような異なるモダリティにおける一般的なタスクをサポートします: + +📝 **自然言語処理**: テキスト分類、 固有表現抽出、 質問応答、 言語モデリング、 文章要約、 機械翻訳、 複数選択、テキスト生成。
+🖼️ **コンピュータビジョン**: 画像分類、 物体検出、 セグメンテーション。
+🗣️ **音声**: 自動音声認識、音声分類。
+🐙 **マルチモーダル**: テーブル質問応答、 光学文字認識(OCR)、 スキャンされたドキュメントからの情報抽出、 動画分類、 visual question answering(視覚的質問応答)。 + +🤗 Transformers はPyTorch, TensorFlow, JAX間のフレームワーク相互運用性をサポートしています。 これはモデルの各段階で異なるフレームワークを使うための柔軟性を提供します。あるフレームワークで3行のコードでモデルを学習し、別のフレームワークで推論のためにモデルをロードすることが可能です。また、本番環境のデプロイのためにモデルをONNXやTorchScriptのような形式でエクスポートすることも可能です。 + +[Hub](https://huggingface.co/models), [forum](https://discuss.huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb)で成長中のコミュニティに今日参加しましょう! + +## Hugging Faceチームによるカスタムサポートをご希望の場合 + + + HuggingFace Expert Acceleration Program + + +## 目次 + +ドキュメントは以下の5つのセクションで構成されています: + +- **はじめに** は、ライブラリのクイックツアーとライブラリを使い始めるためのインストール手順を提供しています。 +- **チュートリアル** は、初心者が始めるのに最適な場所です。このセクションでは、ライブラリを使い始めるために必要な基本的なスキルを習得できます。 +- **HOW-TOガイド** は、言語モデリングのために学習済みモデルをfinetuningすることやカスタムモデルの作成と共有の方法などといった特定の目標を達成するための方法を示しています。 +- **コンセプトガイド** は、モデルやタスク、そして 🤗 Transformersの設計思想の背景にある基本的にコンセプトや考え方についてより深く考察し解説しています。 +- **API** 全てのクラスと関数を説明します: + + - **MAIN CLASSES** は、configuration, model, tokenizer, pipelineといった最も重要なクラスについて詳細に説明しています。 + - **MODELS** は、ライブラリで実装されているそれぞれのモデルに関連したクラスと関数を詳細に説明しています。 + - **INTERNAL HELPERS** は、内部で使用されているユーティリティクラスや関数を詳細に説明しています。 + +### サポートされているモデル + + + +1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (Google Research and the Toyota Technological Institute at Chicago から) Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut から公開された研究論文: [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) +1. **[AltCLIP](https://huggingface.co/docs/transformers/main/model_doc/altclip)** (BAAI から) Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell から公開された研究論文: [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (MIT から) Yuan Gong, Yu-An Chung, James Glass から公開された研究論文: [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) +1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (Facebook から) Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer から公開された研究論文: [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) +1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (École polytechnique から) Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis から公開された研究論文: [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) +1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research から) Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen から公開された研究論文: [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) +1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (Microsoft から) Hangbo Bao, Li Dong, Furu Wei から公開された研究論文: [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) +1. 
**[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (Google から) Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova から公開された研究論文: [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) +1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (Google から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) +1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research から) Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen から公開された研究論文: [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) +1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) +1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (Google Research から) Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed から公開された研究論文: [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) +1. **[BioGpt](https://huggingface.co/docs/transformers/main/model_doc/biogpt)** (Microsoft Research AI4Science から) Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu から公開された研究論文: [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) +1. **[BiT](https://huggingface.co/docs/transformers/main/model_doc/bit)** (Google AI から) Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil から公開された研究論文: [Big Transfer (BiT)](https://arxiv.org/abs/1912.11370)Houlsby. +1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) +1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (Facebook から) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston から公開された研究論文: [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) +1. **[BLIP](https://huggingface.co/docs/transformers/main/model_doc/blip)** (Salesforce から) Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi から公開された研究論文: [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) +1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (BigScience workshop から) [BigScience Workshop](https://bigscience.huggingface.co/) から公開されました. +1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (Alexa から) Adrian de Wynter and Daniel J. 
Perry から公開された研究論文: [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) +1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (Google Research から) Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel から公開された研究論文: [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) +1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) +1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) +1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) +1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) +1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) +1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) +1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (Microsoft Research Asia から) Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang から公開された研究論文: [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) +1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (YituTech から) Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan から公開された研究論文: [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) +1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. +1. 
**[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) +1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) +1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) +1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) +1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) +1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) +1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) +1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research から) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai から公開された研究論文: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) +1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook から) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou から公開された研究論文: [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) +1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook から) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko から公開された研究論文: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) +1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research から) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan から公開された研究論文: [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) +1. 
**[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (SHI Labs から) Ali Hassani and Humphrey Shi から公開された研究論文: [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) +1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (HuggingFace から), Victor Sanh, Lysandre Debut and Thomas Wolf. 同じ手法で GPT2, RoBERTa と Multilingual BERT の圧縮を行いました.圧縮されたモデルはそれぞれ [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)、[DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) と名付けられました. 公開された研究論文: [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) +1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (Microsoft Research から) Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei から公開された研究論文: [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) +1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (NAVER から), Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park から公開された研究論文: [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) +1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (Facebook から) Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih から公開された研究論文: [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) +1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (Intel Labs から) René Ranftl, Alexey Bochkovskiy, Vladlen Koltun から公開された研究論文: [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) +1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Research) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le. +1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (Google Research/Stanford University から) Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning から公開された研究論文: [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) +1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (Google Research から) Sascha Rothe, Shashi Narayan, Aliaksei Severyn から公開された研究論文: [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) +1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (Baidu から) Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu から公開された研究論文: [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. 
Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) Le, and Jason Wei +1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) +1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) +1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) +1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) +1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) +1. 
**[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) +1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) +1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (EleutherAI から) Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach から公開された研究論文: [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) +1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (ABEJA から) Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori からリリース. +1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) +1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) +1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) +1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) +1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) +1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) +1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) +1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) +1. 
**[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) +1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) +1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia から) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei から公開された研究論文: [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) +1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (Microsoft Research Asia から) Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei から公開された研究論文: [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) +1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) +1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI から) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze から公開された研究論文: [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) +1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) +1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) +1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) +1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC Chapel Hill から) Hao Tan and Mohit Bansal から公開された研究論文: [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) +1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (Facebook から) Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert から公開された研究論文: [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) +1. 
**[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) +1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team が現在開発中です. +1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) +1. **[Mask2Former](https://huggingface.co/docs/transformers/main/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) +1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) +1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) +1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) +1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) +1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) +1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia から) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka から公開された研究論文: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) +1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain から) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou から公開された研究論文: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) +1. 
**[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (Google Inc. から) Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam から公開された研究論文: [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) +1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (Google Inc. から) Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen から公開された研究論文: [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) +1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (Apple から) Sachin Mehta and Mohammad Rastegari から公開された研究論文: [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) +1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (Microsoft Research から) Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu から公開された研究論文: [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) +1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (Google AI から) Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel から公開された研究論文: [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) +1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (RUC AI Box から) Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen から公開された研究論文: [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) +1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs から) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi から公開された研究論文: [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) +1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) +1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) +1. **[OneFormer](https://huggingface.co/docs/transformers/main/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) +1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) +1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) +1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) +1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) +1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research から) Dat Quoc Nguyen and Anh Tuan Nguyen から公開された研究論文: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) +1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) +1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) +1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) +1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) +1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook から) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela から公開された研究論文: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) +1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research から) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang から公開された研究論文: [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) +1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (Google Research から) Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya から公開された研究論文: [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) +1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META Platforms から) Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár から公開された研究論文: [Designing Network Design Space](https://arxiv.org/abs/2003.13678) +1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) +1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (Microsoft Research から) Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun から公開された研究論文: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) +1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (Facebook から), Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov から公開された研究論文: [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) +1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) +1. **[RoCBert](https://huggingface.co/docs/transformers/main/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) +1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) +1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) +1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) +1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) +1. 
**[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (Facebook から), Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino から公開された研究論文: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) +1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) +1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) +1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) +1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) +1. **[Swin2SR](https://huggingface.co/docs/transformers/main/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) +1. **[SwitchTransformers](https://huggingface.co/docs/transformers/main/model_doc/switch_transformers)** (Google から) William Fedus, Barret Zoph, Noam Shazeer から公開された研究論文: [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) +1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開された研究論文: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) +1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (Google AI から) Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu から公開されたレポジトリー [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (Microsoft Research から) Brandon Smock, Rohith Pesala, Robin Abraham から公開された研究論文: [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) +1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (Google AI から) Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos から公開された研究論文: [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) +1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (Microsoft Research から) Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou から公開された研究論文: [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) +1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (HuggingFace から). +1. **[TimeSformer](https://huggingface.co/docs/transformers/main/model_doc/timesformer)** (Facebook から) Gedas Bertasius, Heng Wang, Lorenzo Torresani から公開された研究論文: [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) +1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) +1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) +1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) +1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) +1. **[UPerNet](https://huggingface.co/docs/transformers/main/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) +1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741) +1. 
**[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) +1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) +1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) +1. **[ViT Hybrid](https://huggingface.co/docs/transformers/main/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI から) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick から公開された研究論文: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI から) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas から公開された研究論文: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) +1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) +1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI から) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino から公開された研究論文: [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) +1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI から) Qiantong Xu, Alexei Baevski, Michael Auli から公開された研究論文: [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) +1. 
**[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research から) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei から公開された研究論文: [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) +1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI から) Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever から公開された研究論文: [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) +1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (Microsoft Research から) Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling から公開された研究論文: [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) +1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li から公開された研究論文: [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) +1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (Facebook から) Guillaume Lample and Alexis Conneau から公開された研究論文: [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) +1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) +1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (Facebook AI から), Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov から公開された研究論文: [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) +1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (Facebook AI から), Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau から公開された研究論文: [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) +1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (Google/CMU から) Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le から公開された研究論文: [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) +1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (Facebook AI から) Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli から公開された研究論文: [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) +1. 
**[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (Facebook AI から) Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) +1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (Huazhong University of Science & Technology から) Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu から公開された研究論文: [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) +1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (the University of Wisconsin - Madison から) Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh から公開された研究論文: [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) + + +### サポートされているフレームワーク + +以下のテーブルはそれぞれのモデルでサポートされているライブラリを示しています。"slow"と呼ばれるPythonトークナイザー、🤗 Tokenizers ライブラリによる"fast"トークナイザー、PyTorch, TensorFlow, Flaxの5つのそれぞれがサポートされているかを示しています。 + + + +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:-----------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| AltCLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| Audio Spectrogram Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | +| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | +| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | +| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +| BioGpt | ✅ | ❌ | ✅ | ❌ | ❌ | +| BiT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ | +| BLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ | +| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ | +| Chinese-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ | +| CLIPSeg | ❌ | ❌ | ✅ | ❌ | ❌ | +| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ | +| Conditional DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ | +| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +| CvT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ | +| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ | +| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Deformable DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | +| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | +| DiNAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | +| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | +| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ESM | ✅ | ❌ | ✅ | ✅ | ❌ | +| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | +| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | +| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | +| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GIT | ❌ | ❌ | ✅ | ❌ | ❌ | +| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | +| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | +| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | +| GPT NeoX Japanese | ✅ | ❌ | ✅ | ❌ | ❌ | +| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ | +| GPT-Sw3 | ✅ | ✅ | ✅ | ✅ | ✅ | +| GroupViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | +| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Jukebox | ✅ | ❌ | ✅ | ❌ | ❌ | +| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | 
+| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | +| LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | +| LED | ✅ | ✅ | ✅ | ✅ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LiLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | +| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ | +| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ | +| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | +| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | +| MarkupLM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Mask2Former | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| MaskFormerSwin | ❌ | ❌ | ❌ | ❌ | ❌ | +| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | +| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| MobileNetV1 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileNetV2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| MobileViT | ❌ | ❌ | ✅ | ✅ | ❌ | +| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| MVP | ✅ | ✅ | ✅ | ❌ | ❌ | +| NAT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ | +| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | +| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | +| OPT | ❌ | ❌ | ✅ | ✅ | ✅ | +| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | +| PEGASUS-X | ❌ | ❌ | ✅ | ❌ | ❌ | +| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | +| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| REALM | ✅ | ✅ | ✅ | ❌ | ❌ | +| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +| RegNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +| ResNet | ❌ | ❌ | ✅ | ✅ | ✅ | +| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ | +| RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | +| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | +| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | +| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | +| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ | +| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | +| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | +| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | +| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ | +| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ | +| Swin2SR | ❌ | ❌ | ✅ | ❌ | ❌ | +| SwitchTransformers | ❌ | ❌ | ✅ | ❌ | ❌ | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| Table Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Time Series Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| TimeSformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | +| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | +| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ | +| VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | +| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | +| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViT | ❌ | ❌ | ✅ | ✅ | ✅ | +| ViT Hybrid | ❌ | ❌ | ✅ | ❌ | ❌ | +| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| ViTMSN | ❌ | ❌ | ✅ | ❌ | ❌ | +| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | +| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | +| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | +| Whisper | ✅ | ❌ | ✅ | ✅ | ❌ | +| X-CLIP | ❌ | ❌ | ✅ | ❌ | ❌ | +| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | +| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | +| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | +| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ | +| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | +| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ | +| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ | + + \ No newline at end of file diff --git a/docs/source/ja/installation.md b/docs/source/ja/installation.md new file mode 100644 index 000000000000..3b8646672e52 --- /dev/null +++ 
b/docs/source/ja/installation.md @@ -0,0 +1,244 @@ + + +# インストール + +使用しているDeep Learningライブラリに対して、🤗 Transformersをインストールしてキャッシュを設定、そしてオプションでオフラインで実行できるように 🤗 Transformersを設定します。 + +🤗 TransformersはPython 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, Flaxで動作確認しています。 使用しているDeep Learningライブラリに合わせて、以下のインストール方法に従ってください: + +* [PyTorch](https://pytorch.org/get-started/locally/)のインストール手順。 +* [TensorFlow 2.0](https://www.tensorflow.org/install/pip)のインストール手順。 +* [Flax](https://flax.readthedocs.io/en/latest/)のインストール手順。 + +## pipでのインストール + +🤗 Transformersを[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。 もし、Pythonの仮想環境に馴染みがない場合は、この[ガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)をご覧ください。仮想環境によって異なるプロジェクトの管理がより簡単になり、依存関係間の互換性の問題を回避できます。 + +まず、プロジェクトディレクトリに仮想環境を作成することから始めましょう: + +```bash +python -m venv .env +``` + +仮想環境を起動しましょう。LinuxとMacOsの場合は以下のコマンドで起動します: + +```bash +source .env/bin/activate +``` +Windowsで仮想環境を起動します + +```bash +.env/Scripts/activate +``` + +これで、次のコマンドで🤗 Transformersをインストールする準備が整いました: + +```bash +pip install transformers +``` + +CPU対応のみ必要な場合、🤗 TransformersとDeep Learningライブラリを1行でインストールできるようになっていて便利です。例えば、🤗 TransformersとPyTorchを以下のように一緒にインストールできます: + +```bash +pip install transformers[torch] +``` + +🤗 TransformersとTensorFlow 2.0: + +```bash +pip install transformers[tf-cpu] +``` + +🤗 TransformersとFlax: + +```bash +pip install transformers[flax] +``` + +最後に、以下のコマンドを実行することで🤗 Transformersが正しくインストールされているかを確認します。学習済みモデルがダウンロードされます: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))" +``` + +その後、ラベルとスコアが出力されます: + +```bash +[{'label': 'POSITIVE', 'score': 0.9998704791069031}] +``` + +## ソースからのインストール + +以下のコマンドでソースから🤗 Transformersをインストールします: + +```bash +pip install git+https://github.com/huggingface/transformers +``` + +このコマンドは最新の安定版ではなく、開発における最新の`main`バージョンをインストールします。`main`バージョンは最新の開発状況に対応するのに便利です。例えば、最後の公式リリース以降にバグが修正されたが、新しいリリースがまだ展開されていない場合などです。しかし、これは`main`バージョンが常に安定しているとは限らないことを意味します。私たちは`main`バージョンの運用を維持するよう努め、ほとんどの問題は通常、数時間から1日以内に解決されます。もし問題に遭遇した場合は、より早く修正できるように[Issue](https://github.com/huggingface/transformers/issues)を作成してください! + +以下のコマンドを実行して、🤗 Transformersが正しくインストールされているかどうかを確認します: + +```bash +python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))" +``` + +## 編集可能なインストール + +必要に応じて、編集可能なインストールをします: + +* ソースコードの`main`バージョンを使います。 +* 🤗 Transformersにコントリビュートし、コードの変更をテストする必要があります。 + +以下のコマンドでレポジトリをクローンして、🤗 Transformersをインストールします: + +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install -e . +``` + +上記のコマンドは、レポジトリをクローンしたフォルダとPythonのライブラリをパスをリンクします。Pythonは通常のライブラリパスに加えて、あなたがクローンしたフォルダの中も見るようになります。例えば、Pythonパッケージが通常、`~/anaconda3/envs/main/lib/python3.7/site-packages/`にインストールされている場合、Pythonはクローンしたフォルダも検索するようになります: `~/transformers/`. + + + +ライブラリーを使い続けたい場合は、transformersフォルダーを保持しつづける必要があります。 + + + +これで、次のコマンドで簡単にクローンを🤗 Transformersの最新版に更新できます: + +```bash +cd ~/transformers/ +git pull +``` + +Python環境は次回の実行時に🤗 Transformersの`main`バージョンを見つけるようになります。 + +## condaでのインストール + +`huggingface`のcondaチャンネルからインストールします: + +```bash +conda install -c huggingface transformers +``` + +## キャッシュの設定 + +学習済みモデルはダウンロードされ、ローカルにキャッシュされます: `~/.cache/huggingface/hub`. これはシェル環境変数`TRANSFORMERS_CACHE`で指定されるデフォルトのディレクトリです。Windowsでは、デフォルトのディレクトリは`C:\Users\username\.cache\huggingface\hub`になっています。異なるキャッシュディレクトリを指定するために、以下のシェル環境変数を変更することが可能です。優先度は以下の順番に対応します: + +1. シェル環境変数 (デフォルト): `HUGGINGFACE_HUB_CACHE` または `TRANSFORMERS_CACHE`. +2. 
シェル環境変数: `HF_HOME`. +3. シェル環境変数: `XDG_CACHE_HOME` + `/huggingface`. + + + +もし、以前のバージョンのライブラリを使用していた人で、`PYTORCH_TRANSFORMERS_CACHE`または`PYTORCH_PRETRAINED_BERT_CACHE`を設定していた場合、シェル環境変数`TRANSFORMERS_CACHE`を指定しない限り🤗 Transformersはこれらのシェル環境変数を使用します。 + + + +## オフラインモード + +🤗 Transformersはローカルファイルのみを使用することでファイアウォールやオフラインの環境でも動作させることができます。この動作を有効にするためには、環境変数`TRANSFORMERS_OFFLINE=1`を設定します。 + + + +環境変数`HF_DATASETS_OFFLINE=1`を設定し、オフライントレーニングワークフローに[🤗 Datasets](https://huggingface.co/docs/datasets/)を追加します。 + + + +例えば、外部インスタンスに対してファイアウォールで保護された通常のネットワーク上でプログラムを実行する場合、通常以下のようなコマンドで実行することになります: + +```bash +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +オフラインインスタンスでこの同じプログラムを実行します: + +```bash +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +``` + +このスクリプトは、ローカルファイルのみを検索することが分かっているので、ハングアップしたりタイムアウトを待ったりすることなく実行されるはずです。 + +### オフラインで使用するためにモデルやトークナイザーを取得する + +オフラインで🤗 Transformersを使用するもう1つの方法は、前もってファイルをダウンロードしておき、オフラインで使用する必要があるときにそのローカルパスを指定することです。これには3つの方法があります: + +* [Model Hub](https://huggingface.co/models)のユーザーインターフェース上から↓アイコンをクリックしてファイルをダウンロードする方法。 + + ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png) + +* [`PreTrainedModel.from_pretrained`]および[`PreTrainedModel.save_pretrained`]のワークフローを使用する方法: + + 1. [`PreTrainedModel.from_pretrained`]で前もってファイルをダウンロードします: + + ```py + >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B") + ``` + + 2. [`PreTrainedModel.save_pretrained`]で指定されたディレクトリにファイルを保存しておきます: + + ```py + >>> tokenizer.save_pretrained("./your/path/bigscience_t0") + >>> model.save_pretrained("./your/path/bigscience_t0") + ``` + + 3. オフラインにある時、[`PreTrainedModel.from_pretrained`]に指定したディレクトリからファイルをリロードします: + + ```py + >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0") + >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0") + ``` + +* プログラム的に[huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub)ライブラリを用いて、ファイルをダウンロードする方法: + + 1. 仮想環境に`huggingface_hub`ライブラリをインストールします: + + ```bash + python -m pip install huggingface_hub + ``` + + 2. 
指定のパスにファイルをダウンロードするために、[`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub)関数を使用します。例えば、以下のコマンドで、[T0](https://huggingface.co/bigscience/T0_3B)モデルの`config.json`ファイルを指定のパスにダウンロードできます: + + ```py + >>> from huggingface_hub import hf_hub_download + + >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0") + ``` + +ファイルがダウンロードされ、ローカルにキャッシュされたら、そのローカルパスを指定してファイルをロードして使用します: + +```py +>>> from transformers import AutoConfig + +>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json") +``` + + + +Hubに保存されているファイルをダウンロードする方法の詳細については、[How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream)セクションを参照してください。 + + \ No newline at end of file diff --git a/docs/source/ja/multilingual.md b/docs/source/ja/multilingual.md new file mode 100644 index 000000000000..86dabb94633c --- /dev/null +++ b/docs/source/ja/multilingual.md @@ -0,0 +1,178 @@ + + +# 推論のための多言語モデル + +[[open-in-colab]] + +🤗 Transformers にはいくつかの多言語モデルがあり、それらの推論の使用方法は単一言語モデルとは異なります。ただし、多言語モデルの使用方法がすべて異なるわけではありません。 [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased) などの一部のモデルは、単一言語モデルと同様に使用できます。 このガイドでは、推論のために使用方法が異なる多言語モデルをどのように使うかを示します。 + +## XLM + +XLM には10の異なるチェックポイントがあり、そのうちの1つだけが単一言語です。 残りの9つのモデルチェックポイントは、言語埋め込みを使用するチェックポイントと使用しないチェックポイントの2つのカテゴリに分けることができます。 + +### 言語の埋め込みがある XLM + +次の XLM モデルは、言語の埋め込みを使用して、推論で使用される言語を指定します。 + +- `xlm-mlm-ende-1024` (マスク化された言語モデリング、英語-ドイツ語) +- `xlm-mlm-enfr-1024` (マスク化された言語モデリング、英語-フランス語) +- `xlm-mlm-enro-1024` (マスク化された言語モデリング、英語-ルーマニア語) +- `xlm-mlm-xnli15-1024` (マスク化された言語モデリング、XNLI 言語) +- `xlm-mlm-tlm-xnli15-1024` (マスク化された言語モデリング + 翻訳 + XNLI 言語) +- `xlm-clm-enfr-1024` (因果言語モデリング、英語-フランス語) +- `xlm-clm-ende-1024` (因果言語モデリング、英語-ドイツ語) + +言語の埋め込みは、モデルに渡される `input_ids` と同じ形状のテンソルとして表されます。 これらのテンソルの値は、使用される言語に依存し、トークナイザーの `lang2id` および `id2lang` 属性によって識別されます。 + +この例では、`xlm-clm-enfr-1024` チェックポイントをロードします (因果言語モデリング、英語-フランス語)。 + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +``` + +トークナイザーの `lang2id` 属性は、このモデルの言語とその ID を表示します。 + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +次に、入力例を作成します。 + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +言語 ID を `en` に設定し、それを使用して言語の埋め込みを定義します。 言語の埋め込みは、英語の言語 ID であるため、`0` で埋められたテンソルです。 このテンソルは `input_ids` と同じサイズにする必要があります。 + +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # We reshape it to be of size (batch_size, sequence_length) +>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) +``` + +これで、`input_ids` と言語の埋め込みをモデルに渡すことができます。 + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +[run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) スクリプトは、`xlm-clm` チェックポイントを使用して、言語が埋め込まれたテキストを生成できます。 + +### 言語の埋め込みがないXLM + +次の XLM モデルは、推論中に言語の埋め込みを必要としません。 + +- `xlm-mlm-17-1280` (マスク化された言語モデリング、17の言語) +- `xlm-mlm-100-1280` (マスク化された言語モデリング、100の言語) + +これらのモデルは、以前の XLM チェックポイントとは異なり、一般的な文の表現に使用されます。 + +## BERT + +以下の BERT モデルは、多言語タスクに使用できます。 + +- `bert-base-multilingual-uncased` (マスク化された言語モデリング + 次の文の予測、102の言語) +- 
`bert-base-multilingual-cased` (マスク化された言語モデリング + 次の文の予測、104の言語) + +これらのモデルは、推論中に言語の埋め込みを必要としません。 文脈から言語を識別し、それに応じて推測する必要があります。 + +## XLM-RoBERTa + +次の XLM-RoBERTa モデルは、多言語タスクに使用できます。 + +- `xlm-roberta-base` (マスク化された言語モデリング、100の言語) +- `xlm-roberta-large` (マスク化された言語モデリング、100の言語) + +XLM-RoBERTa は、100の言語で新しく作成およびクリーニングされた2.5 TB の CommonCrawl データでトレーニングされました。 これは、分類、シーケンスのラベル付け、質問応答などのダウンストリームタスクで、mBERT や XLM などの以前にリリースされた多言語モデルを大幅に改善します。 + +## M2M100 + +次の M2M100 モデルは、多言語翻訳に使用できます。 + +- `facebook/m2m100_418M` (翻訳) +- `facebook/m2m100_1.2B` (翻訳) + +この例では、`facebook/m2m100_418M` チェックポイントをロードして、中国語から英語に翻訳します。 トークナイザーでソース言語を設定できます。 + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +テキストをトークン化します。 + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +M2M100 は、ターゲット言語に翻訳するために、最初に生成されるトークンとしてターゲット言語 ID を強制します。 英語に翻訳するには、`generate` メソッドで `forced_bos_token_id` を `en` に設定します。 + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +多言語翻訳には、次の MBart モデルを使用できます。 + +- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) +- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) +- `facebook/mbart-large-50` (Multilingual translation, 50 languages) +- `facebook/mbart-large-cc25` + +この例では、`facebook/mbart-large-50-many-to-many-mmt` チェックポイントをロードして、フィンランド語を英語に翻訳します。トークナイザーでソース言語を設定できます。 + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +フィンランド語のテキストをトークン化します。 + +```py +>>> encoded_fi = tokenizer(fi_text, return_tensors="pt") +``` + +MBart は、ターゲット言語に翻訳するために、最初に生成されるトークンとしてターゲット言語 ID を強制します。 英語に翻訳するには、`generate` メソッドで `forced_bos_token_id` を `en_XX` に設定します。 + +```py +>>> generated_tokens = model.generate(**encoded_fi, forced_bos_token_id=tokenizer.lang_code_to_id("en_XX")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry."
+``` + +`facebook/mbart-large-50-many-to-one-mmt` チェックポイントを使用している場合、最初に生成されたトークンとしてターゲット言語 ID を強制する必要はありません。それ以外の場合、使用方法は同じです。 \ No newline at end of file diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 62c6e57c72dd..b26d85aeae92 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -1,58 +1,692 @@ - sections: - local: index title: 🤗 Transformers + - local: quicktour + title: 둘러보기 + - local: installation + title: 설치방법 title: 시작하기 - sections: - - local: in_translation - title: (번역 중) + - local: pipeline_tutorial + title: Pipeline으로 추론하기 + - local: autoclass_tutorial + title: AutoClass로 사전 학습된 인스턴스 로드하기 + - local: preprocessing + title: 데이터 전처리하기 + - local: training + title: 사전 학습된 모델 미세 조정하기 + - local: run_scripts + title: 스크립트로 학습하기 + - local: accelerate + title: 🤗 Accelerate로 분산 학습 구성하기 + - local: peft + title: 🤗 PEFT로 어댑터 로드 및 학습하기 + - local: model_sharing + title: 만든 모델 공유하기 + - local: transformers_agents + title: 에이전트 + - local: llm_tutorial + title: 대규모 언어 모델로 생성하기 title: 튜토리얼 - sections: - - local: in_translation - title: (번역 중) - title: How-to 가이드 + - sections: + - local: tasks/sequence_classification + title: 텍스트 분류 + - local: tasks/token_classification + title: 토큰 분류 + - local: tasks/question_answering + title: 질의 응답(Question Answering) + - local: tasks/language_modeling + title: 인과적 언어 모델링(Causal language modeling) + - local: tasks/masked_language_modeling + title: 마스킹된 언어 모델링(Masked language modeling) + - local: tasks/translation + title: 번역 + - local: tasks/summarization + title: 요약 + - local: tasks/multiple_choice + title: 객관식 문제(Multiple Choice) + title: 자연어처리 + isExpanded: false + - sections: + - local: tasks/audio_classification + title: 오디오 분류 + - local: tasks/asr + title: 자동 음성 인식 + title: 오디오 + isExpanded: false + - sections: + - local: tasks/image_classification + title: 이미지 분류 + - local: tasks/semantic_segmentation + title: 의미적 분할(Semantic segmentation) + - local: tasks/video_classification + title: 영상 분류 + - local: tasks/object_detection + title: 객체 탐지 + - local: tasks/zero_shot_object_detection + title: 제로샷(zero-shot) 객체 탐지 + - local: tasks/zero_shot_image_classification + title: 제로샷(zero-shot) 이미지 분류 + - local: tasks/monocular_depth_estimation + title: 단일 영상 기반 깊이 추정 + title: 컴퓨터 비전 + isExpanded: false + - sections: + - local: tasks/image_captioning + title: 이미지 캡셔닝 + - local: tasks/document_question_answering + title: 문서 질의 응답(Document Question Answering) + - local: tasks/visual_question_answering + title: 시각적 질의응답 (Visual Question Answering) + title: 멀티모달 + isExpanded: false + title: 태스크 가이드 +- sections: + - local: fast_tokenizers + title: 🤗 Tokenizers 라이브러리에서 토크나이저 사용하기 + - local: multilingual + title: 다국어 모델 추론하기 + - local: in_translation + title: (번역중) Customize text generation strategy + - local: create_a_model + title: 모델별 API 사용하기 + - local: custom_models + title: 사용자 정의 모델 공유하기 + - local: sagemaker + title: Amazon SageMaker에서 학습 실행하기 + - local: serialization + title: ONNX로 내보내기 + - local: tflite + title: TFLite로 내보내기 + - local: torchscript + title: TorchScript로 내보내기 + - local: in_translation + title: (번역중) Benchmarks + - local: in_translation + title: (번역중) Notebooks with examples + - local: community + title: 커뮤니티 리소스 + - local: custom_tools + title: 사용자 정의 도구와 프롬프트 + - local: troubleshooting + title: 문제 해결 + title: (번역중) 개발자 가이드 +- sections: + - local: performance + title: 성능 및 확장성 + - local: in_translation + title: (번역중) Training on one GPU + - local: perf_train_gpu_many + title: 다중 GPU에서 훈련 진행하기 + - 
local: perf_train_cpu + title: CPU에서 훈련 + - local: perf_train_cpu_many + title: 다중 CPU에서 훈련하기 + - local: in_translation + title: (번역중) Training on TPUs + - local: perf_train_tpu_tf + title: TensorFlow로 TPU에서 훈련하기 + - local: in_translation + title: (번역중) Training on Specialized Hardware + - local: perf_infer_cpu + title: CPU로 추론하기 + - local: perf_infer_gpu_one + title: 하나의 GPU를 활용한 추론 + - local: perf_infer_gpu_many + title: 다중 GPU에서 추론 + - local: in_translation + title: (번역중) Inference on Specialized Hardware + - local: perf_hardware + title: 훈련용 사용자 맞춤형 하드웨어 + - local: in_translation + title: (번역중) Instantiating a big model + - local: debugging + title: 디버깅 + - local: hpo_train + title: Trainer API를 사용한 하이퍼파라미터 탐색 + - local: tf_xla + title: TensorFlow 모델을 위한 XLA 통합 + title: (번역중) 성능 및 확장성 - sections: + - local: contributing + title: 🤗 Transformers에 기여하는 방법 + - local: add_new_model + title: 🤗 Transformers에 새로운 모델을 추가하는 방법 + - local: add_tensorflow_model + title: 어떻게 🤗 Transformers 모델을 TensorFlow로 변환하나요? + - local: add_new_pipeline + title: 어떻게 🤗 Transformers에 파이프라인을 추가하나요? + - local: testing + title: 테스트 + - local: pr_checks + title: Pull Request에 대한 검사 + title: (번역중) 기여하기 + +- sections: + - local: philosophy + title: 이념과 목표 - local: in_translation - title: (번역 중) - title: 개념 가이드 + title: (번역중) Glossary + - local: task_summary + title: 🤗 Transformers로 할 수 있는 작업 + - local: tasks_explained + title: 🤗 Transformers로 작업을 해결하는 방법 + - local: model_summary + title: Transformer 모델군 + - local: tokenizer_summary + title: 토크나이저 요약 + - local: attention + title: 어텐션 매커니즘 + - local: pad_truncation + title: 패딩과 잘라내기 + - local: bertology + title: BERTology + - local: perplexity + title: 고정 길이 모델의 펄플렉서티(Perplexity) + - local: pipeline_webserver + title: 추론 웹 서버를 위한 파이프라인 + - local: model_memory_anatomy + title: 모델 학습 해부하기 + title: (번역중) 개념 가이드 - sections: - sections: - local: in_translation - title: (번역 중) - title: 메인 클래스 + title: (번역중) Auto Classes + - local: in_translation + title: (번역중) Callbacks + - local: in_translation + title: (번역중) Configuration + - local: in_translation + title: (번역중) Data Collator + - local: in_translation + title: (번역중) Keras callbacks + - local: in_translation + title: (번역중) Logging + - local: in_translation + title: (번역중) Models + - local: in_translation + title: (번역중) Text Generation + - local: in_translation + title: (번역중) ONNX + - local: in_translation + title: (번역중) Optimization + - local: in_translation + title: (번역중) Model outputs + - local: in_translation + title: (번역중) Pipelines + - local: in_translation + title: (번역중) Processors + - local: in_translation + title: (번역중) Quantization + - local: in_translation + title: (번역중) Tokenizer + - local: in_translation + title: (번역중) Trainer + - local: in_translation + title: (번역중) DeepSpeed Integration + - local: in_translation + title: (번역중) Feature Extractor + - local: in_translation + title: (번역중) Image Processor + title: (번역중) 메인 클래스 - sections: - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 텍스트 모델 + title: (번역중) ALBERT + - local: in_translation + title: (번역중) BART + - local: in_translation + title: (번역중) BARThez + - local: in_translation + title: (번역중) BARTpho + - local: in_translation + title: (번역중) BERT + - local: in_translation + title: (번역중) BertGeneration + - local: in_translation + title: (번역중) BertJapanese + - local: in_translation + title: (번역중) Bertweet + - local: in_translation + title: (번역중) BigBird + - local: in_translation + title: (번역중) BigBirdPegasus + - local: in_translation + 
title: (번역중) BioGpt + - local: in_translation + title: (번역중) Blenderbot + - local: in_translation + title: (번역중) Blenderbot Small + - local: in_translation + title: (번역중) BLOOM + - local: in_translation + title: (번역중) BORT + - local: in_translation + title: (번역중) ByT5 + - local: in_translation + title: (번역중) CamemBERT + - local: in_translation + title: (번역중) CANINE + - local: in_translation + title: (번역중) CodeGen + - local: in_translation + title: (번역중) ConvBERT + - local: in_translation + title: (번역중) CPM + - local: in_translation + title: (번역중) CPMANT + - local: in_translation + title: (번역중) CTRL + - local: in_translation + title: (번역중) DeBERTa + - local: in_translation + title: (번역중) DeBERTa-v2 + - local: in_translation + title: (번역중) DialoGPT + - local: in_translation + title: (번역중) DistilBERT + - local: in_translation + title: (번역중) DPR + - local: in_translation + title: (번역중) ELECTRA + - local: in_translation + title: (번역중) Encoder Decoder Models + - local: in_translation + title: (번역중) ERNIE + - local: in_translation + title: (번역중) ErnieM + - local: in_translation + title: (번역중) ESM + - local: in_translation + title: (번역중) FLAN-T5 + - local: in_translation + title: (번역중) FLAN-UL2 + - local: in_translation + title: (번역중) FlauBERT + - local: in_translation + title: (번역중) FNet + - local: in_translation + title: (번역중) FSMT + - local: in_translation + title: (번역중) Funnel Transformer + - local: in_translation + title: (번역중) GPT + - local: in_translation + title: (번역중) GPT Neo + - local: in_translation + title: (번역중) GPT NeoX + - local: in_translation + title: (번역중) GPT NeoX Japanese + - local: in_translation + title: (번역중) GPT-J + - local: in_translation + title: (번역중) GPT2 + - local: in_translation + title: (번역중) GPTBigCode + - local: in_translation + title: (번역중) GPTSAN Japanese + - local: in_translation + title: (번역중) GPTSw3 + - local: in_translation + title: (번역중) HerBERT + - local: in_translation + title: (번역중) I-BERT + - local: in_translation + title: (번역중) Jukebox + - local: in_translation + title: (번역중) LED + - local: model_doc/llama + title: LLaMA + - local: model_doc/llama2 + title: LLaMA2 + - local: in_translation + title: (번역중) Longformer + - local: in_translation + title: (번역중) LongT5 + - local: in_translation + title: (번역중) LUKE + - local: in_translation + title: (번역중) M2M100 + - local: in_translation + title: (번역중) MarianMT + - local: in_translation + title: (번역중) MarkupLM + - local: in_translation + title: (번역중) MBart and MBart-50 + - local: in_translation + title: (번역중) MEGA + - local: in_translation + title: (번역중) MegatronBERT + - local: in_translation + title: (번역중) MegatronGPT2 + - local: in_translation + title: (번역중) mLUKE + - local: in_translation + title: (번역중) MobileBERT + - local: in_translation + title: (번역중) MPNet + - local: in_translation + title: (번역중) MT5 + - local: in_translation + title: (번역중) MVP + - local: in_translation + title: (번역중) NEZHA + - local: in_translation + title: (번역중) NLLB + - local: in_translation + title: (번역중) NLLB-MoE + - local: in_translation + title: (번역중) Nyströmformer + - local: in_translation + title: (번역중) Open-Llama + - local: in_translation + title: (번역중) OPT + - local: in_translation + title: (번역중) Pegasus + - local: in_translation + title: (번역중) PEGASUS-X + - local: in_translation + title: (번역중) PhoBERT + - local: in_translation + title: (번역중) PLBart + - local: in_translation + title: (번역중) ProphetNet + - local: in_translation + title: (번역중) QDQBert + - local: in_translation + title: (번역중) RAG + - local: in_translation + title: 
(번역중) REALM + - local: in_translation + title: (번역중) Reformer + - local: in_translation + title: (번역중) RemBERT + - local: in_translation + title: (번역중) RetriBERT + - local: in_translation + title: (번역중) RoBERTa + - local: in_translation + title: (번역중) RoBERTa-PreLayerNorm + - local: in_translation + title: (번역중) RoCBert + - local: in_translation + title: (번역중) RoFormer + - local: in_translation + title: (번역중) Splinter + - local: in_translation + title: (번역중) SqueezeBERT + - local: in_translation + title: (번역중) SwitchTransformers + - local: in_translation + title: (번역중) T5 + - local: in_translation + title: (번역중) T5v1.1 + - local: in_translation + title: (번역중) TAPEX + - local: in_translation + title: (번역중) Transformer XL + - local: in_translation + title: (번역중) UL2 + - local: in_translation + title: (번역중) X-MOD + - local: in_translation + title: (번역중) XGLM + - local: in_translation + title: (번역중) XLM + - local: in_translation + title: (번역중) XLM-ProphetNet + - local: in_translation + title: (번역중) XLM-RoBERTa + - local: in_translation + title: (번역중) XLM-RoBERTa-XL + - local: in_translation + title: (번역중) XLM-V + - local: in_translation + title: (번역중) XLNet + - local: in_translation + title: (번역중) YOSO + title: (번역중) 텍스트 모델 + - isExpanded: false + sections: + - local: in_translation + title: (번역중) BEiT + - local: in_translation + title: (번역중) BiT + - local: in_translation + title: (번역중) Conditional DETR + - local: in_translation + title: (번역중) ConvNeXT + - local: in_translation + title: (번역중) ConvNeXTV2 + - local: in_translation + title: (번역중) CvT + - local: in_translation + title: (번역중) Deformable DETR + - local: in_translation + title: (번역중) DeiT + - local: in_translation + title: (번역중) DETA + - local: in_translation + title: (번역중) DETR + - local: in_translation + title: (번역중) DiNAT + - local: in_translation + title: (번역중) DiT + - local: in_translation + title: (번역중) DPT + - local: in_translation + title: (번역중) EfficientFormer + - local: in_translation + title: (번역중) EfficientNet + - local: in_translation + title: (번역중) FocalNet + - local: in_translation + title: (번역중) GLPN + - local: in_translation + title: (번역중) ImageGPT + - local: in_translation + title: (번역중) LeViT + - local: in_translation + title: (번역중) Mask2Former + - local: in_translation + title: (번역중) MaskFormer + - local: in_translation + title: (번역중) MobileNetV1 + - local: in_translation + title: (번역중) MobileNetV2 + - local: in_translation + title: (번역중) MobileViT + - local: in_translation + title: (번역중) NAT + - local: in_translation + title: (번역중) PoolFormer + - local: in_translation + title: (번역중) RegNet + - local: in_translation + title: (번역중) ResNet + - local: in_translation + title: (번역중) SegFormer + - local: in_translation + title: (번역중) Swin Transformer + - local: in_translation + title: (번역중) Swin Transformer V2 + - local: in_translation + title: (번역중) Swin2SR + - local: in_translation + title: (번역중) Table Transformer + - local: in_translation + title: (번역중) TimeSformer + - local: in_translation + title: (번역중) UperNet + - local: in_translation + title: (번역중) VAN + - local: in_translation + title: (번역중) VideoMAE + - local: in_translation + title: (번역중) Vision Transformer (ViT) + - local: in_translation + title: (번역중) ViT Hybrid + - local: in_translation + title: (번역중) ViTMAE + - local: in_translation + title: (번역중) ViTMSN + - local: in_translation + title: (번역중) YOLOS + title: (번역중) 비전 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 비전 모델 + title: (번역중) Audio Spectrogram Transformer + - 
local: in_translation + title: (번역중) CLAP + - local: in_translation + title: (번역중) Hubert + - local: in_translation + title: (번역중) MCTCT + - local: in_translation + title: (번역중) SEW + - local: in_translation + title: (번역중) SEW-D + - local: in_translation + title: (번역중) Speech2Text + - local: in_translation + title: (번역중) Speech2Text2 + - local: in_translation + title: (번역중) SpeechT5 + - local: in_translation + title: (번역중) UniSpeech + - local: in_translation + title: (번역중) UniSpeech-SAT + - local: in_translation + title: (번역중) Wav2Vec2 + - local: in_translation + title: (번역중) Wav2Vec2-Conformer + - local: in_translation + title: (번역중) Wav2Vec2Phoneme + - local: in_translation + title: (번역중) WavLM + - local: model_doc/whisper + title: Whisper + - local: in_translation + title: (번역중) XLS-R + - local: in_translation + title: (번역중) XLSR-Wav2Vec2 + title: (번역중) 오디오 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 오디오 모델 + title: (번역중) ALIGN + - local: in_translation + title: (번역중) AltCLIP + - local: in_translation + title: (번역중) BLIP + - local: in_translation + title: (번역중) BLIP-2 + - local: in_translation + title: (번역중) BridgeTower + - local: in_translation + title: (번역중) Chinese-CLIP + - local: in_translation + title: (번역중) CLIP + - local: in_translation + title: (번역중) CLIPSeg + - local: in_translation + title: (번역중) Data2Vec + - local: in_translation + title: (번역중) DePlot + - local: in_translation + title: (번역중) Donut + - local: in_translation + title: (번역중) FLAVA + - local: in_translation + title: (번역중) GIT + - local: in_translation + title: (번역중) GroupViT + - local: in_translation + title: (번역중) LayoutLM + - local: in_translation + title: (번역중) LayoutLMV2 + - local: in_translation + title: (번역중) LayoutLMV3 + - local: in_translation + title: (번역중) LayoutXLM + - local: in_translation + title: (번역중) LiLT + - local: in_translation + title: (번역중) LXMERT + - local: in_translation + title: (번역중) MatCha + - local: in_translation + title: (번역중) MGP-STR + - local: in_translation + title: (번역중) OneFormer + - local: in_translation + title: (번역중) OWL-ViT + - local: in_translation + title: (번역중) Perceiver + - local: in_translation + title: (번역중) Pix2Struct + - local: in_translation + title: (번역중) Segment Anything + - local: in_translation + title: (번역중) Speech Encoder Decoder Models + - local: in_translation + title: (번역중) TAPAS + - local: in_translation + title: (번역중) TrOCR + - local: in_translation + title: (번역중) TVLT + - local: in_translation + title: (번역중) ViLT + - local: in_translation + title: (번역중) Vision Encoder Decoder Models + - local: in_translation + title: (번역중) Vision Text Dual Encoder + - local: in_translation + title: (번역중) VisualBERT + - local: in_translation + title: (번역중) X-CLIP + title: (번역중) 멀티모달 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 멀티모달 모델 + title: (번역중) Decision Transformer + - local: in_translation + title: (번역중) Trajectory Transformer + title: (번역중) 강화학습 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 강화학습 모델 + title: (번역중) Informer + - local: in_translation + title: (번역중) Time Series Transformer + title: (번역중) 시계열 모델 - isExpanded: false sections: - local: in_translation - title: (번역 중) - title: 시계열 모델 - title: 모델 + title: (번역중) Graphormer + title: (번역중) Graph models + title: (번역중) 모델 - sections: - local: in_translation - title: (번역 중) - title: 내부 유틸리티 - title: API + title: (번역중) Custom Layers and Utilities + - local: in_translation + title: (번역중) Utilities for 
pipelines + - local: in_translation + title: (번역중) Utilities for Tokenizers + - local: in_translation + title: (번역중) Utilities for Trainer + - local: in_translation + title: (번역중) Utilities for Generation + - local: in_translation + title: (번역중) Utilities for Image Processors + - local: in_translation + title: (번역중) Utilities for Audio processing + - local: in_translation + title: (번역중) General Utilities + - local: in_translation + title: (번역중) Utilities for Time Series + title: (번역중) Internal Helpers + title: (번역중) API diff --git a/docs/source/ko/accelerate.md b/docs/source/ko/accelerate.md new file mode 100644 index 000000000000..0ef8957de3ac --- /dev/null +++ b/docs/source/ko/accelerate.md @@ -0,0 +1,136 @@ + + +# 🤗 Accelerate를 활용한 분산 학습[[distributed-training-with-accelerate]] + +모델이 커지면서 병렬 처리는 제한된 하드웨어에서 더 큰 모델을 훈련하고 훈련 속도를 몇 배로 가속화하기 위한 전략으로 등장했습니다. Hugging Face에서는 사용자가 하나의 머신에 여러 개의 GPU를 사용하든 여러 머신에 여러 개의 GPU를 사용하든 모든 유형의 분산 설정에서 🤗 Transformers 모델을 쉽게 훈련할 수 있도록 돕기 위해 [🤗 Accelerate](https://huggingface.co/docs/accelerate) 라이브러리를 만들었습니다. 이 튜토리얼에서는 분산 환경에서 훈련할 수 있도록 기본 PyTorch 훈련 루프를 커스터마이즈하는 방법을 알아봅시다. + +## 설정[[setup]] + +🤗 Accelerate 설치 시작하기: + +```bash +pip install accelerate +``` + +그 다음, [`~accelerate.Accelerator`] 객체를 불러오고 생성합니다. [`~accelerate.Accelerator`]는 자동으로 분산 설정 유형을 감지하고 훈련에 필요한 모든 구성 요소를 초기화합니다. 장치에 모델을 명시적으로 배치할 필요는 없습니다. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## 가속화를 위한 준비[[prepare-to-accelerate]] + +다음 단계는 관련된 모든 훈련 객체를 [`~accelerate.Accelerator.prepare`] 메소드에 전달하는 것입니다. 여기에는 훈련 및 평가 데이터로더, 모델 및 옵티마이저가 포함됩니다: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## 백워드(Backward)[[backward]] + +마지막으로 훈련 루프의 일반적인 `loss.backward()`를 🤗 Accelerate의 [`~accelerate.Accelerator.backward`] 메소드로 대체하기만 하면 됩니다: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +다음 코드에서 볼 수 있듯이, 훈련 루프에 코드 네 줄만 추가하면 분산 학습을 활성화할 수 있습니다! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## 학습[[train]] + +관련 코드를 추가한 후에는 스크립트나 Colaboratory와 같은 노트북에서 훈련을 시작하세요. 
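+
+참고로, 위에서 나누어 설명한 변경 사항을 하나의 스크립트로 모으면 대략 다음과 같은 형태가 됩니다. 아래 스케치에서 사용한 `bert-base-uncased` 체크포인트, GLUE MRPC 데이터셋, 배치 크기와 학습률은 설명을 위한 가정일 뿐이므로, 실제로는 자신의 모델과 데이터에 맞게 바꿔서 사용하세요.
+
+```py
+from accelerate import Accelerator
+from datasets import load_dataset
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler
+
+# 예시용 체크포인트와 데이터셋 (설명을 위한 가정)
+checkpoint = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+dataset = load_dataset("glue", "mrpc")
+
+
+def tokenize_function(batch):
+    return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True, padding="max_length", max_length=128)
+
+
+# 데이터셋을 토큰화하고 PyTorch 텐서 형식으로 변환합니다.
+dataset = dataset.map(tokenize_function, batched=True)
+dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
+dataset = dataset.rename_column("label", "labels")
+dataset.set_format("torch")
+
+train_dataloader = DataLoader(dataset["train"], shuffle=True, batch_size=8)
+eval_dataloader = DataLoader(dataset["validation"], batch_size=8)
+
+optimizer = AdamW(model.parameters(), lr=3e-5)
+
+# Accelerator가 분산 설정을 감지하고 훈련 객체들을 준비합니다.
+accelerator = Accelerator()
+train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+    train_dataloader, eval_dataloader, model, optimizer
+)
+
+num_epochs = 3
+num_training_steps = num_epochs * len(train_dataloader)
+lr_scheduler = get_scheduler(
+    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+)
+
+progress_bar = tqdm(range(num_training_steps))
+
+model.train()
+for epoch in range(num_epochs):
+    for batch in train_dataloader:
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+        progress_bar.update(1)
+```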
+ +### 스크립트로 학습하기[[train-with-a-script]] + +스크립트에서 훈련을 실행하는 경우, 다음 명령을 실행하여 구성 파일을 생성하고 저장합니다: + +```bash +accelerate config +``` + +Then launch your training with: + +```bash +accelerate launch train.py +``` + +### 노트북으로 학습하기[[train-with-a-notebook]] + +Collaboratory의 TPU를 사용하려는 경우, 노트북에서도 🤗 Accelerate를 실행할 수 있습니다. 훈련을 담당하는 모든 코드를 함수로 감싸서 [`~accelerate.notebook_launcher`]에 전달하세요: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate 및 다양한 기능에 대한 자세한 내용은 [documentation](https://huggingface.co/docs/accelerate)를 참조하세요. \ No newline at end of file diff --git a/docs/source/ko/add_new_model.md b/docs/source/ko/add_new_model.md new file mode 100644 index 000000000000..6ae32d2ac60f --- /dev/null +++ b/docs/source/ko/add_new_model.md @@ -0,0 +1,630 @@ + + +# Hugging Face Transformers를 추가하는 방법은 무엇인가요? [[how-to-add-a-model-to-transformers]] + +Hugging Face Transformers 라이브러리는 커뮤니티 기여자들 덕분에 새로운 모델을 제공할 수 있는 경우가 많습니다. 하지만 이는 도전적인 프로젝트이며 Hugging Face Transformers 라이브러리와 구현할 모델에 대한 깊은 이해가 필요합니다. Hugging Face에서는 더 많은 커뮤니티 멤버가 모델을 적극적으로 추가할 수 있도록 지원하고자 하며, 이 가이드를 통해 PyTorch 모델을 추가하는 과정을 안내하고 있습니다 (PyTorch가 설치되어 있는지 확인해주세요). + + + +TensorFlow 모델을 구현하고자 하는 경우 [🤗 Transformers 모델을 TensorFlow로 변환하는 방법](add_tensorflow_model) 가이드를 살펴보세요! + + + +이 과정을 진행하면 다음과 같은 내용을 이해하게 됩니다: + +- 오픈 소스의 모범 사례에 대한 통찰력을 얻습니다. +- 가장 인기 있는 딥러닝 라이브러리의 설계 원칙을 이해합니다. +- 대규모 모델을 효율적으로 테스트하는 방법을 배웁니다. +- `black`, `ruff`, `make fix-copies`와 같은 Python 유틸리티를 통합하여 깔끔하고 가독성 있는 코드를 작성하는 방법을 배웁니다. + +Hugging Face 팀은 항상 도움을 줄 준비가 되어 있으므로 혼자가 아니라는 점을 기억하세요. 🤗 ❤️ + +시작에 앞서 🤗 Transformers에 원하는 모델을 추가하기 위해 [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) 이슈를 열어야 합니다. 특정 모델을 기여하는 데 특별히 까다로운 기준을 가지지 않는 경우 [New model label](https://github.com/huggingface/transformers/labels/New%20model)을 필터링하여 요청되지 않은 모델이 있는지 확인하고 작업할 수 있습니다. + +새로운 모델 요청을 열었다면 첫 번째 단계는 🤗 Transformers에 익숙해지는 것입니다! + +## 🤗 Transformers의 전반적인 개요 [[general-overview-of-transformers]] + +먼저 🤗 Transformers에 대한 전반적인 개요를 파악해야 합니다. 🤗 Transformers는 매우 주관적인 라이브러리이기 때문에 해당 라이브러리의 철학이나 설계 선택 사항에 동의하지 않을 수도 있습니다. 그러나 우리의 경험상 라이브러리의 기본적인 설계 선택과 철학은 🤗 Transformers의 규모를 효율적으로 확장하면서 유지 보수 비용을 합리적인 수준으로 유지하는 것입니다. + +[라이브러리의 철학에 대한 문서](philosophy)를 읽는 것이 라이브러리를 더 잘 이해하는 좋은 시작점입니다. 모든 모델에 적용하려는 몇 가지 작업 방식에 대한 선택 사항이 있습니다: + +- 일반적으로 추상화보다는 구성을 선호합니다. +- 코드를 복제하는 것이 항상 나쁜 것은 아닙니다. 코드의 가독성이나 접근성을 크게 향상시킨다면 복제하는 것은 좋습니다. +- 모델 파일은 가능한 한 독립적으로 유지되어야 합니다. 따라서 특정 모델의 코드를 읽을 때 해당 `modeling_....py` 파일만 확인하면 됩니다. + +우리는 라이브러리의 코드가 제품을 제공하는 수단뿐만 아니라 개선하고자 하는 제품이라고도 생각합니다. 따라서 모델을 추가할 때, 사용자는 모델을 사용할 사람뿐만 아니라 코드를 읽고 이해하고 필요한 경우 조정할 수 있는 모든 사람까지도 포함한다는 점을 기억해야 합니다. + +이를 염두에 두고 일반적인 라이브러리 설계에 대해 조금 더 자세히 알아보겠습니다. + +### 모델 개요 [[overview-of-models]] + +모델을 성공적으로 추가하려면 모델과 해당 구성인 [`PreTrainedModel`] 및 [`PretrainedConfig`] 간의 상호작용을 이해하는 것이 중요합니다. 예를 들어, 🤗 Transformers에 추가하려는 모델을 `BrandNewBert`라고 부르겠습니다. + +다음을 살펴보겠습니다: + + + +보다시피, 🤗 Transformers에서는 상속을 사용하지만 추상화 수준을 최소한으로 유지합니다. 라이브러리의 어떤 모델에서도 두 수준 이상의 추상화가 존재하지 않습니다. `BrandNewBertModel`은 `BrandNewBertPreTrainedModel`에서 상속받고, 이 클래스는 [`PreTrainedModel`]에서 상속받습니다. 이로써 새로운 모델은 [`PreTrainedModel`]에만 의존하도록 하려고 합니다. 모든 새로운 모델에 자동으로 제공되는 중요한 기능은 [`~PreTrainedModel.from_pretrained`] 및 [`~PreTrainedModel.save_pretrained`]입니다. 이러한 기능 외에도 `BrandNewBertModel.forward`와 같은 다른 중요한 기능은 새로운 `modeling_brand_new_bert.py` 스크립트에서 완전히 정의되어야 합니다. 
또한 `BrandNewBertForMaskedLM`과 같은 특정 헤드 레이어를 가진 모델은 `BrandNewBertModel`을 상속받지 않고 forward pass에서 호출할 수 있는 `BrandNewBertModel`을 사용하여 추상화 수준을 낮게 유지합니다. 모든 새로운 모델은 `BrandNewBertConfig`라는 구성 클래스를 필요로 합니다. 이 구성은 항상 [`PreTrainedModel`]의 속성으로 저장되며, 따라서 `BrandNewBertPreTrainedModel`을 상속받는 모든 클래스에서 `config` 속성을 통해 액세스할 수 있습니다: + +```python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +``` + +모델과 마찬가지로 구성은 [`PretrainedConfig`]에서 기본 직렬화 및 역직렬화 기능을 상속받습니다. 구성과 모델은 항상 *pytorch_model.bin* 파일과 *config.json* 파일로 각각 별도로 직렬화됩니다. [`~PreTrainedModel.save_pretrained`]를 호출하면 자동으로 [`~PretrainedConfig.save_pretrained`]도 호출되므로 모델과 구성이 모두 저장됩니다. + + +### 코드 스타일 [[code-style]] + +새로운 모델을 작성할 때, Transformers는 주관적인 라이브러리이며 몇 가지 독특한 코딩 스타일이 있습니다: + +1. 모델의 forward pass는 모델 파일에 완전히 작성되어야 합니다. 라이브러리의 다른 모델에서 블록을 재사용하려면 코드를 복사하여 위에 `# Copied from` 주석과 함께 붙여넣으면 됩니다 (예: [여기](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)를 참조하세요). +2. 코드는 완전히 이해하기 쉬워야 합니다. 변수 이름을 명확하게 지정하고 약어를 사용하지 않는 것이 좋습니다. 예를 들어, `act`보다는 `activation`을 선호합니다. 한 글자 변수 이름은 루프의 인덱스인 경우를 제외하고 권장되지 않습니다. +3. 더 일반적으로, 짧은 마법 같은 코드보다는 길고 명시적인 코드를 선호합니다. +4. PyTorch에서 `nn.Sequential`을 하위 클래스로 만들지 말고 `nn.Module`을 하위 클래스로 만들고 forward pass를 작성하여 다른 사람이 코드를 빠르게 디버그할 수 있도록 합니다. print 문이나 중단점을 추가할 수 있습니다. +5. 함수 시그니처에는 타입 주석을 사용해야 합니다. 그 외에는 타입 주석보다 변수 이름이 훨씬 읽기 쉽고 이해하기 쉽습니다. + +### 토크나이저 개요 [[overview-of-tokenizers]] + +아직 준비되지 않았습니다 :-( 이 섹션은 곧 추가될 예정입니다! + +## 🤗 Transformers에 모델 추가하는 단계별 방법 [[stepbystep-recipe-to-add-a-model-to-transformers]] + +각자 모델을 이식하는 방법에 대한 선호가 다르기 때문에 다른 기여자들이 Hugging Face에 모델을 이식하는 방법에 대한 요약을 살펴보는 것이 매우 유용할 수 있습니다. 다음은 모델을 이식하는 방법에 대한 커뮤니티 블로그 게시물 목록입니다: + +1. [GPT2 모델 이식하기](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) - [Thomas](https://huggingface.co/thomwolf) +2. [WMT19 MT 모델 이식하기](https://huggingface.co/blog/porting-fsmt) - [Stas](https://huggingface.co/stas) + +경험상 모델을 추가할 때 주의해야 할 가장 중요한 사항은 다음과 같습니다: + +- 같은 일을 반복하지 마세요! 새로운 🤗 Transformers 모델을 위해 추가할 코드의 대부분은 이미 🤗 Transformers 어딘가에 존재합니다. 이미 존재하는 복사할 수 있는 유사한 모델과 토크나이저를 찾는데 시간을 투자하세요. [grep](https://www.gnu.org/software/grep/)와 [rg](https://github.com/BurntSushi/ripgrep)를 참고하세요. 모델의 토크나이저가 한 모델을 기반으로 하고 모델링 코드가 다른 모델을 기반으로 하는 경우가 존재할 수도 있습니다. 예를 들어 FSMT의 모델링 코드는 BART를 기반으로 하고 FSMT의 토크나이저 코드는 XLM을 기반으로 합니다. +- 이것은 과학적인 도전보다는 공학적인 도전입니다. 논문의 모델의 모든 이론적 측면을 이해하려는 것보다 효율적인 디버깅 환경을 만드는 데 더 많은 시간을 소비해야 합니다. +- 막힐 때 도움을 요청하세요! 모델은 🤗 Transformers의 핵심 구성 요소이므로 Hugging Face의 우리는 당신이 모델을 추가하는 각 단계에서 기꺼이 도움을 줄 준비가 되어 있습니다. 진전이 없다고 느끼면 주저하지 말고 도움을 요청하세요. + +다음에서는 모델을 🤗 Transformers로 이식하는 데 가장 유용한 일반적인 절차를 제공하려고 노력합니다. + +다음 목록은 모델을 추가하는 데 수행해야 할 모든 작업의 요약이며 To-Do 목록으로 사용할 수 있습니다: + +☐ (선택 사항) BrandNewBert의 이론적 측면 이해
+☐ Hugging Face 개발 환경 준비
+☐ 원본 리포지토리의 디버깅 환경 설정
+☐ 원본 리포지토리와 체크포인트를 사용하여 `forward()` pass가 성공적으로 실행되는 스크립트 작성
+☐ 🤗 Transformers에 모델 스켈레톤 성공적으로 추가
+☐ 원본 체크포인트를 🤗 Transformers 체크포인트로 성공적으로 변환
+☐ 🤗 Transformers에서 원본 체크포인트와 동일한 출력을 내주는 `forward()` pass 성공적으로 실행
+☐ 🤗 Transformers에서 모델 테스트 완료
+☐ 🤗 Transformers에 토크나이저 성공적으로 추가
+☐ 종단 간 통합 테스트 실행
+☐ 문서 작성 완료
+☐ 모델 가중치를 허브에 업로드
+☐ Pull request 제출
+☐ (선택 사항) 데모 노트북 추가 + +우선, 일반적으로는 `BrandNewBert`의 이론적인 이해로 시작하는 것을 권장합니다. 그러나 이론적 측면을 직접 이해하는 대신 *직접 해보면서* 모델의 이론적 측면을 이해하는 것을 선호하는 경우 바로 `BrandNewBert` 코드 베이스로 빠져드는 것도 괜찮습니다. 이 옵션은 엔지니어링 기술이 이론적 기술보다 더 뛰어난 경우, `BrandNewBert`의 논문을 이해하는 데 어려움이 있는 경우, 또는 과학적인 논문을 읽는 것보다 프로그래밍에 훨씬 더 흥미 있는 경우에 더 적합할 수 있습니다. + +### 1. (선택 사항) BrandNewBert의 이론적 측면 [[1-optional-theoretical-aspects-of-brandnewbert]] + +만약 그런 서술적인 작업이 존재한다면, *BrandNewBert*의 논문을 읽어보는 시간을 가져야 합니다. 이해하기 어려운 섹션이 많을 수 있습니다. 그렇더라도 걱정하지 마세요! 목표는 논문의 깊은 이론적 이해가 아니라 *BrandNewBert*를 🤗 Transformers에서 효과적으로 재구현하기 위해 필요한 정보를 추출하는 것입니다. 이를 위해 이론적 측면에 너무 많은 시간을 투자할 필요는 없지만 다음과 같은 실제적인 측면에 집중해야 합니다: + +- *BrandNewBert*는 어떤 유형의 모델인가요? BERT와 유사한 인코더 모델인가요? GPT2와 유사한 디코더 모델인가요? BART와 유사한 인코더-디코더 모델인가요? 이들 간의 차이점에 익숙하지 않은 경우[model_summary](model_summary)를 참조하세요. +- *BrandNewBert*의 응용 분야는 무엇인가요? 텍스트 분류인가요? 텍스트 생성인가요? 요약과 같은 Seq2Seq 작업인가요? +- *brand_new_bert*와 BERT/GPT-2/BART의 차이점은 무엇인가요? +- *brand_new_bert*와 가장 유사한 [🤗 Transformers 모델](https://huggingface.co/transformers/#contents)은 무엇인가요? +- 어떤 종류의 토크나이저가 사용되나요? Sentencepiece 토크나이저인가요? Word piece 토크나이저인가요? BERT 또는 BART에 사용되는 동일한 토크나이저인가요? + +모델의 아키텍처에 대해 충분히 이해했다는 생각이 든 후, 궁금한 사항이 있으면 Hugging Face 팀에 문의하십시오. 이는 모델의 아키텍처, 어텐션 레이어 등에 관한 질문을 포함할 수 있습니다. Hugging Face의 유지 관리자들은 보통 코드를 검토하는 것에 대해 매우 기뻐하므로 당신을 돕는 일을 매우 환영할 것입니다! + +### 2. 개발 환경 설정 [[2-next-prepare-your-environment]] + +1. 저장소 페이지에서 "Fork" 버튼을 클릭하여 저장소의 사본을 GitHub 사용자 계정으로 만듭니다. + +2. `transformers` fork를 로컬 디스크에 클론하고 베이스 저장소를 원격 저장소로 추가합니다: + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. 개발 환경을 설정합니다. 다음 명령을 실행하여 개발 환경을 설정할 수 있습니다: + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +각 운영 체제에 따라 Transformers의 선택적 의존성이 개수가 증가하면 이 명령이 실패할 수 있습니다. 그런 경우에는 작업 중인 딥 러닝 프레임워크 (PyTorch, TensorFlow 및/또는 Flax)을 설치한 후, 다음 명령을 수행하면 됩니다: + +```bash +pip install -e ".[quality]" +``` + +대부분의 경우에는 이것으로 충분합니다. 그런 다음 상위 디렉토리로 돌아갑니다. + +```bash +cd .. +``` + +4. Transformers에 *brand_new_bert*의 PyTorch 버전을 추가하는 것을 권장합니다. PyTorch를 설치하려면 다음 링크의 지침을 따르십시오: https://pytorch.org/get-started/locally/. + +**참고:** CUDA를 설치할 필요는 없습니다. 새로운 모델이 CPU에서 작동하도록 만드는 것으로 충분합니다. + +5. *brand_new_bert*를 이식하기 위해서는 해당 원본 저장소에 접근할 수 있어야 합니다: + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . +``` + +이제 *brand_new_bert*를 🤗 Transformers로 이식하기 위한 개발 환경을 설정하였습니다. + +### 3.-4. 원본 저장소에서 사전 훈련된 체크포인트 실행하기 [[3.-4.-run-a-pretrained-checkpoint-using-the-original-repository]] + +먼저, 원본 *brand_new_bert* 저장소에서 작업을 시작합니다. 원본 구현은 보통 "연구용"으로 많이 사용됩니다. 즉, 문서화가 부족하고 코드가 이해하기 어려울 수 있습니다. 그러나 이것이 바로 *brand_new_bert*를 다시 구현하려는 동기가 되어야 합니다. Hugging Face에서의 주요 목표 중 하나는 **거인의 어깨 위에 서는 것**이며, 이는 여기에서 쉽게 해석되어 동작하는 모델을 가져와서 가능한 한 **접근 가능하고 사용자 친화적이며 아름답게** 만드는 것입니다. 이것은 🤗 Transformers에서 모델을 다시 구현하는 가장 중요한 동기입니다 - 새로운 복잡한 NLP 기술을 **모두에게** 접근 가능하게 만드는 것을 목표로 합니다. + +따라서 원본 저장소에 대해 자세히 살펴보는 것으로 시작해야 합니다. + +원본 저장소에서 공식 사전 훈련된 모델을 성공적으로 실행하는 것은 종종 **가장 어려운** 단계입니다. 우리의 경험에 따르면, 원본 코드 베이스에 익숙해지는 데 시간을 투자하는 것이 매우 중요합니다. 다음을 파악해야 합니다: + +- 사전 훈련된 가중치를 어디서 찾을 수 있는지? +- 사전 훈련된 가중치를 해당 모델에로드하는 방법은? +- 모델과 독립적으로 토크나이저를 실행하는 방법은? +- 간단한 forward pass에 필요한 클래스와 함수를 파악하기 위해 forward pass를 한 번 추적해 보세요. 일반적으로 해당 함수들만 다시 구현하면 됩니다. +- 모델의 중요한 구성 요소를 찾을 수 있어야 합니다. 모델 클래스는 어디에 있나요? 모델 하위 클래스(*EncoderModel*, *DecoderModel* 등)가 있나요? self-attention 레이어는 어디에 있나요? 
self-attention, cross-attention 등 여러 가지 다른 어텐션 레이어가 있나요? +- 원본 환경에서 모델을 디버그할 수 있는 방법은 무엇인가요? *print* 문을 추가해야 하나요? *ipdb*와 같은 대화식 디버거를 사용할 수 있나요? PyCharm과 같은 효율적인 IDE를 사용해 모델을 디버그할 수 있나요? + +원본 저장소에서 코드를 이식하는 작업을 시작하기 전에 원본 저장소에서 코드를 **효율적으로** 디버그할 수 있어야 합니다! 또한, 오픈 소스 라이브러리로 작업하고 있다는 것을 기억해야 합니다. 따라서 원본 저장소에서 issue를 열거나 pull request를 열기를 주저하지 마십시오. 이 저장소의 유지 관리자들은 누군가가 자신들의 코드를 살펴본다는 것에 대해 매우 기뻐할 것입니다! + +현재 시점에서, 원래 모델을 디버깅하기 위해 어떤 디버깅 환경과 전략을 선호하는지는 당신에게 달렸습니다. 우리는 고가의 GPU 환경을 구축하는 것은 비추천합니다. 대신, 원래 저장소로 들어가서 작업을 시작할 때와 🤗 Transformers 모델의 구현을 시작할 때에도 CPU에서 작업하는 것이 좋습니다. 모델이 이미 🤗 Transformers로 성공적으로 이식되었을 때에만 모델이 GPU에서도 예상대로 작동하는지 확인해야합니다. + +일반적으로, 원래 모델을 실행하기 위한 두 가지 가능한 디버깅 환경이 있습니다. + +- [Jupyter 노트북](https://jupyter.org/) / [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) +- 로컬 Python 스크립트 + +Jupyter 노트북의 장점은 셀 단위로 실행할 수 있다는 것입니다. 이는 논리적인 구성 요소를 더 잘 분리하고 중간 결과를 저장할 수 있으므로 디버깅 사이클이 더 빨라질 수 있습니다. 또한, 노트북은 다른 기여자와 쉽게 공유할 수 있으므로 Hugging Face 팀의 도움을 요청하려는 경우 매우 유용할 수 있습니다. Jupyter 노트북에 익숙하다면 이를 사용하는 것을 강력히 추천합니다. + +Jupyter 노트북의 단점은 사용에 익숙하지 않은 경우 새로운 프로그래밍 환경에 적응하는 데 시간을 할애해야 하며, `ipdb`와 같은 알려진 디버깅 도구를 더 이상 사용할 수 없을 수도 있다는 것입니다. + +각 코드 베이스에 대해 좋은 첫 번째 단계는 항상 **작은** 사전 훈련된 체크포인트를 로드하고 더미 정수 벡터 입력을 사용하여 단일 forward pass를 재현하는 것입니다. 이와 같은 스크립트는 다음과 같을 수 있습니다(의사 코드로 작성): + +```python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +``` + +다음으로, 디버깅 전략에 대해 일반적으로 다음과 같은 몇 가지 선택지가 있습니다: + +- 원본 모델을 많은 작은 테스트 가능한 구성 요소로 분해하고 각각에 대해 forward pass를 실행하여 검증합니다. +- 원본 모델을 원본 *tokenizer*과 원본 *model*로만 분해하고 해당 부분에 대해 forward pass를 실행한 후 검증을 위해 중간 출력(print 문 또는 중단점)을 사용합니다. + +다시 말하지만, 어떤 전략을 선택할지는 당신에게 달려 있습니다. 원본 코드 베이스에 따라 하나 또는 다른 전략이 유리할 수 있습니다. + +원본 코드 베이스를 모델의 작은 하위 구성 요소로 분해할 수 있는지 여부, 예를 들어 원본 코드 베이스가 즉시 실행 모드에서 간단히 실행될 수 있는 경우, 그런 경우에는 그 노력이 가치가 있다는 것이 일반적입니다. 초기에 더 어려운 방법을 선택하는 것에는 몇 가지 중요한 장점이 있습니다. + +- 원본 모델을 🤗 Transformers 구현과 비교할 때 각 구성 요소가 일치하는지 자동으로 확인할 수 있습니다. 즉, 시각적인 비교(print 문을 통한 비교가 아닌) 대신 🤗 Transformers 구현과 그에 대응하는 원본 구성 요소가 일치하는지 확인할 수 있습니다. +- 전체 모델을 모듈별로, 즉 작은 구성 요소로 분해함으로써 모델을 이식하는 큰 문제를 단순히 개별 구성 요소를 이식하는 작은 문제로 분해할 수 있으므로 작업을 더 잘 구조화할 수 있습니다. +- 모델을 논리적으로 의미 있는 구성 요소로 분리하는 것은 모델의 설계에 대한 더 나은 개요를 얻고 모델을 더 잘 이해하는 데 도움이 됩니다. +- 이러한 구성 요소별 테스트를 통해 코드를 변경하면서 회귀가 발생하지 않도록 보장할 수 있습니다. + +[Lysandre의 ELECTRA 통합 검사](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed)는 이를 수행하는 좋은 예제입니다. + +그러나 원본 코드 베이스가 매우 복잡하거나 중간 구성 요소를 컴파일된 모드에서 실행하는 것만 허용하는 경우, 모델을 테스트 가능한 작은 하위 구성 요소로 분해하는 것이 시간이 많이 소요되거나 불가능할 수도 있습니다. [T5의 MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) 라이브러리는 매우 복잡하며 모델을 하위 구성 요소로 분해하는 간단한 방법을 제공하지 않습니다. 이러한 라이브러리의 경우, 보통 print 문을 통해 확인합니다. + +어떤 전략을 선택하더라도 권장되는 절차는 동일합니다. 먼저 시작 레이어를 디버그하고 마지막 레이어를 마지막에 디버그하는 것이 좋습니다. + +다음 순서로 각 레이어의 출력을 검색하는 것이 좋습니다: + +1. 모델에 전달된 입력 ID 가져오기 +2. 워드 임베딩 가져오기 +3. 첫 번째 Transformer 레이어의 입력 가져오기 +4. 첫 번째 Transformer 레이어의 출력 가져오기 +5. 다음 n-1개의 Transformer 레이어의 출력 가져오기 +6. BrandNewBert 모델의 출력 가져오기 + +입력 ID는 정수 배열로 구성되며, 예를 들어 `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`와 같을 수 있습니다. 
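+
+예를 들어 원본 모델이 PyTorch의 `nn.Module`로 작성되어 있다면, 아래와 같은 스케치(의사 코드로 작성)처럼 forward hook을 등록해 위 순서의 중간 출력을 한 번에 수집해 둘 수 있습니다. `BrandNewBertModel`과 체크포인트 경로는 앞의 예시와 같은 가정이며, 실제 모듈 이름과 호출 방법은 원본 저장소의 구조에 따라 달라집니다.
+
+```python
+import torch
+
+# 나중에 🤗 Transformers 구현의 동일한 위치 출력과 비교할 수 있도록 중간 출력을 저장해 둡니다.
+intermediate_outputs = {}
+
+
+def save_output_hook(name):
+    def hook(module, inputs, output):
+        intermediate_outputs[name] = output
+
+    return hook
+
+
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+
+# 워드 임베딩, 각 Transformer 레이어 등 모든 서브모듈에 hook을 등록합니다.
+for name, module in model.named_modules():
+    module.register_forward_hook(save_output_hook(name))
+
+input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
+with torch.no_grad():
+    original_output = model(input_ids)
+
+# 모듈 이름은 모델마다 다르므로 먼저 어떤 이름으로 저장되었는지 확인합니다.
+print(intermediate_outputs.keys())
+```
+
+이렇게 수집한 중간 출력은 이후 🤗 Transformers 구현의 같은 위치 출력과 비교하는 데 사용할 수 있습니다.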
+ +다음 레이어의 출력은 종종 다차원 실수 배열로 구성되며, 다음과 같이 나타낼 수 있습니다: + +``` +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], +``` + +🤗 Transformers에 추가되는 모든 모델은 통합 테스트를 통과해야 합니다. 즉, 원본 모델과 🤗 Transformers의 재구현 버전이 0.001의 정밀도로 정확히 동일한 출력을 내야 합니다! 동일한 모델이 다른 라이브러리에서 작성되었을 때 라이브러리 프레임워크에 따라 약간 다른 출력을 얻는 것은 정상이므로 1e-3(0.001)의 오차는 허용합니다. 거의 동일한 출력을 내는 것만으로는 충분하지 않으며, 완벽히 일치하는 수준이어야 합니다. 따라서 🤗 Transformers 버전의 중간 출력을 *brand_new_bert*의 원래 구현의 중간 출력과 여러 번 비교해야 합니다. 이 경우 원본 저장소의 **효율적인** 디버깅 환경이 절대적으로 중요합니다. 디버깅 환경을 가능한 한 효율적으로 만드는 몇 가지 조언을 제시합니다. + +- 중간 결과를 디버그하는 가장 좋은 방법을 찾으세요. 원본 저장소가 PyTorch로 작성되었다면 원본 모델을 더 작은 하위 구성 요소로 분해하여 중간 값을 검색하는 긴 스크립트를 작성하는 것에 시간을 투자할 가치가 있습니다. 원본 저장소가 Tensorflow 1로 작성되었다면 [tf.print](https://www.tensorflow.org/api_docs/python/tf/print)와 같은 Tensorflow 출력 작업을 사용하여 중간 값을 출력해야 할 수도 있습니다. 원본 저장소가 Jax로 작성되었다면 forward pass를 실행할 때 모델이 **jit 되지 않도록** 해야 합니다. 예를 들어 [이 링크](https://github.com/google/jax/issues/196)를 확인해 보세요. +- 사용 가능한 가장 작은 사전 훈련된 체크포인트를 사용하세요. 체크포인트가 작을수록 디버그 사이클이 더 빨라집니다. 전반적으로 forward pass에 10초 이상이 걸리는 경우 효율적이지 않습니다. 매우 큰 체크포인트만 사용할 수 있는 경우, 새 환경에서 임의로 초기화된 가중치로 더미 모델을 만들고 해당 가중치를 🤗 Transformers 버전과 비교하기 위해 저장하는 것이 더 의미가 있을 수 있습니다. +- 디버깅 설정에서 가장 쉽게 forward pass를 호출하는 방법을 사용하세요. 원본 저장소에서 **단일** forward pass만 호출하는 함수를 찾는 것이 이상적입니다. 이 함수는 일반적으로 `predict`, `evaluate`, `forward`, `__call__`과 같이 호출됩니다. `autoregressive_sample`과 같은 텍스트 생성에서 `forward`를 여러 번 호출하여 텍스트를 생성하는 등의 작업을 수행하는 함수를 디버그하고 싶지 않을 것입니다. +- 토큰화 과정을 모델의 *forward* pass와 분리하려고 노력하세요. 원본 저장소에서 입력 문자열을 입력해야 하는 예제가 있는 경우, 입력 문자열이 입력 ID로 변경되는 순간을 찾아서 시작하세요. 이 경우 직접 ID를 입력할 수 있도록 작은 스크립트를 작성하거나 원본 코드를 수정해야 할 수도 있습니다. +- 디버깅 설정에서 모델이 훈련 모드가 아니라는 것을 확인하세요. 훈련 모드에서는 모델의 여러 드롭아웃 레이어 때문에 무작위 출력이 생성될 수 있습니다. 디버깅 환경에서 forward pass가 **결정론적**이도록 해야 합니다. 또는 동일한 프레임워크에 있는 경우 *transformers.utils.set_seed*를 사용하세요. + +다음 섹션에서는 *brand_new_bert*에 대해 이 작업을 수행하는 데 더 구체적인 세부 사항/팁을 제공합니다. + +### 5.-14. 🤗 Transformers에 BrandNewBert를 이식하기 [[5.-14.-port-brandnewbert-to-transformers]] + +이제, 마침내 🤗 Transformers에 새로운 코드를 추가할 수 있습니다. 🤗 Transformers 포크의 클론으로 이동하세요: + +```bash +cd transformers +``` + +다음과 같이 이미 존재하는 모델의 모델 아키텍처와 정확히 일치하는 모델을 추가하는 특별한 경우에는 [이 섹션](#write-a-conversion-script)에 설명된대로 변환 스크립트만 추가하면 됩니다. 이 경우에는 이미 존재하는 모델의 전체 모델 아키텍처를 그대로 재사용할 수 있습니다. + +그렇지 않으면 새로운 모델 생성을 시작합시다. 여기에서 두 가지 선택지가 있습니다: + +- `transformers-cli add-new-model-like`를 사용하여 기존 모델과 유사한 새로운 모델 추가하기 +- `transformers-cli add-new-model`을 사용하여 템플릿을 기반으로 한 새로운 모델 추가하기 (선택한 모델 유형에 따라 BERT 또는 Bart와 유사한 모습일 것입니다) + +두 경우 모두, 모델의 기본 정보를 입력하는 설문조사가 제시됩니다. 두 번째 명령어는 `cookiecutter`를 설치해야 합니다. 자세한 정보는 [여기](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model)에서 확인할 수 있습니다. + +**huggingface/transformers 메인 저장소에 Pull Request 열기** + +자동으로 생성된 코드를 수정하기 전에, 지금은 "작업 진행 중 (WIP)" 풀 리퀘스트를 열기 위한 시기입니다. 예를 들어, 🤗 Transformers에 "*brand_new_bert* 추가"라는 제목의 "[WIP] Add *brand_new_bert*" 풀 리퀘스트를 엽니다. 이렇게 하면 당신과 Hugging Face 팀이 🤗 Transformers에 모델을 통합하는 작업을 함께할 수 있습니다. + +다음을 수행해야 합니다: + +1. 메인 브랜치에서 작업을 잘 설명하는 이름으로 브랜치 생성 + +```bash +git checkout -b add_brand_new_bert +``` + +2. 자동으로 생성된 코드 커밋 + +```bash +git add . +git commit +``` + +3. 현재 메인을 가져오고 리베이스 + +```bash +git fetch upstream +git rebase upstream/main +``` + +4. 
변경 사항을 계정에 푸시 + +```bash +git push -u origin a-descriptive-name-for-my-changes +``` + +5. 만족스럽다면, GitHub에서 자신의 포크한 웹 페이지로 이동합니다. "Pull request"를 클릭합니다. Hugging Face 팀의 일부 멤버의 GitHub 핸들을 리뷰어로 추가하여 Hugging Face 팀이 앞으로의 변경 사항에 대해 알림을 받을 수 있도록 합니다. + +6. GitHub 풀 리퀘스트 웹 페이지 오른쪽에 있는 "Convert to draft"를 클릭하여 PR을 초안으로 변경합니다. + +다음으로, 어떤 진전을 이루었다면 작업을 커밋하고 계정에 푸시하여 풀 리퀘스트에 표시되도록 해야 합니다. 또한, 다음과 같이 현재 메인과 작업을 업데이트해야 합니다: + +```bash +git fetch upstream +git merge upstream/main +``` + +일반적으로, 모델 또는 구현에 관한 모든 질문은 자신의 PR에서 해야 하며, PR에서 토론되고 해결되어야 합니다. 이렇게 하면 Hugging Face 팀이 새로운 코드를 커밋하거나 질문을 할 때 항상 알림을 받을 수 있습니다. Hugging Face 팀에게 문제 또는 질문을 효율적으로 이해할 수 있도록 추가한 코드를 명시하는 것이 도움이 될 때가 많습니다. + +이를 위해, 변경 사항을 모두 볼 수 있는 "Files changed" 탭으로 이동하여 질문하고자 하는 줄로 이동한 다음 "+" 기호를 클릭하여 코멘트를 추가할 수 있습니다. 질문이나 문제가 해결되면, 생성된 코멘트의 "Resolve" 버튼을 클릭할 수 있습니다. + +마찬가지로, Hugging Face 팀은 코드를 리뷰할 때 코멘트를 남길 것입니다. 우리는 PR에서 대부분의 질문을 GitHub에서 묻는 것을 권장합니다. 공개에 크게 도움이 되지 않는 매우 일반적인 질문의 경우, Slack이나 이메일을 통해 Hugging Face 팀에게 문의할 수 있습니다. + +**5. brand_new_bert에 대해 생성된 모델 코드를 적용하기** + +먼저, 우리는 모델 자체에만 초점을 맞추고 토크나이저에 대해서는 신경 쓰지 않을 것입니다. 모든 관련 코드는 다음의 생성된 파일에서 찾을 수 있습니다: `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` 및 `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. + +이제 마침내 코딩을 시작할 수 있습니다 :). `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`의 생성된 코드는 인코더 전용 모델인 경우 BERT와 동일한 아키텍처를 가지거나, 인코더-디코더 모델인 경우 BART와 동일한 아키텍처를 가질 것입니다. 이 시점에서, 모델의 이론적 측면에 대해 배운 내용을 다시 상기해야 합니다: *모델이 BERT 또는 BART와 어떻게 다른가요?*. 자주 변경해야 하는 것은 *self-attention* 레이어, 정규화 레이어의 순서 등을 변경하는 것입니다. 다시 말하지만, 자신의 모델을 구현하는 데 도움이 되도록 Transformers에서 이미 존재하는 모델의 유사한 아키텍처를 살펴보는 것이 유용할 수 있습니다. + +**참고로** 이 시점에서, 코드가 완전히 정확하거나 깨끗하다고 확신할 필요는 없습니다. 오히려 처음에는 원본 코드의 첫 번째 *불완전하고* 복사된 버전을 `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`에 추가하는 것이 좋습니다. 필요한 모든 코드가 추가될 때까지 이러한 작업을 진행한 후, 다음 섹션에서 설명한 변환 스크립트를 사용하여 코드를 점진적으로 개선하고 수정하는 것이 훨씬 효율적입니다. 이 시점에서 작동해야 하는 유일한 것은 다음 명령이 작동하는 것입니다: + +```python +from transformers import BrandNewBertModel, BrandNewBertConfig + +model = BrandNewBertModel(BrandNewBertConfig()) +``` + +위의 명령은 `BrandNewBertConfig()`에 정의된 기본 매개변수에 따라 무작위 가중치로 모델을 생성하며, 이로써 모든 구성 요소의 `init()` 메서드가 작동함을 보장합니다. + +모든 무작위 초기화는 `BrandnewBertPreTrainedModel` 클래스의 `_init_weights` 메서드에서 수행되어야 합니다. 이 메서드는 구성 설정 변수에 따라 모든 리프 모듈을 초기화해야 합니다. BERT의 `_init_weights` 메서드 예제는 다음과 같습니다: + +```py +def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) +``` + +몇 가지 모듈에 대해 특별한 초기화가 필요한 경우 사용자 정의 방식을 사용할 수도 있습니다. 예를 들어, `Wav2Vec2ForPreTraining`에서 마지막 두 개의 선형 레이어는 일반적인 PyTorch `nn.Linear`의 초기화를 가져야 하지만, 다른 모든 레이어는 위와 같은 초기화를 사용해야 합니다. 
이는 다음과 같이 코드화됩니다: + +```py +def _init_weights(self, module): + """Initialize the weights""" + if isinstnace(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() +``` + +`_is_hf_initialized` 플래그는 서브모듈을 한 번만 초기화하도록 내부적으로 사용됩니다. `module.project_q` 및 `module.project_hid`에 대해 `True`로 설정함으로써, 우리가 수행한 사용자 정의 초기화가 이후에 덮어쓰이지 않도록 합니다. 즉, `_init_weights` 함수가 이들에게 적용되지 않습니다. + +**6. 변환 스크립트 작성하기** + +다음으로, 디버그에 사용한 체크포인트를 기존 저장소에서 만든 🤗 Transformers 구현과 호환되는 체크포인트로 변환할 수 있는 변환 스크립트를 작성해야 합니다. 변환 스크립트를 처음부터 작성하는 것보다는 *brand_new_bert*와 동일한 프레임워크로 작성된 유사한 모델을 변환한 기존 변환 스크립트를 찾아보는 것이 좋습니다. 일반적으로 기존 변환 스크립트를 복사하여 사용 사례에 맞게 약간 수정하는 것으로 충분합니다. 모델에 대해 유사한 기존 변환 스크립트를 어디에서 찾을 수 있는지 Hugging Face 팀에게 문의하는 것을 망설이지 마세요. + +- TensorFlow에서 PyTorch로 모델을 이전하는 경우, 좋은 참고 자료로 BERT의 변환 스크립트 [여기](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)를 참조할 수 있습니다. +- PyTorch에서 PyTorch로 모델을 이전하는 경우, 좋은 참고 자료로 BART의 변환 스크립트 [여기](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)를 참조할 수 있습니다. + +다음에서는 PyTorch 모델이 레이어 가중치를 저장하고 레이어 이름을 정의하는 방법에 대해 간단히 설명하겠습니다. PyTorch에서 레이어의 이름은 레이어에 지정한 클래스 속성의 이름으로 정의됩니다. 다음과 같이 PyTorch에서 `SimpleModel`이라는 더미 모델을 정의해 봅시다: + +```python +from torch import nn + + +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.dense = nn.Linear(10, 10) + self.intermediate = nn.Linear(10, 10) + self.layer_norm = nn.LayerNorm(10) +``` + +이제 이 모델 정의의 인스턴스를 생성할 수 있으며 `dense`, `intermediate`, `layer_norm` 등의 가중치가 랜덤하게 할당됩니다. 모델을 출력하여 아키텍처를 확인할 수 있습니다. + +```python +model = SimpleModel() + +print(model) +``` + +이는 다음과 같이 출력됩니다: + +``` +SimpleModel( + (dense): Linear(in_features=10, out_features=10, bias=True) + (intermediate): Linear(in_features=10, out_features=10, bias=True) + (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) +) +``` + +우리는 레이어의 이름이 PyTorch에서 클래스 속성의 이름으로 정의되어 있는 것을 볼 수 있습니다. 특정 레이어의 가중치 값을 출력하여 확인할 수 있습니다: + +```python +print(model.dense.weight.data) +``` + +가중치가 무작위로 초기화되었음을 확인할 수 있습니다. + +``` +tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, + -0.2077, 0.2157], + [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, + 0.2166, -0.0212], + [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, + -0.1023, -0.0447], + [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, + -0.1876, -0.2467], + [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, + 0.2577, 0.0402], + [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, + 0.2132, 0.1680], + [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, + 0.2707, -0.2509], + [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, + 0.1829, -0.1568], + [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, + 0.0333, -0.0536], + [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, + 0.2220, 0.2358]]). +``` + +변환 스크립트에서는 이러한 무작위로 초기화된 가중치를 체크포인트의 해당 레이어의 정확한 가중치로 채워야 합니다. 예를 들면 다음과 같습니다: + +```python +# retrieve matching layer weights, e.g. 
by +# recursive algorithm +layer_name = "dense" +pretrained_weight = array_of_dense_layer + +model_pointer = getattr(model, "dense") + +model_pointer.weight.data = torch.from_numpy(pretrained_weight) +``` + +이렇게 하면 PyTorch 모델의 무작위로 초기화된 각 가중치와 해당 체크포인트 가중치가 **모양과 이름** 모두에서 정확히 일치하는지 확인해야 합니다. 이를 위해 모양에 대한 assert 문을 추가하고 체크포인트 가중치의 이름을 출력해야 합니다. 예를 들어 다음과 같은 문장을 추가해야 합니다: + +```python +assert ( + model_pointer.weight.shape == pretrained_weight.shape +), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" +``` + +또한 두 가중치의 이름을 출력하여 일치하는지 확인해야 합니다. *예시*: + +```python +logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") +``` + +모양 또는 이름이 일치하지 않는 경우, 랜덤으로 초기화된 레이어에 잘못된 체크포인트 가중치를 할당한 것으로 추측됩니다. + +잘못된 모양은 `BrandNewBertConfig()`의 구성 매개변수 설정이 변환하려는 체크포인트에 사용된 설정과 정확히 일치하지 않기 때문일 가능성이 가장 큽니다. 그러나 PyTorch의 레이어 구현 자체에서 가중치를 전치해야 할 수도 있습니다. + +마지막으로, **모든** 필요한 가중치가 초기화되었는지 확인하고 초기화에 사용되지 않은 모든 체크포인트 가중치를 출력하여 모델이 올바르게 변환되었는지 확인해야 합니다. 잘못된 모양 문장이나 잘못된 이름 할당으로 인해 변환 시도가 실패하는 것은 완전히 정상입니다. 이는 `BrandNewBertConfig()`에서 잘못된 매개변수를 사용하거나 🤗 Transformers 구현에서 잘못된 아키텍처, 🤗 Transformers 구현의 구성 요소 중 하나의 `init()` 함수에 버그가 있는 경우이거나 체크포인트 가중치 중 하나를 전치해야 하는 경우일 가능성이 가장 높습니다. + +이 단계는 이전 단계와 함께 반복되어야 하며 모든 체크포인트의 가중치가 Transformers 모델에 올바르게 로드되었을 때까지 계속되어야 합니다. 🤗 Transformers 구현에 체크포인트를 올바르게 로드한 후에는 `/path/to/converted/checkpoint/folder`와 같은 원하는 폴더에 모델을 저장할 수 있어야 합니다. 해당 폴더에는 `pytorch_model.bin` 파일과 `config.json` 파일이 모두 포함되어야 합니다. + +```python +model.save_pretrained("/path/to/converted/checkpoint/folder") +``` + +**7. 순방향 패스 구현하기** + +🤗 Transformers 구현에 사전 훈련된 가중치를 정확하게 로드한 후에는 순방향 패스가 올바르게 구현되었는지 확인해야 합니다. [원본 저장소에 익숙해지기](#34-run-a-pretrained-checkpoint-using-the-original-repository)에서 이미 원본 저장소를 사용하여 모델의 순방향 패스를 실행하는 스크립트를 만들었습니다. 이제 원본 대신 🤗 Transformers 구현을 사용하는 유사한 스크립트를 작성해야 합니다. 다음과 같이 작성되어야 합니다: + +```python +model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") +input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] +output = model(input_ids).last_hidden_states +``` + +🤗 Transformers 구현과 원본 모델 구현이 처음부터 정확히 동일한 출력을 제공하지 않거나 순방향 패스에서 오류가 발생할 가능성이 매우 높습니다. 실망하지 마세요. 예상된 일입니다! 먼저, 순방향 패스에서 오류가 발생하지 않도록 해야 합니다. 종종 잘못된 차원이 사용되어 *차원 불일치* 오류가 발생하거나 잘못된 데이터 유형 개체가 사용되는 경우가 있습니다. 예를 들면 `torch.long` 대신에 `torch.float32`가 사용된 경우입니다. 해결할 수 없는 오류가 발생하면 Hugging Face 팀에 도움을 요청하는 것이 좋습니다. + +🤗 Transformers 구현이 올바르게 작동하는지 확인하는 마지막 단계는 출력이 `1e-3`의 정밀도로 동일한지 확인하는 것입니다. 먼저, 출력 모양이 동일하도록 보장해야 합니다. 즉, 🤗 Transformers 구현 스크립트와 원본 구현 사이에서 `outputs.shape`는 동일한 값을 반환해야 합니다. 그 다음으로, 출력 값이 동일하도록 해야 합니다. 이는 새로운 모델을 추가할 때 가장 어려운 부분 중 하나입니다. 출력이 동일하지 않은 일반적인 실수 사례는 다음과 같습니다: + +- 일부 레이어가 추가되지 않았습니다. 즉, *활성화* 레이어가 추가되지 않았거나 잔차 연결이 빠졌습니다. +- 단어 임베딩 행렬이 연결되지 않았습니다. +- 잘못된 위치 임베딩이 사용되었습니다. 원본 구현에서는 오프셋을 사용합니다. +- 순방향 패스 중에 Dropout이 적용되었습니다. 이를 수정하려면 *model.training이 False*인지 확인하고 순방향 패스 중에 Dropout 레이어가 잘못 활성화되지 않도록 하세요. 즉, [PyTorch의 기능적 Dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)에 *self.training*을 전달하세요. + +문제를 해결하는 가장 좋은 방법은 일반적으로 원본 구현과 🤗 Transformers 구현의 순방향 패스를 나란히 놓고 차이점이 있는지 확인하는 것입니다. 이상적으로는 순방향 패스의 중간 출력을 디버그/출력하여 원본 구현과 🤗 Transformers 구현의 정확한 위치를 찾을 수 있어야 합니다. 먼저, 두 스크립트의 하드코딩된 `input_ids`가 동일한지 확인하세요. 다음으로, `input_ids`의 첫 번째 변환의 출력(일반적으로 단어 임베딩)이 동일한지 확인하세요. 그런 다음 네트워크의 가장 마지막 레이어까지 진행해보세요. 어느 시점에서 두 구현 사이에 차이가 있는 것을 알게 되는데, 이는 🤗 Transformers 구현의 버그 위치를 가리킬 것입니다. 
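To make that side-by-side comparison concrete, a small helper like the sketch below can report how far apart two intermediate activations are. This is only an illustrative snippet, not part of the library: the helper name, the tolerance, and the dummy tensors are placeholders — in practice you would pass in the embedding output, per-layer hidden states, etc. captured from the original script and from the 🤗 Transformers script.

```python
import torch


def compare_intermediate(name, original_tensor, hf_tensor, atol=1e-3):
    """Report the maximum absolute difference between two intermediate activations."""
    max_diff = (original_tensor - hf_tensor).abs().max().item()
    is_close = torch.allclose(original_tensor, hf_tensor, atol=atol)
    print(f"{name}: max abs diff = {max_diff:.2e}, allclose(atol={atol}) = {is_close}")
    return is_close


# Dummy tensors standing in for e.g. the word embedding output of both implementations
original_embeddings = torch.randn(1, 9, 16)
hf_embeddings = original_embeddings + 1e-5 * torch.randn(1, 9, 16)
compare_intermediate("word embeddings", original_embeddings, hf_embeddings)
```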
저희 경험상으로는 원본 구현과 🤗 Transformers 구현 모두에서 동일한 위치에 많은 출력 문을 추가하고 이들의 중간 표현에 대해 동일한 값을 보이는 출력 문을 연속적으로 제거하는 것이 간단하고 효과적인 방법입니다. + +`torch.allclose(original_output, output, atol=1e-3)`로 출력을 확인하여 두 구현이 동일한 출력을 하는 것을 확신한다면, 가장 어려운 부분은 끝났습니다! 축하드립니다. 남은 작업은 쉬운 일이 될 것입니다 😊. + +**8. 필요한 모든 모델 테스트 추가하기** + +이 시점에서 새로운 모델을 성공적으로 추가했습니다. 그러나 해당 모델이 요구되는 디자인에 완전히 부합하지 않을 수도 있습니다. 🤗 Transformers와 완벽하게 호환되는 구현인지 확인하기 위해 모든 일반 테스트를 통과해야 합니다. Cookiecutter는 아마도 모델을 위한 테스트 파일을 자동으로 추가했을 것입니다. 아마도 `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`와 같은 경로에 위치할 것입니다. 이 테스트 파일을 실행하여 일반 테스트가 모두 통과하는지 확인하세요. + +```bash +pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py +``` + +모든 일반 테스트를 수정한 후, 이제 수행한 작업을 충분히 테스트하여 다음 사항을 보장해야 합니다. + +- a) 커뮤니티가 *brand_new_bert*의 특정 테스트를 살펴봄으로써 작업을 쉽게 이해할 수 있도록 함 +- b) 모델에 대한 향후 변경 사항이 모델의 중요한 기능을 손상시키지 않도록 함 + +먼저 통합 테스트를 추가해야 합니다. 이러한 통합 테스트는 이전에 모델을 🤗 Transformers로 구현하기 위해 사용한 디버깅 스크립트와 동일한 작업을 수행합니다. Cookiecutter에 이미 이러한 모델 테스트의 템플릿인 `BrandNewBertModelIntegrationTests`가 추가되어 있으며, 여러분이 작성해야 할 내용으로만 채워 넣으면 됩니다. 이러한 테스트가 통과하는지 확인하려면 다음을 실행하세요. + +```bash +RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +``` + + + +Windows를 사용하는 경우 `RUN_SLOW=1`을 `SET RUN_SLOW=1`로 바꿔야 합니다. + + + +둘째로, *brand_new_bert*에 특화된 모든 기능도 별도의 테스트에서 추가로 테스트해야 합니다. 이 부분은 종종 잊히는데, 두 가지 측면에서 굉장히 유용합니다. + +- *brand_new_bert*의 특수 기능이 어떻게 작동해야 하는지 보여줌으로써 커뮤니티에게 모델 추가 과정에서 습득한 지식을 전달하는 데 도움이 됩니다. +- 향후 기여자는 이러한 특수 테스트를 실행하여 모델에 대한 변경 사항을 빠르게 테스트할 수 있습니다. + + +**9. 토크나이저 구현하기** + +다음으로, *brand_new_bert*의 토크나이저를 추가해야 합니다. 보통 토크나이저는 🤗 Transformers의 기존 토크나이저와 동일하거나 매우 유사합니다. + +토크나이저가 올바르게 작동하는지 확인하기 위해 먼저 원본 리포지토리에서 문자열을 입력하고 `input_ids`를 반환하는 스크립트를 생성하는 것이 좋습니다. 다음과 같은 유사한 스크립트일 수 있습니다 (의사 코드로 작성): + +```python +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = model.tokenize(input_str) +``` + +원본 리포지토리를 자세히 살펴보고 올바른 토크나이저 함수를 찾거나, 복제본에서 변경 사항을 적용하여 `input_ids`만 출력하도록 해야 합니다. 원본 리포지토리를 사용하는 기능적인 토큰화 스크립트를 작성한 후, 🤗 Transformers의 유사한 스크립트를 생성해야 합니다. 다음과 같이 작성되어야 합니다: + +```python +from transformers import BrandNewBertTokenizer + +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." + +tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") + +input_ids = tokenizer(input_str).input_ids +``` + +두 개의 `input_ids`가 동일한 값을 반환할 때, 마지막 단계로 토크나이저 테스트 파일도 추가해야 합니다. + +*brand_new_bert*의 모델링 테스트 파일과 유사하게, *brand_new_bert*의 토크나이제이션 테스트 파일에는 몇 가지 하드코딩된 통합 테스트가 포함되어야 합니다. + +**10. 종단 간 통합 테스트 실행** + +토크나이저를 추가한 후에는 모델과 토크나이저를 사용하여 몇 가지 종단 간 통합 테스트를 추가해야 합니다. `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`에 추가해주세요. 이러한 테스트는 🤗 Transformers 구현이 예상대로 작동하는지를 의미 있는 text-to-text 예시로 보여줘야 합니다. 그 예시로는 *예를 들어* source-to-target 번역 쌍, article-to-summary 쌍, question-to-answer 쌍 등이 포함될 수 있습니다. 불러온 체크포인트 중 어느 것도 다운스트림 작업에서 미세 조정되지 않았다면, 모델 테스트만으로 충분합니다. 모델이 완전히 기능을 갖추었는지 확인하기 위해 마지막 단계로 GPU에서 모든 테스트를 실행하는 것이 좋습니다. 모델의 내부 텐서의 일부에 `.to(self.device)` 문을 추가하는 것을 잊었을 수 있으며, 이 경우 테스트에서 오류로 표시됩니다. GPU에 액세스할 수 없는 경우, Hugging Face 팀이 테스트를 대신 실행할 수 있습니다. + +**11. 기술문서 추가** + +이제 *brand_new_bert*에 필요한 모든 기능이 추가되었습니다. 거의 끝났습니다! 추가해야 할 것은 멋진 기술문서과 기술문서 페이지입니다. Cookiecutter가 `docs/source/model_doc/brand_new_bert.md`라는 템플릿 파일을 추가해줬을 것입니다. 이 페이지를 사용하기 전에 모델을 사용하는 사용자들은 일반적으로 이 페이지를 먼저 확인합니다. 
따라서 문서는 이해하기 쉽고 간결해야 합니다. 모델을 사용하는 방법을 보여주기 위해 *팁*을 추가하는 것이 커뮤니티에 매우 유용합니다. 독스트링에 관련하여 Hugging Face 팀에 문의하는 것을 주저하지 마세요. + +다음으로, `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`에 추가된 독스트링이 올바르며 필요한 모든 입력 및 출력을 포함하도록 확인하세요. [여기](writing-documentation)에서 우리의 문서 작성 가이드와 독스트링 형식에 대한 상세 가이드가 있습니다. 문서는 일반적으로 커뮤니티와 모델의 첫 번째 접점이기 때문에, 문서는 적어도 코드만큼의 주의를 기울여야 합니다. + +**코드 리팩토링** + +좋아요, 이제 *brand_new_bert*를 위한 모든 필요한 코드를 추가했습니다. 이 시점에서 다음을 실행하여 잠재적으로 잘못된 코드 스타일을 수정해야 합니다: + +그리고 코딩 스타일이 품질 점검을 통과하는지 확인하기 위해 다음을 실행하고 확인해야 합니다: + +```bash +make style +``` + +🤗 Transformers에는 여전히 실패할 수 있는 몇 가지 매우 엄격한 디자인 테스트가 있습니다. 이는 독스트링에 누락된 정보나 잘못된 명명 때문에 종종 발생합니다. 여기서 막히면 Hugging Face 팀이 도움을 줄 것입니다. + +```bash +make quality +``` + +마지막으로, 코드가 정확히 작동하는 것을 확인한 후에는 항상 코드를 리팩토링하는 것이 좋은 생각입니다. 모든 테스트가 통과된 지금은 추가한 코드를 다시 검토하고 리팩토링하는 좋은 시기입니다. + +이제 코딩 부분을 완료했습니다. 축하합니다! 🎉 멋져요! 😎 + +**12. 모델을 모델 허브에 업로드하세요** + +이 마지막 파트에서는 모든 체크포인트를 변환하여 모델 허브에 업로드하고 각 업로드된 모델 체크포인트에 대한 모델 카드를 추가해야 합니다. [Model sharing and uploading Page](model_sharing)를 읽고 허브 기능에 익숙해지세요. *brand_new_bert*의 저자 조직 아래에 모델을 업로드할 수 있는 필요한 액세스 권한을 얻기 위해 Hugging Face 팀과 협업해야 합니다. `transformers`의 모든 모델에 있는 `push_to_hub` 메서드는 체크포인트를 허브에 빠르고 효율적으로 업로드하는 방법입니다. 아래에 작은 코드 조각이 붙여져 있습니다: + +각 체크포인트에 적합한 모델 카드를 만드는 데 시간을 할애하는 것은 가치가 있습니다. 모델 카드는 체크포인트의 특성을 강조해야 합니다. *예를 들어* 이 체크포인트는 어떤 데이터셋에서 사전 훈련/세부 훈련되었는지? 이 모델은 어떤 하위 작업에서 사용해야 하는지? 그리고 모델을 올바르게 사용하는 방법에 대한 몇 가지 코드도 포함해야 합니다. + +```python +brand_new_bert.push_to_hub("brand_new_bert") +# Uncomment the following line to push to an organization. +# brand_new_bert.push_to_hub("/brand_new_bert") +``` + +**13. (선택 사항) 노트북 추가** + +*brand_new_bert*를 다운스트림 작업에서 추론 또는 미세 조정에 사용하는 방법을 자세히 보여주는 노트북을 추가하는 것이 매우 유용합니다. 이것은 PR을 병합하는 데 필수적이지는 않지만 커뮤니티에 매우 유용합니다. + +**14. 완료된 PR 제출** + +이제 프로그래밍을 마쳤으며, 마지막 단계로 PR을 메인 브랜치에 병합해야 합니다. 보통 Hugging Face 팀은 이미 여기까지 도움을 주었을 것입니다. 그러나 PR에 멋진 설명을 추가하고 리뷰어에게 특정 디자인 선택 사항을 강조하려면 완료된 PR에 약간의 설명을 추가하는 시간을 할애하는 것이 가치가 있습니다. + +### 작업물을 공유하세요!! [[share-your-work]] + +이제 커뮤니티에서 작업물을 인정받을 시간입니다! 모델 추가 작업을 완료하는 것은 Transformers와 전체 NLP 커뮤니티에 큰 기여입니다. 당신의 코드와 이식된 사전 훈련된 모델은 수백, 심지어 수천 명의 개발자와 연구원에 의해 확실히 사용될 것입니다. 당신의 작업에 자랑스러워해야 하며 이를 커뮤니티와 공유해야 합니다. + +**당신은 커뮤니티 내 모든 사람들에게 매우 쉽게 접근 가능한 또 다른 모델을 만들었습니다! 🤯** diff --git a/docs/source/ko/add_new_pipeline.md b/docs/source/ko/add_new_pipeline.md new file mode 100644 index 000000000000..554300928b51 --- /dev/null +++ b/docs/source/ko/add_new_pipeline.md @@ -0,0 +1,248 @@ + + +# 어떻게 사용자 정의 파이프라인을 생성하나요? [[how-to-create-a-custom-pipeline]] + +이 가이드에서는 사용자 정의 파이프라인을 어떻게 생성하고 [허브](hf.co/models)에 공유하거나 🤗 Transformers 라이브러리에 추가하는 방법을 살펴보겠습니다. + +먼저 파이프라인이 수용할 수 있는 원시 입력을 결정해야 합니다. +문자열, 원시 바이트, 딕셔너리 또는 가장 원하는 입력일 가능성이 높은 것이면 무엇이든 가능합니다. +이 입력을 가능한 한 순수한 Python 형식으로 유지해야 (JSON을 통해 다른 언어와도) 호환성이 좋아집니다. +이것이 전처리(`preprocess`) 파이프라인의 입력(`inputs`)이 될 것입니다. + +그런 다음 `outputs`를 정의하세요. +`inputs`와 같은 정책을 따르고, 간단할수록 좋습니다. +이것이 후처리(`postprocess`) 메소드의 출력이 될 것입니다. + +먼저 4개의 메소드(`preprocess`, `_forward`, `postprocess` 및 `_sanitize_parameters`)를 구현하기 위해 기본 클래스 `Pipeline`을 상속하여 시작합니다. 
+ + +```python +from transformers import Pipeline + + +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + + def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor(...)} + return outputs + + def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class +``` + +이 분할 구조는 CPU/GPU에 대한 비교적 원활한 지원을 제공하는 동시에, 다른 스레드에서 CPU에 대한 사전/사후 처리를 수행할 수 있게 지원하는 것입니다. + +`preprocess`는 원래 정의된 입력을 가져와 모델에 공급할 수 있는 형식으로 변환합니다. +더 많은 정보를 포함할 수 있으며 일반적으로 `Dict` 형태입니다. + +`_forward`는 구현 세부 사항이며 직접 호출할 수 없습니다. +`forward`는 예상 장치에서 모든 것이 작동하는지 확인하기 위한 안전장치가 포함되어 있어 선호되는 호출 메소드입니다. +실제 모델과 관련된 것은 `_forward` 메소드에 속하며, 나머지는 전처리/후처리 과정에 있습니다. + +`postprocess` 메소드는 `_forward`의 출력을 가져와 이전에 결정한 최종 출력 형식으로 변환합니다. + +`_sanitize_parameters`는 초기화 시간에 `pipeline(...., maybe_arg=4)`이나 호출 시간에 `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`과 같이, 사용자가 원하는 경우 언제든지 매개변수를 전달할 수 있도록 허용합니다. + +`_sanitize_parameters`의 반환 값은 `preprocess`, `_forward`, `postprocess`에 직접 전달되는 3개의 kwargs 딕셔너리입니다. +호출자가 추가 매개변수로 호출하지 않았다면 아무것도 채우지 마십시오. +이렇게 하면 항상 더 "자연스러운" 함수 정의의 기본 인수를 유지할 수 있습니다. + +분류 작업에서 `top_k` 매개변수가 대표적인 예입니다. + +```python +>>> pipe = pipeline("my-new-task") +>>> pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] + +>>> pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +``` + +이를 달성하기 위해 우리는 `postprocess` 메소드를 기본 매개변수인 `5`로 업데이트하고 `_sanitize_parameters`를 수정하여 이 새 매개변수를 허용합니다. + + +```python +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # top_k를 처리하는 로직 추가 + return best_class + + +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + + postprocess_kwargs = {} + if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] + return preprocess_kwargs, {}, postprocess_kwargs +``` + +입/출력을 가능한 한 간단하고 완전히 JSON 직렬화 가능한 형식으로 유지하려고 노력하십시오. +이렇게 하면 사용자가 새로운 종류의 개체를 이해하지 않고도 파이프라인을 쉽게 사용할 수 있습니다. +또한 사용 용이성을 위해 여러 가지 유형의 인수(오디오 파일은 파일 이름, URL 또는 순수한 바이트일 수 있음)를 지원하는 것이 비교적 일반적입니다. + + + +## 지원되는 작업 목록에 추가하기 [[adding-it-to-the-list-of-supported-tasks]] + +`new-task`를 지원되는 작업 목록에 등록하려면 `PIPELINE_REGISTRY`에 추가해야 합니다: + +```python +from transformers.pipelines import PIPELINE_REGISTRY + +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) +``` + +원하는 경우 기본 모델을 지정할 수 있으며, 이 경우 특정 개정(분기 이름 또는 커밋 해시일 수 있음, 여기서는 "abcdef")과 타입을 함께 가져와야 합니다: + +```python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # 현재 지원 유형: text, audio, image, multimodal +) +``` + +## Hub에 파이프라인 공유하기 [[share-your-pipeline-on-the-hub]] + +Hub에 사용자 정의 파이프라인을 공유하려면 `Pipeline` 하위 클래스의 사용자 정의 코드를 Python 파일에 저장하기만 하면 됩니다. 
+예를 들어, 다음과 같이 문장 쌍 분류를 위한 사용자 정의 파이프라인을 사용한다고 가정해 보겠습니다: + +```py +import numpy as np + +from transformers import Pipeline + + +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} + + def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + + def _forward(self, model_inputs): + return self.model(**model_inputs) + + def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} +``` + +구현은 프레임워크에 구애받지 않으며, PyTorch와 TensorFlow 모델에 대해 작동합니다. +이를 `pair_classification.py`라는 파일에 저장한 경우, 다음과 같이 가져오고 등록할 수 있습니다: + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +이 작업이 완료되면 사전훈련된 모델과 함께 사용할 수 있습니다. +예를 들어, `sgugger/finetuned-bert-mrpc`은 MRPC 데이터 세트에서 미세 조정되어 문장 쌍을 패러프레이즈인지 아닌지를 분류합니다. + +```py +from transformers import pipeline + +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +``` + +그런 다음 `Repository`의 `save_pretrained` 메소드를 사용하여 허브에 공유할 수 있습니다: + +```py +from huggingface_hub import Repository + +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() +``` + +이렇게 하면 "test-dynamic-pipeline" 폴더 내에 `PairClassificationPipeline`을 정의한 파일이 복사되며, 파이프라인의 모델과 토크나이저도 저장한 후, `{your_username}/test-dynamic-pipeline` 저장소에 있는 모든 것을 푸시합니다. +이후에는 `trust_remote_code=True` 옵션만 제공하면 누구나 사용할 수 있습니다. + +```py +from transformers import pipeline + +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +``` + +## 🤗 Transformers에 파이프라인 추가하기 [[add-the-pipeline-to-transformers]] + +🤗 Transformers에 사용자 정의 파이프라인을 기여하려면, `pipelines` 하위 모듈에 사용자 정의 파이프라인 코드와 함께 새 모듈을 추가한 다음, `pipelines/__init__.py`에서 정의된 작업 목록에 추가해야 합니다. + +그런 다음 테스트를 추가해야 합니다. +`tests/test_pipelines_MY_PIPELINE.py`라는 새 파일을 만들고 다른 테스트와 예제를 함께 작성합니다. + +`run_pipeline_test` 함수는 매우 일반적이며, `model_mapping` 및 `tf_model_mapping`에서 정의된 가능한 모든 아키텍처의 작은 무작위 모델에서 실행됩니다. + +이는 향후 호환성을 테스트하는 데 매우 중요하며, 누군가 `XXXForQuestionAnswering`을 위한 새 모델을 추가하면 파이프라인 테스트가 해당 모델에서 실행을 시도한다는 의미입니다. +모델이 무작위이기 때문에 실제 값을 확인하는 것은 불가능하므로, 단순히 파이프라인 출력 `TYPE`과 일치시키기 위한 도우미 `ANY`가 있습니다. + +또한 2개(이상적으로는 4개)의 테스트를 구현해야 합니다. + +- `test_small_model_pt`: 이 파이프라인에 대한 작은 모델 1개를 정의(결과가 의미 없어도 상관없음)하고 파이프라인 출력을 테스트합니다. +결과는 `test_small_model_tf`와 동일해야 합니다. +- `test_small_model_tf`: 이 파이프라인에 대한 작은 모델 1개를 정의(결과가 의미 없어도 상관없음)하고 파이프라인 출력을 테스트합니다. +결과는 `test_small_model_pt`와 동일해야 합니다. 
+- `test_large_model_pt`(`선택사항`): 결과가 의미 있을 것으로 예상되는 실제 파이프라인에서 파이프라인을 테스트합니다. +이러한 테스트는 속도가 느리므로 이를 표시해야 합니다. +여기서의 목표는 파이프라인을 보여주고 향후 릴리즈에서의 변화가 없는지 확인하는 것입니다. +- `test_large_model_tf`(`선택사항`): 결과가 의미 있을 것으로 예상되는 실제 파이프라인에서 파이프라인을 테스트합니다. +이러한 테스트는 속도가 느리므로 이를 표시해야 합니다. +여기서의 목표는 파이프라인을 보여주고 향후 릴리즈에서의 변화가 없는지 확인하는 것입니다. diff --git a/docs/source/ko/add_tensorflow_model.md b/docs/source/ko/add_tensorflow_model.md new file mode 100644 index 000000000000..378f2163b5db --- /dev/null +++ b/docs/source/ko/add_tensorflow_model.md @@ -0,0 +1,262 @@ + + +# 어떻게 🤗 Transformers 모델을 TensorFlow로 변환하나요? [[how-to-convert-a-transformers-model-to-tensorflow]] + +🤗 Transformers에서처럼 사용할 수 있는 여러 가지 프레임워크가 있다는 것은 애플리케이션을 설계할 때 그들의 강점을 유연하게 이용할 수 있다는 장점이 있지만, 모델 별로 호환성을 추가해야 한다는 단점 또한 존재한다는 것을 의미합니다. 좋은 소식은 기존 모델에 TensorFlow 호환성을 추가하는 것이 [처음부터 새로운 모델을 추가하는 것](add_new_model)보다도 간단하다는 것입니다! + +만약 대규모 TensorFlow 모델을 더 깊이 이해하려거나, 오픈 소스에 큰 기여를 하려거나, 선택한 모델에 Tensorflow를 활용하려한다면, 이 안내서는 여러분께 도움이 될 것입니다. + +이 가이드는 Hugging Face 팀의 최소한의 감독 아래에서 🤗 Transformers에서 사용되는 TensorFlow 모델 가중치와/또는 아키텍처를 기여할 수 있는 커뮤니티 구성원인 여러분을 대상으로 합니다. +새로운 모델을 작성하는 것은 쉬운 일이 아니지만, 이 가이드를 통해 조금 덜 힘들고 훨씬 쉬운 작업으로 만들 수 있습니다. +모두의 경험을 모으는 것은 이 작업을 점차적으로 더 쉽게 만드는 데 굉장히 중요하기 때문에, 이 가이드를 개선시킬만한 제안이 떠오르면 공유하시는걸 적극적으로 권장합니다! + +더 깊이 알아보기 전에, 🤗 Transformers를 처음 접하는 경우 다음 자료를 확인하는 것이 좋습니다: +- [🤗 Transformers의 일반 개요](add_new_model#general-overview-of-transformers) +- [Hugging Face의 TensorFlow 철학](https://huggingface.co/blog/tensorflow-philosophy) + +이 가이드의 나머지 부분에서는 새로운 TensorFlow 모델 아키텍처를 추가하는 데 필요한 단계, Pytorch를 TensorFlow 모델 가중치로 변환하는 절차 및 ML 프레임워크 간의 불일치를 효율적으로 디버깅하는 방법을 알게 될 것입니다. 시작해봅시다! + + + +사용하려는 모델이 이미 해당하는 TensorFlow 아키텍처가 있는지 확실하지 않나요? + +선택한 모델([예](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14))의 `config.json`의 `model_type` 필드를 확인해보세요. 🤗 Transformers의 해당 모델 폴더에는 "modeling_tf"로 시작하는 파일이 있는 경우, 해당 모델에는 해당 TensorFlow 아키텍처([예](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert))가 있다는 의미입니다. + + + +## TensorFlow 모델 아키텍처 코드 추가하는 단계별 가이드 [[step-by-step-guide-to add-tensorFlow-model-architecture-code]] + +대규모 아키텍처를 가진 모델을 설계하는 방법에는 여러가지가 있으며, 해당 설계를 구현하는 방법도 여러 가지입니다. +그러나 우리는 [🤗 Transformers 일반 개요](add_new_model#general-overview-of-transformers)에서 언급한 대로 일관된 설계 선택에 따라야지만 🤗 Transformers를 사용하기 편할 것이라는 확고한 의견을 가지고 있습니다. +우리의 경험을 통해 TensorFlow 모델을 추가하는 데 관련된 중요한 몇 가지 사항을 알려 드릴 수 있습니다: + +- 이미 있는걸 다시 개발하려 하지 마세요! 최소한 2개의 이미 구현된 모델을 대개 참조해야 합니다. 구현하려는 모델과 기능상 동일한 Pytorch 모델 하나와 같은 문제 유형을 풀고 있는 다른 TensorFlow 모델 하나를 살펴보세요. +- 우수한 모델 구현은 시간이 지나도 남아있습니다. 이것은 코드가 아름답다는 이유가 아니라 코드가 명확하고 디버깅 및 개선이 쉽기 때문입니다. TensorFlow 구현에서 다른 모델들과 패턴을 똑같이 하고 Pytorch 구현과의 불일치를 최소화하여 메인테이너의 업무를 쉽게 한다면, 기여한 코드가 오래도록 유지될 수 있습니다. +- 필요하다면 도움을 요청하세요! 🤗 Transformers 팀은 여러분을 돕기 위해 있으며, 여러분이 직면한 동일한 문제에 대한 해결책을 이미 찾은 경우도 있을 수 있습니다. + +TensorFlow 모델 아키텍처를 추가하는 데 필요한 단계를 개략적으로 써보면: +1. 변환하려는 모델 선택 +2. transformers 개발 환경 준비 +3. (선택 사항) 이론적 측면 및 기존 구현 이해 +4. 모델 아키텍처 구현 +5. 모델 테스트 구현 +6. PR (pull request) 제출 +7. (선택 사항) 데모 빌드 및 공유 + +### 1.-3. 모델 기여 준비 [[1.-3.-prepare-your-model-contribution]] + +**1. 변환하려는 모델 선택** + +우선 기본 사항부터 시작해 보겠습니다. 먼저 변환하려는 아키텍처를 알아야 합니다. +특정 아키텍처에 대한 관심 없는 경우, 🤗 Transformers 팀에게 제안을 요청하는 것은 여러분의 영향력을 극대화하는 좋은 방법입니다. +우리는 TensorFlow에서 빠져 있는 가장 유명한 아키텍처로 이끌어 드리겠습니다. +TensorFlow에서 사용할 모델이 이미 🤗 Transformers에 TensorFlow 아키텍처 구현이 있지만 가중치가 없는 경우, +이 페이지의 [가중치 추가 섹션](#adding-tensorflow-weights-to-hub)으로 바로 이동하셔도 됩니다. + +간단히 말해서, 이 안내서의 나머지 부분은 TensorFlow 버전의 *BrandNewBert*([가이드](add_new_model)와 동일한 예제)를 기여하려고 결정했다고 가정합니다. 
+ + + +TensorFlow 모델 아키텍처에 작업을 시작하기 전에 해당 작업이 진행 중인지 확인하세요. +`BrandNewBert`를 검색하여 +[pull request GitHub 페이지](https://github.com/huggingface/transformers/pulls?q=is%3Apr)에서 TensorFlow 관련 pull request가 없는지 확인할 수 있습니다. + + + +**2. transformers 개발 환경 준비** + + +모델 아키텍처를 선택한 후, 관련 작업을 수행할 의도를 미리 알리기 위해 Draft PR을 여세요. 아래 지침대로 하시면 환경을 설정하고 Draft PR을 열 수 있습니다. + +1. 'Fork' 버튼을 클릭하여 [리포지터리](https://github.com/huggingface/transformers)를 포크하세요. 이렇게 하면 GitHub 사용자 계정에 코드의 사본이 생성됩니다. + + +2. `transformers` 포크를 로컬 디스크에 클론하고 원본 리포지터리를 원격 리포지터리로 추가하세요. + +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` + +3. 개발 환경을 설정하세요. 예를 들어, 다음 명령을 실행하여 개발 환경을 설정할 수 있습니다. + +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +운영 체제에 따라서 Transformers의 선택적 종속성이 증가하면서 위 명령이 실패할 수도 있습니다. 그런 경우 TensorFlow를 설치한 후 다음을 실행하세요. + +```bash +pip install -e ".[quality]" +``` + +**참고:** CUDA를 설치할 필요는 없습니다. 새로운 모델이 CPU에서 작동하도록 만드는 것만으로 충분합니다. + +4. 메인 브랜치에서 만드려는 기능이 잘 표현되는 이름으로 브랜치를 만듭니다. + +```bash +git checkout -b add_tf_brand_new_bert +``` + +5. 메인 브랜치의 현재 상태를 페치(fetch)하고 리베이스하세요. + +```bash +git fetch upstream +git rebase upstream/main +``` + +6. `transformers/src/models/brandnewbert/`에 `modeling_tf_brandnewbert.py`라는 빈 `.py` 파일을 추가하세요. 이 파일이 TensorFlow 모델 파일이 될 것입니다. + +7. 변경 사항을 계정에 푸시하세요. + +```bash +git add . +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert +``` + +8. 만족스러운 경우 GitHub에서 포크된 웹 페이지로 이동합니다. "Pull request"를 클릭합니다. Hugging Face 팀의 GitHub ID를 리뷰어로 추가해서, 앞으로의 변경 사항에 대해 Hugging Face 팀이 알림을 받을 수 있도록 합니다. + + +9. GitHub Pull Requests 페이지의 오른쪽에 있는 "Convert to draft"를 클릭하여 PR을 초안으로 변경하세요. + +이제 🤗 Transformers에서 *BrandNewBert*를 TensorFlow로 변환할 개발 환경을 설정했습니다. + + +**3. (선택 사항) 이론적 측면 및 기존 구현 이해** + + +*BrandNewBert*처럼 자세한 글이 있다면 시간을 내어 논문을 읽는걸 추천드립니다. 이해하기 어려운 부분이 많을 수 있습니다. 그렇다고 해서 걱정하지 마세요! 목표는 논문의 심도있는 이론적 이해가 아니라 TensorFlow를 사용하여 🤗 Transformers에 모델을 효과적으로 다시 구현하는 데 필요한 필수 정보를 추출하는 것입니다. 많은 시간을 이론적 이해에 투자할 필요는 없지만 실용적인 측면에서 현재 존재하는 모델 문서 페이지(e.g. [model docs for BERT](model_doc/bert))에 집중하는 것이 좋습니다. + + +모델의 기본 사항을 이해한 후, 기존 구현을 이해하는 것이 중요합니다. 이는 작업 중인 모델에 대한 실제 구현이 여러분의 기대와 일치함을 확인하고, TensorFlow 측면에서의 기술적 문제를 예상할 수 있습니다. + +막대한 양의 정보를 처음으로 학습할 때 압도당하는 것은 자연스러운 일입니다. 이 단계에서 모델의 모든 측면을 이해해야 하는 필요는 전혀 없습니다. 그러나 우리는 Hugging Face의 [포럼](https://discuss.huggingface.co/)을 통해 질문이 있는 경우 대답을 구할 것을 권장합니다. + +### 4. 모델 구현 [[4-model-implementation]] + + +이제 드디어 코딩을 시작할 시간입니다. 우리의 제안된 시작점은 PyTorch 파일 자체입니다: `modeling_brand_new_bert.py`의 내용을 +`src/transformers/models/brand_new_bert/` 내부의 +`modeling_tf_brand_new_bert.py`에 복사합니다. 이 섹션의 목표는 파일을 수정하고 🤗 Transformers의 import 구조를 업데이트하여 `TFBrandNewBert` 및 `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`가 성공적으로 작동하는 TensorFlow *BrandNewBert* 모델을 가져올 수 있도록 하는 것입니다. + +유감스럽게도, PyTorch 모델을 TensorFlow로 변환하는 규칙은 없습니다. 그러나 프로세스를 가능한한 원활하게 만들기 위해 다음 팁을 따를 수 있습니다. + +- 모든 클래스 이름 앞에 `TF`를 붙입니다(예: `BrandNewBert`는 `TFBrandNewBert`가 됩니다). +- 대부분의 PyTorch 작업에는 직접적인 TensorFlow 대체가 있습니다. 예를 들어, `torch.nn.Linear`는 `tf.keras.layers.Dense`에 해당하고, `torch.nn.Dropout`은 `tf.keras.layers.Dropout`에 해당합니다. 특정 작업에 대해 확신이 없는 경우 [TensorFlow 문서](https://www.tensorflow.org/api_docs/python/tf)나 [PyTorch 문서](https://pytorch.org/docs/stable/)를 참조할 수 있습니다. +- 🤗 Transformers 코드베이스에서 패턴을 찾으세요. 직접적인 대체가 없는 특정 작업을 만나면 다른 사람이 이미 동일한 문제를 해결한 경우가 많습니다. +- 기본적으로 PyTorch와 동일한 변수 이름과 구조를 유지하세요. 이렇게 하면 디버깅과 문제 추적, 그리고 문제 해결 추가가 더 쉬워집니다. 
+- 일부 레이어는 각 프레임워크마다 다른 기본값을 가지고 있습니다. 대표적인 예로 배치 정규화 레이어의 epsilon은 [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d)에서 `1e-5`이고 [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)에서 `1e-3`입니다. 문서를 모두 확인하세요! +- PyTorch의 `nn.Parameter` 변수는 일반적으로 TF 레이어의 `build()` 내에서 초기화해야 합니다. 다음 예를 참조하세요: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) / + [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220) +- PyTorch 모델의 함수 상단에 `#copied from ...`가 있는 경우, TensorFlow 모델에 TensorFlow 아키텍처가 있다면 TensorFlow 모델이 해당 함수를 복사한 아키텍처에서 사용할 수 있습니다. +- TensorFlow 함수에서 `name` 속성을 올바르게 할당하는 것은 `from_pt=True` 가중치 교차 로딩을 수행하는 데 중요합니다. `name`은 대부분 PyTorch 코드의 해당 변수의 이름입니다. `name`이 제대로 설정되지 않으면 모델 가중치를 로드할 때 오류 메시지에서 확인할 수 있습니다. +- 기본 모델 클래스인 `BrandNewBertModel`의 로직은 실제로 Keras 레이어 서브클래스([예시](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719))인 `TFBrandNewBertMainLayer`에 있습니다. `TFBrandNewBertModel`은 이 레이어를 감싸기만 하는 래퍼 역할을 합니다. +- Keras 모델은 사전 훈련된 가중치를 로드하기 위해 빌드되어야 합니다. 따라서 `TFBrandNewBertPreTrainedModel`은 모델의 입력 예제인 `dummy_inputs`([예시](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)) 유지해야 합니다. +- 도움이 필요한 경우 도움을 요청하세요. 우리는 여기 있어서 도움을 드리기 위해 있는 것입니다! 🤗 + +모델 파일 자체 외에도 모델 클래스 및 관련 문서 페이지에 대한 포인터를 추가해야 합니다. 이 부분은 다른 PR([예시](https://github.com/huggingface/transformers/pull/18020/files))의 패턴을 따라 완전히 완료할 수 있습니다. 다음은 필요한 수동 변경 목록입니다. + +- `src/transformers/__init__.py`에 *BrandNewBert*의 모든 공개 클래스를 포함합니다. +- `src/transformers/models/auto/modeling_tf_auto.py`에서 *BrandNewBert* 클래스를 해당 Auto 클래스에 추가합니다. +- `src/transformers/utils/dummy_tf_objects.py`에 *BrandNewBert*와 관련된 레이지 로딩 클래스를 추가합니다. +- `src/transformers/models/brand_new_bert/__init__.py`에서 공개 클래스에 대한 import 구조를 업데이트합니다. +- `docs/source/en/model_doc/brand_new_bert.md`에서 *BrandNewBert*의 공개 메서드에 대한 문서 포인터를 추가합니다. +- `docs/source/en/model_doc/brand_new_bert.md`의 *BrandNewBert* 기여자 목록에 자신을 추가합니다. +- 마지막으로 ✅ 녹색 체크박스를 TensorFlow 열 docs/source/en/index.md 안 BrandNewBert에 추가합니다. + +구현이 만족하면 다음 체크리스트를 실행하여 모델 아키텍처가 준비되었는지 확인하세요. + +1. 훈련 시간에 다르게 동작하는 `training` 인수로 불리는 모든 레이어(예: Dropout)는 최상위 클래스에서 전파됩니다. +2. #copied from ...가능할 때마다 사용했습니다. +3. `TFBrandNewBertMainLayer`와 그것을 사용하는 모든 클래스는 `call`함수로 `@unpack_inputs`와 함께 데코레이터 됩니다. +4. `TFBrandNewBertMainLayer`는 `@keras_serializable`로 데코레이터 됩니다. +5. TensorFlow 모델은 `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`를 사용하여 PyTorch 가중치에서 로드할 수 있습니다. +6. 예상 입력 형식을 사용하여 TensorFlow 모델을 호출할 수 있습니다. + +### 5. 모델 테스트 구현 [[5-add-model-tests]] + +TensorFlow 모델 아키텍처를 구현하는 데 성공했습니다! 이제 TensorFlow 모델을 테스트하는 구현을 작성할 차례입니다. 이를 통해 모델이 예상대로 작동하는지 확인할 수 있습니다. 이전에 우리는 `test_modeling_brand_new_bert.py` 파일을 `tests/models/brand_new_bert/ into test_modeling_tf_brand_new_bert.py`에 복사한 뒤, TensorFlow로 교체하는 것이 좋습니다. 지금은, 모든 `.from_pretrained()`을 `from_pt=True`를 사용하여 존재하는 Pytorch 가중치를 가져오도록 해야합니다. + +완료하셨으면, 이제 진실의 순간이 찾아왔습니다: 테스트를 실행해 보세요! 😬 + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +오류가 많이 나타날 것이지만 괜찮습니다! 
기계 학습 모델을 디버깅하는 것은 악명높게 어려우며 성공의 핵심 요소는 인내심입니다 (`breakpoint()`도 필요합니다). 우리의 경험상으로는 ML 프레임워크 사이의 미묘한 불일치로 인해 가장 어려운 문제가 발생합니다. 이에 대한 몇 가지 지침이 이 가이드의 끝 부분에 있습니다. 다른 경우에는 일반 테스트가 직접 모델에 적용되지 않을 수 있으며, 이 경우 모델 테스트 클래스 레벨에서 재정의를 제안합니다. 문제가 무엇이든지 상관없이 문제가 있으면 당신이 고립되었다면 draft pull request에서 도움을 요청하는 것이 좋습니다. + +모든 테스트가 통과되면 축하합니다. 이제 모델을 🤗 Transformers 라이브러리에 추가할 준비가 거의 완료된 것입니다! 🎉 + + +테스트를 추가하는 방법에 대한 자세한 내용은 [🤗 Transformers의 테스트 가이드](https://huggingface.co/transformers/contributing.html#running-tests)를 참조하세요. + +### 6.-7. 모든 사용자가 당신의 모델을 사용할 수 있게 하기 [[6.-7.-ensure-everyone -can-use-your-model]] + +**6. 풀 요청 제출하기** + +구현과 테스트가 완료되면 풀 요청을 제출할 시간입니다. 코드를 푸시하기 전에 코드 서식 맞추기 유틸리티인 `make fixup` 🪄 를 실행하세요. 이렇게 하면 자동으로 서식 오류를 수정하며 자동 검사가 실패하는 것을 방지할 수 있습니다. + +이제 드래프트 풀 요청을 실제 풀 요청으로 변환하는 시간입니다. "리뷰 준비됨" 버튼을 클릭하고 Joao (`@gante`)와 Matt (`@Rocketknight1`)를 리뷰어로 추가하세요. 모델 풀 요청에는 적어도 3명의 리뷰어가 필요하지만, 그들이 당신의 모델에 적절한 추가 리뷰어를 찾을 것입니다. + +모든 리뷰어들이 PR 상태에 만족하면 마지막으로 `.from_pretrained()` 호출에서 `from_pt=True` 플래그를 제거하는 것입니다. TensorFlow 가중치가 없기 때문에 이를 추가해야 합니다! 이를 수행하는 방법은 아래 섹션의 지침을 확인하세요. + +마침내 TensorFlow 가중치가 병합되고, 적어도 3명의 리뷰어 승인을 받았으며 모든 CI 검사가 통과되었다면, 로컬로 테스트를 한 번 더 확인하세요. + +```bash +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +``` + +그리고 우리는 당신의 PR을 병합할 것입니다! 마일스톤 달성을 축하드립니다! 🎉 + +**7. (선택 사항) 데모를 만들고 세상과 공유하기** + +오픈 소스의 가장 어려운 부분 중 하나는 발견입니다. 다른 사용자들이 당신의 멋진 TensorFlow 기여를 어떻게 알 수 있을까요? 물론 적절한 커뮤니케이션으로 가능합니다! 📣 + +커뮤니티와 모델을 공유하는 두 가지 주요 방법이 있습니다: +- 데모 만들기. Gradio 데모, 노트북 및 모델을 자랑하는 다른 재미있는 방법을 포함합니다. [커뮤니티 기반 데모](https://huggingface.co/docs/transformers/community)에 노트북을 추가하는 것을 적극 권장합니다. +- Twitter와 LinkedIn과 같은 소셜 미디어에 이야기 공유하기. 당신의 작업에 자랑스러워하고 커뮤니티와 당신의 업적을 공유해야 합니다. 이제 당신의 모델은 전 세계의 수천 명의 엔지니어와 연구원들에 의해 사용될 수 있습니다 🌍! 우리는 당신의 게시물을 리트윗하고 커뮤니티와 함께 당신의 작업을 공유하는 데 도움이 될 것입니다. + + +## 🤗 허브에 TensorFlow 가중치 추가하기 [[adding-tensorFlow-weights-to-🤗-hub]] + +TensorFlow 모델 아키텍처가 🤗 Transformers에서 사용 가능하다고 가정하고, PyTorch 가중치를 TensorFlow 가중치로 변환하는 것은 쉽습니다! + +다음은 그 방법입니다: +1. 터미널에서 Hugging Face 계정으로 로그인되어 있는지 확인하십시오. `huggingface-cli login` 명령어를 사용하여 로그인할 수 있습니다. (액세스 토큰은 [여기](https://huggingface.co/settings/tokens)에서 찾을 수 있습니다.) +2. `transformers-cli pt-to-tf --model-name foo/bar`를 실행하십시오. 여기서 `foo/bar`는 변환하려는 PyTorch 가중치가 있는 모델 저장소의 이름입니다. +3. 방금 만든 🤗 허브 PR에서 `@joaogante`와 `@Rocketknight1`을 태그합니다. + +그게 다입니다! 🎉 + + +## ML 프레임워크 간 디버깅 🐛[[debugging-mismatches-across-ml-frameworks]] + +새로운 아키텍처를 추가하거나 기존 아키텍처에 대한 TensorFlow 가중치를 생성할 때, PyTorch와 TensorFlow 간의 불일치로 인한 오류가 발생할 수 있습니다. 심지어 두 프레임워크의 모델 아키텍처 코드가 동일해 보일 수도 있습니다. 무슨 일이 벌어지고 있는 걸까요? 🤔 + +먼저, 이러한 불일치를 이해하는 이유에 대해 이야기해 보겠습니다. 많은 커뮤니티 멤버들은 🤗 Transformers 모델을 그대로 사용하고, 우리의 모델이 예상대로 작동할 것이라고 믿습니다. 두 프레임워크 간에 큰 불일치가 있으면 모델이 적어도 하나의 프레임워크에 대한 참조 구현을 따르지 않음을 의미합니다. 이는 모델이 의도한 대로 작동하지 않을 수 있음을 나타냅니다. 이는 아예 실행되지 않는 모델보다 나쁠 수 있습니다! 따라서 우리는 모든 모델의 프레임워크 불일치를 `1e-5`보다 작게 유지하는 것을 목표로 합니다. + +기타 숫자 문제와 마찬가지로, 세세한 문제가 있습니다. 그리고 세세함에 집중하는 공정에서 필수 요소는 인내심입니다. 이러한 종류의 문제가 발생할 때 권장되는 작업 흐름은 다음과 같습니다: +1. 불일치의 원인을 찾아보십시오. 변환 중인 모델은 아마도 특정 지점까지 거의 동일한 내부 변수를 가지고 있을 것입니다. 두 프레임워크의 아키텍처에 `breakpoint()` 문을 넣고, 위에서 아래로 숫자 변수의 값을 비교하여 문제의 근원을 찾아냅니다. +2. 이제 문제의 근원을 찾았으므로 🤗 Transformers 팀에 연락하세요. 우리는 비슷한 문제를 이전에 겪었을 수 있으며 빠르게 해결책을 제공할 수 있습니다. 예외적인 경우에는 StackOverflow와 GitHub 이슈와 같은 인기있는 페이지를 확인하십시오. +3. 더 이상 해결책이 없는 경우, 더 깊이 들어가야 합니다. 좋은 소식은 문제의 원인을 찾았으므로 나머지 모델을 추상화하고 문제가 있는 명령어에 초점을 맞출 수 있습니다! 나쁜 소식은 해당 명령어의 소스 구현에 대해 알아봐야 한다는 것입니다. 일부 경우에는 참조 구현에 문제가 있을 수도 있으니 업스트림 저장소에서 이슈를 열기를 꺼리지 마십시오. 
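As an illustration of the top-to-bottom comparison described above, the following sketch loads the same checkpoint in PyTorch and TensorFlow and prints the maximum absolute difference of every hidden state. It uses `bert-base-uncased` purely as a stand-in — for a new architecture you would instantiate your own `BrandNewBert`/`TFBrandNewBert` classes instead — and the tolerance that matters depends on your model.

```python
import numpy as np
from transformers import AutoTokenizer, AutoModel, TFAutoModel

checkpoint = "bert-base-uncased"  # stand-in checkpoint for the example
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
pt_model = AutoModel.from_pretrained(checkpoint)
tf_model = TFAutoModel.from_pretrained(checkpoint, from_pt=True)

text = "Debugging cross-framework mismatches"
pt_inputs = tokenizer(text, return_tensors="pt")
tf_inputs = tokenizer(text, return_tensors="tf")

pt_hidden = pt_model(**pt_inputs, output_hidden_states=True).hidden_states
tf_hidden = tf_model(**tf_inputs, output_hidden_states=True).hidden_states

# Compare layer by layer, top to bottom, to locate where the two frameworks diverge
for i, (pt_h, tf_h) in enumerate(zip(pt_hidden, tf_hidden)):
    max_diff = np.abs(pt_h.detach().numpy() - tf_h.numpy()).max()
    print(f"hidden state {i}: max abs diff = {max_diff:.2e}")
```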
+ +어떤 경우에는 🤗 Transformers 팀과의 토론을 통해 불일치를 수정할 수 없을 수도 있습니다. 모델의 출력 레이어에서 불일치가 매우 작지만 숨겨진 상태에서 크게 나타날 수 있기 때문입니다. 이 경우 모델을 배포하는 것을 우선시하기 위해 불일치를 무시하기로 결정할 수도 있습니다. 위에서 언급한 `pt-to-tf` CLI에는 가중치 변환 시 오류 메시지를 무시하는 `--max-error` 플래그가 있습니다. diff --git a/docs/source/ko/attention.md b/docs/source/ko/attention.md new file mode 100644 index 000000000000..8f82a4b851e4 --- /dev/null +++ b/docs/source/ko/attention.md @@ -0,0 +1,54 @@ + + +# 어텐션 메커니즘[[attention_mechanisms]] + +대부분의 트랜스포머 모델은 정방행렬인 전체 어텐션을 사용합니다. +하지만 이는 긴 텍스트를 다룰 때는 큰 계산 병목 현상을 유발할 수 있습니다. +`Longformer`와 `Reformer`는 훈련 속도를 높이기 위해 어텐션 행렬의 희소 버전을 사용하여 효율을 높이려는 모델입니다. + +## LSH 어텐션[[lsh_attention]] + + +[Reformer](#reformer)는 LSH(Locality Sensitive Hashing) 어텐션을 사용합니다. softmax(QK^t)에서는 행렬 QK^t의 (softmax 차원에서) 가장 큰 요소들만 유용한 기여를 할 것입니다. +따라서 각각의 쿼리 q에 대해, q와 가까운 키 k만 고려할 수 있습니다. 해시 함수는 q와 k가 가까운지 여부를 결정하는 데 사용됩니다. +어텐션 마스크는 현재 토큰을 마스킹하여 변경됩니다. 이 때 첫 번째 위치의 토큰은 제외합니다. 왜냐하면 쿼리와 키가 동일한 값을 갖게 되기 때문입니다(서로 매우 유사함). +해시는 약간의 무작위성을 가질 수 있으므로, 실제로는 여러 개의 해시 함수가 사용되고 (`n_rounds` 매개변수에 의해 결정됨) 그 후에 평균값을 취하게 됩니다. + +## 지역 어텐션[[local_attention]] + +[Longformer](#longformer)는 지역 어텐션을 사용합니다. 종종 특정 토큰에 대해 지역 컨텍스트(예: 왼쪽과 오른쪽에 있는 두 개의 토큰은 무엇인가요?)만으로도 작업을 수행하는데 충분합니다. +또한 작은 창(window)을 가진 어텐션 레이어를 쌓음으로써 마지막 레이어는 창 내의 토큰뿐만 아니라 더 많은 수의 토큰에 대한 수용 영역(receptive field)을 갖게 되어 전체 문장의 표현을 구축할 수 있습니다. + +사전에 선택된 일부 입력 토큰들은 전역 어텐션을 받습니다. 이 몇 개의 토큰에 대해서는 어텐션 행렬이 모든 토큰에 접근할 수 있으며, 이 과정은 대칭적으로 이루어집니다. +다른 모든 토큰들은 로컬 창 내의 토큰들에 더해 해당 특정 토큰들에도 접근할 수 있습니다. 이는 논문의 Figure 2d에서 나타나며, 아래에 샘플 어텐션 마스크가 제시되어 있습니다: + + +
+ +
+ + +적은 파라미터의 어텐션 행렬을 사용하면 모델이 더 큰 시퀀스 입력 길이를 가질 수 있습니다. + +## 다른 방법들[[other_tricks]] + +### 축별 위치 인코딩[[axial_positional_encodings]] + +[Reformer](#reformer)는 축별 위치 인코딩(axial positional encodings)을 사용합니다. 기존의 트랜스포머 모델에서는 위치 인코딩 행렬 E는 크기가 \\(l \times d\\)인 행렬이며, +여기서 \\(l\\)은 시퀀스 길이(sequence length)이고 \\(d\\)는 숨겨진 상태(hidden state)의 차원입니다. 매우 긴 텍스트의 경우, 이 행렬은 매우 크며 GPU 상에서 공간을 많이 차지할 수 있습니다. +이를 완화하기 위해, 축별 위치 인코딩은 큰 행렬 E를 두 개의 작은 행렬 E1과 E2로 분해합니다. 이때 E1의 크기는 \\(l_{1} \times d_{1}\\)이고, E2의 크기는 \\(l_{2} \times d_{2}\\)입니다. +이때 \\(l_{1} \times l_{2} = l\\)이고 \\(d_{1} + d_{2} = d\\)(길이에 대한 곱셈 연산을 사용하면 훨씬 작아집니다). E의 시간 단계 j에 대한 임베딩은 E1에서 시간 단계 \\(j \% l1\\)의 임베딩과 E2에서 시간 단계 \\(j // l1\\)의 임베딩을 연결하여 얻습니다. \ No newline at end of file diff --git a/docs/source/ko/autoclass_tutorial.md b/docs/source/ko/autoclass_tutorial.md new file mode 100644 index 000000000000..9ecfd9c2015d --- /dev/null +++ b/docs/source/ko/autoclass_tutorial.md @@ -0,0 +1,144 @@ + + +# AutoClass로 사전 학습된 인스턴스 로드[[load-pretrained-instances-with-an-autoclass]] + +트랜스포머 아키텍처가 매우 다양하기 때문에 체크포인트에 맞는 아키텍처를 생성하는 것이 어려울 수 있습니다. 라이브러리를 쉽고 간단하며 유연하게 사용하기 위한 Transformer 핵심 철학의 일환으로, `AutoClass`는 주어진 체크포인트에서 올바른 아키텍처를 자동으로 추론하여 로드합니다. `from_pretrained()` 메서드를 사용하면 모든 아키텍처에 대해 사전 학습된 모델을 빠르게 로드할 수 있으므로 모델을 처음부터 학습하는 데 시간과 리소스를 투입할 필요가 없습니다. +체크포인트에 구애받지 않는 코드를 생성한다는 것은 코드가 한 체크포인트에서 작동하면 아키텍처가 다르더라도 다른 체크포인트(유사한 작업에 대해 학습된 경우)에서도 작동한다는 것을 의미합니다. + + + +아키텍처는 모델의 골격을 의미하며 체크포인트는 주어진 아키텍처에 대한 가중치입니다. 예를 들어, [BERT](https://huggingface.co/bert-base-uncased)는 아키텍처이고, `bert-base-uncased`는 체크포인트입니다. 모델은 아키텍처 또는 체크포인트를 의미할 수 있는 일반적인 용어입니다. + + + +이 튜토리얼에서는 다음을 학습합니다: + +* 사전 학습된 토크나이저 로드하기. +* 사전 학습된 이미지 프로세서 로드하기. +* 사전 학습된 특징 추출기 로드하기. +* 사전 훈련된 프로세서 로드하기. +* 사전 학습된 모델 로드하기. + +## AutoTokenizer[[autotokenizer]] + +거의 모든 NLP 작업은 토크나이저로 시작됩니다. 토크나이저는 사용자의 입력을 모델에서 처리할 수 있는 형식으로 변환합니다. +[`AutoTokenizer.from_pretrained`]로 토크나이저를 로드합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +그리고 아래와 같이 입력을 토큰화합니다: + +```py +>>> sequence = "In a hole in the ground there lived a hobbit." +>>> print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +## AutoImageProcessor[[autoimageprocessor]] + +비전 작업의 경우 이미지 프로세서가 이미지를 올바른 입력 형식으로 처리합니다. + +```py +>>> from transformers import AutoImageProcessor + +>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + + +## AutoFeatureExtractor[[autofeatureextractor]] + +오디오 작업의 경우 특징 추출기가 오디오 신호를 올바른 입력 형식으로 처리합니다. + +[`AutoFeatureExtractor.from_pretrained`]로 특징 추출기를 로드합니다: + +```py +>>> from transformers import AutoFeatureExtractor + +>>> feature_extractor = AutoFeatureExtractor.from_pretrained( +... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" +... ) +``` + +## AutoProcessor[[autoprocessor]] + +멀티모달 작업에는 두 가지 유형의 전처리 도구를 결합한 프로세서가 필요합니다. 예를 들어 LayoutLMV2 모델에는 이미지를 처리하는 이미지 프로세서와 텍스트를 처리하는 토크나이저가 필요하며, 프로세서는 이 두 가지를 결합합니다. + +[`AutoProcessor.from_pretrained()`]로 프로세서를 로드합니다: + +```py +>>> from transformers import AutoProcessor + +>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") +``` + +## AutoModel[[automodel]] + + + +마지막으로 AutoModelFor클래스를 사용하면 주어진 작업에 대해 미리 학습된 모델을 로드할 수 있습니다 (사용 가능한 작업의 전체 목록은 [여기](model_doc/auto)를 참조하세요). 
예를 들어, [`AutoModelForSequenceClassification.from_pretrained`]를 사용하여 시퀀스 분류용 모델을 로드할 수 있습니다: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +동일한 체크포인트를 쉽게 재사용하여 다른 작업에 아키텍처를 로드할 수 있습니다: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + + + +PyTorch모델의 경우 `from_pretrained()` 메서드는 내부적으로 피클을 사용하여 안전하지 않은 것으로 알려진 `torch.load()`를 사용합니다. +일반적으로 신뢰할 수 없는 소스에서 가져왔거나 변조되었을 수 있는 모델은 로드하지 마세요. 허깅 페이스 허브에서 호스팅되는 공개 모델의 경우 이러한 보안 위험이 부분적으로 완화되며, 각 커밋 시 멀웨어를 [검사합니다](https://huggingface.co/docs/hub/security-malware). GPG를 사용해 서명된 [커밋 검증](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg)과 같은 모범사례는 [문서](https://huggingface.co/docs/hub/security)를 참조하세요. + +텐서플로우와 Flax 체크포인트는 영향을 받지 않으며, `from_pretrained`메서드에 `from_tf` 와 `from_flax` 키워드 가변 인자를 사용하여 이 문제를 우회할 수 있습니다. + + + +일반적으로 AutoTokenizer 클래스와 AutoModelFor 클래스를 사용하여 미리 학습된 모델 인스턴스를 로드하는 것이 좋습니다. 이렇게 하면 매번 올바른 아키텍처를 로드할 수 있습니다. 다음 [튜토리얼](preprocessing)에서는 새롭게 로드한 토크나이저, 이미지 프로세서, 특징 추출기를 사용하여 미세 튜닝용 데이터 세트를 전처리하는 방법에 대해 알아봅니다. + + +마지막으로 `TFAutoModelFor` 클래스를 사용하면 주어진 작업에 대해 사전 훈련된 모델을 로드할 수 있습니다. (사용 가능한 작업의 전체 목록은 [여기](model_doc/auto)를 참조하세요. 예를 들어, [`TFAutoModelForSequenceClassification.from_pretrained`]로 시퀀스 분류를 위한 모델을 로드합니다: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +쉽게 동일한 체크포인트를 재사용하여 다른 작업에 아키텍처를 로드할 수 있습니다: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +``` + +일반적으로, `AutoTokenizer`클래스와 `TFAutoModelFor` 클래스를 사용하여 미리 학습된 모델 인스턴스를 로드하는 것이 좋습니다. 이렇게 하면 매번 올바른 아키텍처를 로드할 수 있습니다. 다음 [튜토리얼](preprocessing)에서는 새롭게 로드한 토크나이저, 이미지 프로세서, 특징 추출기를 사용하여 미세 튜닝용 데이터 세트를 전처리하는 방법에 대해 알아봅니다. + + diff --git a/docs/source/ko/bertology.md b/docs/source/ko/bertology.md new file mode 100644 index 000000000000..7b4f3dc4c493 --- /dev/null +++ b/docs/source/ko/bertology.md @@ -0,0 +1,41 @@ + + +# BERTology + +BERT와 같은 대규모 트랜스포머의 내부 동작을 조사하는 연구 분야가 점점 더 중요해지고 있습니다. +혹자는 "BERTology"라 칭하기도 합니다. 이 분야의 좋은 예시는 다음과 같습니다: + + +- BERT는 고전적인 NLP 파이프라인의 재발견 - Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- 16개의 헤드가 정말로 1개보다 나은가? - Paul Michel, Omer Levy, Graham Neubig: + https://arxiv.org/abs/1905.10650 +- BERT는 무엇을 보는가? BERT의 어텐션 분석 - Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: + https://arxiv.org/abs/1906.04341 +- CAT-probing: 프로그래밍 언어에 대해 사전훈련된 모델이 어떻게 코드 구조를 보는지 알아보기 위한 메트릭 기반 접근 방법: + https://arxiv.org/abs/2210.04633 + +우리는 이 새로운 연구 분야의 발전을 돕기 위해, BERT/GPT/GPT-2 모델에 내부 표현을 살펴볼 수 있는 몇 가지 기능을 추가했습니다. +이 기능들은 주로 Paul Michel의 훌륭한 작업을 참고하여 개발되었습니다 +(https://arxiv.org/abs/1905.10650): + + +- BERT/GPT/GPT-2의 모든 은닉 상태에 접근하기, +- BERT/GPT/GPT-2의 각 헤드의 모든 어텐션 가중치에 접근하기, +- 헤드의 출력 값과 그래디언트를 검색하여 헤드 중요도 점수를 계산하고 https://arxiv.org/abs/1905.10650에서 설명된 대로 헤드를 제거하는 기능을 제공합니다. + +이러한 기능들을 이해하고 직접 사용해볼 수 있도록 [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) 예제 스크립트를 추가했습니다. 이 예제 스크립트에서는 GLUE에 대해 사전훈련된 모델에서 정보를 추출하고 모델을 가지치기(prune)해봅니다. 
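As a minimal illustration of the hooks described above (and not a substitute for the full `bertology.py` script), hidden states and attention weights can be requested directly when loading a model; the checkpoint name below is only an example.

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained(
    "bert-base-uncased", output_hidden_states=True, output_attentions=True
)

inputs = tokenizer("BERTology looks inside the model.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One hidden-state tensor per layer (plus the embedding output), and one attention
# tensor per layer with shape (batch, num_heads, seq_len, seq_len)
print(len(outputs.hidden_states), outputs.hidden_states[-1].shape)
print(len(outputs.attentions), outputs.attentions[0].shape)
```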
diff --git a/docs/source/ko/community.md b/docs/source/ko/community.md new file mode 100644 index 000000000000..2d12e9de4a28 --- /dev/null +++ b/docs/source/ko/community.md @@ -0,0 +1,69 @@ + + +# 커뮤니티 [[community]] + +이 페이지는 커뮤니티에서 개발한 🤗 Transformers 리소스를 재구성한 페이지입니다. + +## 커뮤니티 리소스: [[community-resources]] + +| 리소스 | 설명 | 만든이 | +|:----------|:-------------|------:| +| [Hugging Face Transformers 용어집 플래시카드](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | [Transformers 문서 용어집](glossary)을 기반으로 한 플래시카드 세트로, 지식을 장기적으로 유지하기 위해 특별히 설계된 오픈소스 크로스 플랫폼 앱인 [Anki](https://apps.ankiweb.net/)를 사용하여 쉽게 학습/수정할 수 있는 형태로 제작되었습니다. [플래시카드 사용법에 대한 소개 동영상](https://www.youtube.com/watch?v=Dji_h7PILrw)을 참조하세요. | [Darigov 리서치](https://www.darigovresearch.com/) | + +## 커뮤니티 노트북: [[community-notebooks]] + +| 노트북 | 설명 | 만든이 | | +|:----------|:-------------|:-------------|------:| +| [가사를 생성하기 위해 사전훈련된 트랜스포머를 미세 조정하기](https://github.com/AlekseyKorshuk/huggingartists) | GPT-2 모델을 미세 조정하여 좋아하는 아티스트의 스타일로 가사를 생성하는 방법 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Tensorflow 2로 T5 훈련하기](https://github.com/snapthat/TF-T5-text-to-text) | Tensorflow 2를 사용하여 T5를 훈련시키는 방법. 이 노트북은 Tensorflow 2로 SQUAD를 사용하여 구현한 질의응답 작업을 보여줍니다. | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [TPU에서 T5 훈련하기](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | Transformers와 Nlp를 사용하여 SQUAD로 T5를 훈련하는 방법 | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [분류 및 객관식 문제를 위해 T5 미세 조정하기](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | 분류 및 객관식 문제에 맞게 텍스트-텍스트 형식을 사용하여 PyTorch Lightning으로 T5를 미세 조정하는 방법 | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [새로운 데이터 세트와 언어로 DialoGPT 미세 조정하기](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | 자유 대화형 챗봇을 만들기 위해 새로운 데이터 세트로 DialoGPT 모델을 미세 조정하는 방법 | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Reformer로 긴 시퀀스 모델링하기](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | Reformer로 최대 50만 토큰의 시퀀스를 훈련하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [요약을 위해 BART 미세 조정하기](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | blurr를 사용하여 fastai로 요약하기 위해 BART를 미세 조정하는 방법 | [Wayde 
Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [다른 사람의 트윗으로 사전훈련된 트랜스포머 미세 조정하기](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | GPT-2 모델을 미세 조정하여 좋아하는 트위터 계정 스타일로 트윗을 생성하는 방법 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Weights & Biases로 🤗 Hugging Face 모델 최적화하기](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | W&B와 Hugging Face의 통합을 보여주는 전체 튜토리얼 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Longformer 사전훈련하기](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | 기존 사전훈련된 모델의 "긴" 버전을 빌드하는 방법 | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [QA를 위해 Longformer 미세 조정하기](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | QA 작업을 위해 Longformer를 미세 조정하는 방법 | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [🤗 Nlp로 모델 평가하기](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | `Nlp`로 TriviaQA에서 Longformer를 평가하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [감정 범위 추출을 위해 T5 미세 조정하기](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | 감정 범위 추출을 위해 텍스트-텍스트 형식을 사용하여 PyTorch Lightning으로 T5를 미세 조정하는 방법 | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [다중 클래스 분류를 위해 DistilBert 미세 조정하기](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | 다중 클래스 분류를 위해 PyTorch를 사용하여 DistilBert를 미세 조정하는 방법 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +| [다중 레이블 분류를 위해 BERT 미세 조정하기](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb) | 다중 레이블 분류를 위해 PyTorch를 사용하여 BERT를 미세 조정하는 방법 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +| [요약을 위해 T5 미세 조정하기](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb) | 요약을 위해 PyTorch로 T5를 미세 조정하고 WandB로 실험을 추적하는 방법 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +| [동적 패딩/버켓팅으로 Transformers 미세 조정 속도 높이기](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)| 동적 패딩/버켓팅을 사용하여 미세 조정 속도를 2배로 높이는 방법 |[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[마스킹된 언어 모델링을 위해 Reformer 사전훈련하기](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| 양방향 셀프 어텐션 레이어를 이용해서 Reformer 모델을 훈련하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +| [Sci-BERT 확장 및 미세 조정하기](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| CORD 데이터 세트로 AllenAI에서 사전훈련된 SciBERT 모델의 어휘를 늘리고 파이프라인을 구축하는 방법 | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +| [요약을 위해 Trainer API로 BlenderBotSmall 미세 조정하기](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| 요약을 위해 Trainer API를 사용하여 사용자 지정 데이터 세트로 BlenderBotSmall 미세 조정하기 | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +| [통합 기울기(Integrated Gradient)를 이용하여 Electra 미세 조정하고 해석하기](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | 감정 분석을 위해 Electra를 미세 조정하고 Captum 통합 기울기로 예측을 해석하는 방법 | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +| [Trainer 클래스로 비영어권 GPT-2 모델 미세 조정하기](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | Trainer 클래스로 비영어권 GPT-2 모델을 미세 조정하는 방법 | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[다중 라벨 분류 작업을 위해 DistilBERT 모델 미세 조정하기](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | 다중 라벨 분류 작업을 위해 DistilBERT 모델을 미세 조정하는 방법 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[문장쌍 분류를 위해 ALBERT 미세 조정하기](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | 문장쌍 분류 작업을 위해 ALBERT 모델 또는 다른 BERT 기반 모델을 미세 조정하는 방법 | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[감정 분석을 위해 Roberta 미세 조정하기](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | 감정 분석을 위해 Roberta 모델을 미세 조정하는 방법 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[질문 생성 모델 평가하기](https://github.com/flexudy-pipe/qugeev) | seq2seq 트랜스포머 모델이 생성한 질문과 이에 대한 답변이 얼마나 정확한가요? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[DistilBERT와 Tensorflow로 텍스트 분류하기](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | 텍스트 분류를 위해 TensorFlow로 DistilBERT를 미세 조정하는 방법 | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[CNN/Dailail 요약을 위해 인코더-디코더 모델에 BERT 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | CNN/Dailail 요약을 위해 *bert-base-uncased* 체크포인트를 활용하여 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[BBC XSum 요약을 위해 인코더-디코더 모델에 RoBERTa 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | BBC/XSum 요약을 위해 *roberta-base* 체크포인트를 활용하여 공유 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[순차적 질문 답변(SQA)을 위해 TAPAS 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | *tapas-base* 체크포인트를 활용하여 순차적 질문 답변(SQA) 데이터 세트로 *TapasForQuestionAnswering*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[표 사실 검사(TabFact)로 TAPAS 평가하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | 🤗 Datasets와 🤗 Transformer 라이브러리를 함께 사용하여 *tapas-base-finetuned-tabfact* 체크포인트로 미세 조정된 *TapasForSequenceClassification*을 평가하는 방법 | [Niels 
Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[번역을 위해 mBART 미세 조정하기](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | 힌디어에서 영어로 번역하기 위해 Seq2SeqTrainer를 사용하여 mBART를 미세 조정하는 방법 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[FUNSD(양식 이해 데이터 세트)로 LayoutLM 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | 스캔한 문서에서 정보 추출을 위해 FUNSD 데이터 세트로 *LayoutLMForTokenClassification*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[DistilGPT2 미세 조정하고 및 텍스트 생성하기](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | DistilGPT2를 미세 조정하고 텍스트를 생성하는 방법 | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[최대 8K 토큰에서 LED 미세 조정하기](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | 긴 범위를 요약하기 위해 PubMed로 LED를 미세 조정하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Arxiv로 LED 평가하기](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | 긴 범위 요약에 대해 LED를 효과적으로 평가하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[RVL-CDIP(문서 이미지 분류 데이터 세트)로 LayoutLM 미세 조정하기)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | 스캔 문서 분류를 위해 RVL-CDIP 데이터 세트로 *LayoutLMForSequenceClassification*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[GPT2 조정을 통한 Wav2Vec2 CTC 디코딩](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | 언어 모델 조정을 통해 CTC 시퀀스를 디코딩하는 방법 | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Trainer 클래스로 두 개 언어로 요약하기 위해 BART 미세 
조정하기](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | Trainer 클래스로 두 개 언어로 요약하기 위해 BART 미세 조정하는 방법 | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Trivia QA로 Big Bird 평가하기](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | Trivia QA로 긴 문서 질문에 대한 답변에 대해 BigBird를 평가하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Wav2Vec2를 사용하여 동영상 캡션 만들기](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | Wav2Vec으로 오디오를 텍스트로 변환하여 모든 동영상에서 YouTube 캡션 만드는 방법 | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [PyTorch Lightning을 사용하여 CIFAR-10으로 비전 트랜스포머 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | HuggingFace Transformers, Datasets, PyTorch Lightning을 사용하여 CIFAR-10으로 비전 트랜스포머(ViT)를 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [🤗 Trainer를 사용하여 CIFAR-10에서 비전 트랜스포머 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | Datasets, 🤗 Trainer를 사용하여 CIFAR-10에서 비전 트랜스포머(ViT)를 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [개체 입력 데이터 세트인 Open Entity로 LUKE 평가하기](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | Open Entity 데이터 세트로 *LukeForEntityClassification*을 평가하는 방법 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [관계 추출 데이터 세트인 TACRED로 LUKE 평가하기](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | TACRED 데이터 세트로 *LukeForEntityPairClassification*을 평가하는 방법 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [중요 NER 벤치마크인 CoNLL-2003으로 LUKE 평가하기](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | CoNLL-2003 데이터 세트로 *LukeForEntitySpanClassification*를 평가하는 방법 | [Ikuya 
Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [PubMed 데이터 세트로 BigBird-Pegasus 평가하기](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | PubMed 데이터 세트로 *BigBirdPegasusForConditionalGeneration*를 평가하는 방법 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Wav2Vec2를 사용해서 음성 감정 분류하기](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | 감정 분류를 위해 사전훈련된 Wav2Vec2 모델을 MEGA 데이터 세트에 활용하는 방법 | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [DETR로 이미지에서 객체 탐지하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | 훈련된 *DetrForObjectDetection* 모델을 사용하여 이미지에서 객체를 탐지하고 어텐션을 시각화하는 방법 | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [사용자 지정 객체 탐지 데이터 세트로 DETR 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | 사용자 지정 객체 탐지 데이터 세트로 *DetrForObjectDetection*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [개체명 인식을 위해 T5 미세 조정하기](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | 개체명 인식 작업을 위해 *T5*를 미세 조정하는 방법 | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | diff --git a/docs/source/ko/contributing.md b/docs/source/ko/contributing.md new file mode 100644 index 000000000000..0f37c2b09265 --- /dev/null +++ b/docs/source/ko/contributing.md @@ -0,0 +1,332 @@ + + +# 🤗 Transformers에 기여하기 [[contribute-to-transformers]] + +누구나 🤗 Transformers에 기여할 수 있으며, 우리는 모든 사람의 기여를 소중히 생각합니다. 코드 기여는 커뮤니티를 돕는 유일한 방법이 아닙니다. 질문에 답하거나 다른 사람을 도와 문서를 개선하는 것도 매우 가치가 있습니다. + +🤗 Transformers를 널리 알리는 것도 큰 도움이 됩니다! 멋진 프로젝트들을 가능하게 한 🤗 Transformers 라이브러리에 대해 블로그 게시글에 언급하거나, 도움이 되었을 때마다 Twitter에 알리거나, 저장소에 ⭐️ 를 표시하여 감사 인사를 전해주세요. + +어떤 방식으로 기여하든 [행동 규칙](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md)을 숙지하고 존중해주세요. + +**이 안내서는 멋진 [scikit-learn 기여 안내서](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md)에서 큰 영감을 받았습니다.** + +## 기여하는 방법 [[ways-to-contribute]] + +여러 가지 방법으로 🤗 Transformers에 기여할 수 있습니다: + +* 기존 코드의 미해결된 문제를 수정합니다. +* 버그 또는 새로 추가되길 원하는 기능과 관련된 이슈를 제출합니다. +* 새로운 모델을 구현합니다. +* 예제나 문서에 기여합니다. 
+ +어디서부터 시작할지 모르겠다면, [Good First Issue](https://github.com/huggingface/transformers/contribute) 목록을 확인해보세요. 이 목록은 초보자도 참여하기 쉬운 오픈 이슈 목록을 제공하며, 당신이 오픈소스에 처음으로 기여하는 데 큰 도움이 될 것입니다. 그저 작업하고 싶은 이슈에 댓글만 달아주면 됩니다. + +조금 더 도전적인 작업을 원한다면, [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) 목록도 확인해보세요. 이미 당신이 잘 하고 있다고 생각되더라도, 한 번 시도해보세요! 우리도 여러분을 도울 것입니다. 🚀 + +> 커뮤니티에 이루어지는 모든 기여는 똑같이 소중합니다. 🥰 + +## 미해결된 문제 수정하기 [[fixing-outstanding-issues]] + +기존 코드에서 발견한 문제점에 대한 해결책이 떠오른 경우, 언제든지 [기여를 시작](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#create-a-pull-request)하고 Pull Request를 생성해주세요! + +## 버그 관련 이슈를 제기하거나 새로운 기능 요청하기 [[submitting-a-bugrelated-issue-or-feature-request]] + +버그 관련 이슈를 제기하거나 새로운 기능을 요청할 때는 다음 가이드라인을 최대한 준수해주세요. 이렇게 하면 좋은 피드백과 함께 빠르게 답변해 드릴 수 있습니다. + +### 버그를 발견하셨나요? [[did-you-find-a-bug]] + +🤗 Transformers 라이브러리는 사용 중에 겪는 문제를 보고해주는 사용자들 덕분에 더욱 견고해지고 신뢰할 수 있게 되었습니다. + +이슈를 보고하기 전에, 버그가 이미 **보고되지 않았는지** 확인해주세요. (GitHub의 이슈 탭 아래의 검색 바를 사용하세요). 이슈는 라이브러리 자체에서 발생한 버그어야 하며, 코드의 다른 부분과 관련된 것이 아니어야 합니다. 버그가 라이브러리의 문제로 발생하였는지 확실하지 않은 경우 먼저 [포럼](https://discuss.huggingface.co/)에서 질문해 주세요. 이렇게 하면 일반적인 질문보다 라이브러리와 관련된 문제를 더 빠르게 해결할 수 있습니다. + +버그가 이미 보고되지 않았다는 것을 확인했다면, 다음 정보를 포함하여 이슈를 제출해 주세요. 그러면 우리가 빠르게 해결할 수 있습니다: + +* 사용 중인 **운영체제 종류와 버전**, 그리고 **Python**, **PyTorch** 또는 **TensorFlow** 버전. +* 버그를 30초 이내로 재현할 수 있는 간단하고 독립적인 코드 스니펫. +* 예외가 발생한 경우 *전체* 트레이스백. +* 스크린샷과 같이 도움이 될 것으로 생각되는 추가 정보를 첨부해 주세요. + +운영체제와 소프트웨어 버전을 자동으로 가져오려면 다음 명령을 실행하세요: + +```bash +transformers-cli env +``` + +저장소의 루트 디렉터리에서도 같은 명령을 실행할 수 있습니다: + +```bash +python src/transformers/commands/transformers_cli.py env +``` + + +### 새로운 기능을 원하시나요? [[do-you-want-a-new-feature]] + +🤗 Transformers에서 사용하고 싶은 새로운 기능이 있다면, 다음 내용을 포함하여 이슈를 제출해 주세요: + +1. 이 기능이 필요한 *이유*는 무엇인가요? 라이브러리에 대한 문제나 불만과 관련이 있나요? 프로젝트에 필요한 기능인가요? 커뮤니티에 도움이 될 만한 기능인가요? + + 어떤 내용이든 여러분의 이야기를 듣고 싶습니다! + +2. 요청하는 기능을 최대한 자세히 설명해 주세요. 더 많은 정보를 제공할수록 더 나은 도움을 드릴 수 있습니다. +3. 해당 기능의 사용법을 보여주는 *코드 스니펫*을 제공해 주세요. +4. 기능과 관련된 논문이 있는 경우 링크를 포함해 주세요. + +이슈가 잘 작성되었다면 이슈가 생성된 순간, 이미 80% 정도의 작업이 완료된 것입니다. + +이슈를 제기하는 데 도움이 될 만한 [템플릿](https://github.com/huggingface/transformers/tree/main/templates)도 준비되어 있습니다. + +## 새로운 모델을 구현하고 싶으신가요? [[do-you-want-to-implement-a-new-model]] + +새로운 모델은 계속해서 출시됩니다. 만약 여러분이 새로운 모델을 구현하고 싶다면 다음 정보를 제공해 주세요. + +* 모델에 대한 간단한 설명과 논문 링크. +* 구현이 공개되어 있다면 구현 링크. +* 모델 가중치가 사용 가능하다면 가중치 링크. + +만약 모델을 직접 기여하고 싶으시다면, 알려주세요. 🤗 Transformers에 추가할 수 있도록 도와드리겠습니다! + +새로운 모델을 추가하는 방법에 대한 [상세 안내서와 템플릿](https://github.com/huggingface/transformers/tree/main/templates)을 제공하고 있으며, [🤗 Transformers에 새로운 모델을 추가하는 방법](https://huggingface.co/docs/transformers/add_new_model)에 대한 기술적인 안내서도 있습니다. + +## 문서를 추가하고 싶으신가요? [[do-you-want-to-add-documentation]] + +우리는 언제나 더 명확하고 정확한 문서를 제공하기 위하여 개선점을 찾고 있습니다. 오탈자나 부족한 내용, 분명하지 않거나 부정확한 내용 등을 알려주시면 개선하는 데 도움이 됩니다. 관심이 있으시다면 변경하거나 기여하실 수 있도록 도와드리겠습니다! + +문서를 생성, 빌드 및 작성하는 방법에 대한 자세한 내용은 [README](https://github.com/huggingface/transformers/tree/main/docs) 문서를 확인해 주세요. + +## 풀 리퀘스트(Pull Request) 생성하기 [[create-a-pull-request]] + +코드를 작성하기 전에 기존의 Pull Request나 이슈를 검색하여 누군가 이미 동일한 작업을 하고 있는지 확인하는 것이 좋습니다. 확실하지 않다면 피드백을 받기 위해 이슈를 열어보는 것이 좋습니다. + +🤗 Transformers에 기여하기 위해서는 기본적인 `git` 사용 능력이 필요합니다. `git`은 사용하기 쉬운 도구는 아니지만, 매우 훌륭한 매뉴얼을 제공합니다. 쉘(shell)에서 `git --help`을 입력하여 확인해보세요! 만약 책을 선호한다면, [Pro Git](https://git-scm.com/book/en/v2)은 매우 좋은 참고 자료가 될 것입니다. 
+ +🤗 Transformers에 기여하려면 **[Python 3.8]((https://github.com/huggingface/transformers/blob/main/setup.py#L426))** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요: + +1. 저장소 페이지에서 **[Fork](https://github.com/huggingface/transformers/fork)** 버튼을 클릭하여 저장소를 포크하세요. 이렇게 하면 코드의 복사본이 여러분의 GitHub 사용자 계정 아래에 생성됩니다. + +2. 포크한 저장소를 로컬 디스크로 클론하고, 기본 저장소를 원격(remote)으로 추가하세요: + + ```bash + git clone git@github.com:/transformers.git + cd transformers + git remote add upstream https://github.com/huggingface/transformers.git + ``` + +3. 개발 변경 사항을 저장할 새 브랜치를 생성하세요: + + ```bash + git checkout -b a-descriptive-name-for-my-changes + ``` + + 🚨 절대 `main` 브랜치에서 작업하지 **마세요!** + +4. 가상 환경에서 다음 명령을 실행하여 개발 환경을 설정하세요: + + ```bash + pip install -e ".[dev]" + ``` + + 만약 이미 가상 환경에 🤗 Transformers가 설치되어 있다면, `-e` 플래그를 사용하여 설치하기 전에 `pip uninstall transformers`로 제거해주세요. + + 여러분의 운영체제에 따라서, 그리고 🤗 Transformers의 선택적 의존성의 수가 증가하면서, 이 명령이 실패할 수도 있습니다. 그럴 경우 사용하려는 딥러닝 프레임워크(PyTorch, TensorFlow, 그리고/또는 Flax)를 설치한 후 아래 명령을 실행해주세요: + + ```bash + pip install -e ".[quality]" + ``` + + 대부분의 경우 이것으로 충분할 것입니다. + +5. 브랜치에서 기능을 개발하세요. + + 코드를 작업하는 동안 테스트 스위트(test suite)가 통과하는지 확인하세요. 다음과 같이 변경 사항에 영향을 받는 테스트를 실행하세요: + + ```bash + pytest tests/.py + ``` + + 테스트에 대한 더 많은 정보는 [테스트](https://huggingface.co/docs/transformers/testing) 가이드를 확인하세요. + + 🤗 Transformers는 `black`과 `ruff`를 사용하여 소스 코드의 형식을 일관되게 유지합니다. 변경 사항을 적용한 후에는 다음 명령으로 자동으로 스타일 교정 및 코드 검증을 수행하세요: + + ```bash + make fixup + ``` + + 이것은 또한 작업 중인 PR에서 수정한 파일에서만 작동하도록 최적화되어 있습니다. + + 검사를 하나씩 실행하려는 경우, 다음 명령으로 스타일 교정을 적용할 수 있습니다: + + ```bash + make style + ``` + + 🤗 Transformers는 또한 `ruff`와 몇 가지 사용자 정의 스크립트를 사용하여 코딩 실수를 확인합니다. CI를 통해 품질 관리가 수행되지만, 다음 명령으로 동일한 검사를 실행할 수 있습니다: + + ```bash + make quality + ``` + + 마지막으로, 새 모델을 추가할 때 일부 파일을 업데이트하는 것을 잊지 않도록 하기 위한 많은 스크립트가 있습니다. 다음 명령으로 이러한 스크립트를 실행할 수 있습니다: + + ```bash + make repo-consistency + ``` + + 이러한 검사에 대해 자세히 알아보고 관련 문제를 해결하는 방법은 [Pull Request에 대한 검사](https://huggingface.co/docs/transformers/pr_checks) 가이드를 확인하세요. + + 만약 `docs/source` 디렉터리 아래의 문서를 수정하는 경우, 문서가 빌드될 수 있는지 확인하세요. 이 검사는 Pull Request를 열 때도 CI에서 실행됩니다. 로컬 검사를 실행하려면 문서 빌더를 설치해야 합니다: + + ```bash + pip install ".[docs]" + ``` + + 저장소의 루트 디렉터리에서 다음 명령을 실행하세요: + + ```bash + doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build + ``` + + 이 명령은 `~/tmp/test-build` 폴더에 문서를 빌드하며, 생성된 Markdown 파일을 선호하는 편집기로 확인할 수 있습니다. Pull Request를 열 때 GitHub에서 문서를 미리 볼 수도 있습니다. + + 변경 사항에 만족하면 `git add`로 변경된 파일을 추가하고, `git commit`으로 변경 사항을 로컬에 기록하세요: + + ```bash + git add modified_file.py + git commit + ``` + + [좋은 커밋 메시지](https://chris.beams.io/posts/git-commit/)를 작성하여 변경 사항을 명확하게 전달하세요! + + 변경 사항을 프로젝트 원본 저장소와 동기화하려면, PR을 *열기 전에* 브랜치를 `upstream/branch`로 리베이스(rebase)하세요. 또는 관리자의 요청에 이 작업이 필요할 수 있습니다: + + ```bash + git fetch upstream + git rebase upstream/main + ``` + + 변경 사항을 브랜치에 푸시하세요: + + ```bash + git push -u origin a-descriptive-name-for-my-changes + ``` + + 이미 PR을 열었다면, `--force` 플래그와 함께 강제 푸시해야 합니다. 아직 PR이 열리지 않았다면 정상적으로 변경 사항을 푸시하면 됩니다. + +6. 이제 GitHub에서 포크한 저장소로 이동하고 **Pull request(풀 리퀘스트)**를 클릭하여 Pull Request를 열 수 있습니다. 아래의 [체크리스트](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist)에서 모든 항목에 체크 표시를 하세요. 준비가 완료되면 프로젝트 관리자에게 변경 사항을 보내 검토를 요청할 수 있습니다. + +7. 관리자가 변경 사항을 요청해도 괜찮습니다. 핵심 기여자들도 동일한 상황을 겪습니다! 모두가 변경 사항을 Pull Request에서 볼 수 있도록, 로컬 브랜치에서 작업하고 변경 사항을 포크한 저장소로 푸시하세요. 그러면 변경 사항이 자동으로 Pull Request에 나타납니다. + +### Pull Request 체크리스트 [[pull-request-checklist]] + +☐ Pull Request 제목은 기여 내용을 요약해야 합니다.
+☐ Pull Request가 이슈를 해결하는 경우, Pull Request 설명에 이슈 번호를 언급하여 연관되어 있음을 알려주세요. (이슈를 확인하는 사람들이 해당 이슈에 대한 작업이 진행 중임을 알 수 있게 합니다).
+☐ 작업이 진행 중이라면 제목 앞에 `[WIP]`를 붙여주세요. 중복 작업을 피하고 병합할 준비가 된 PR과 구분하기에 유용합니다.
+☐ 기존 테스트를 통과하는지 확인하세요.
+☐ 새로운 기능을 추가하는 경우, 해당 기능에 대한 테스트도 추가하세요.
+ - 새 모델을 추가하는 경우, `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`을 사용하여 일반적인 테스트를 활성화하세요. + - 새 `@slow` 테스트를 추가하는 경우, 다음 명령으로 테스트를 통과하는지 확인하세요: `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`. + - 새 토크나이저를 추가하는 경우, 테스트를 작성하고 다음 명령으로 테스트를 통과하는지 확인하세요: `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py`. + - CircleCI에서는 느린 테스트를 실행하지 않지만, GitHub Actions에서는 매일 밤 실행됩니다!
+ +☐ 모든 공개 메소드는 유용한 독스트링(docstring)을 가져야 합니다 (예를 들어 [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) 참조).
+☐ 저장소가 빠르게 성장하고 있으므로 저장소에 상당한 부담을 주는 이미지, 동영상 및 기타 텍스트가 아닌 파일은 추가하지 마세요. 대신 [`hf-internal-testing`](https://huggingface.co/hf-internal-testing)과 같은 Hub 저장소를 사용하여 이러한 파일을 호스팅하고 URL로 참조하세요. 문서와 관련된 이미지는 다음 저장소에 배치하는 것을 권장합니다: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images). 이 데이터셋 저장소에서 PR을 열어서 Hugging Face 멤버에게 병합을 요청할 수 있습니다. + +Pull Request에서 실행되는 검사에 대한 자세한 정보는 [Pull Request에 대한 검사](https://huggingface.co/docs/transformers/pr_checks) 가이드를 확인하세요. + +### 테스트 [[tests]] + +라이브러리 동작과 여러 예제를 테스트할 수 있는 광범위한 테스트 스위트가 포함되어 있습니다. 라이브러리 테스트는 [tests](https://github.com/huggingface/transformers/tree/main/tests) 폴더에, 예제 테스트는 [examples](https://github.com/huggingface/transformers/tree/main/examples) 폴더에 있습니다. + +속도가 빠른 `pytest`와 `pytest-xdist`를 선호합니다. 저장소의 루트 디렉터리에서 테스트를 실행할 *하위 폴더 경로 또는 테스트 파일 경로*를 지정하세요. + +```bash +python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +``` + +마찬가지로 `examples` 디렉터리에서도 *하위 폴더 경로 또는 테스트 파일 경로*를 지정하세요. 예를 들어, 다음 명령은 PyTorch `examples` 디렉터리의 텍스트 분류 하위 폴더를 테스트합니다: + +```bash +pip install -r examples/xxx/requirements.txt # only needed the first time +python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +``` + +이것이 실제로 `make test` 및 `make test-examples` 명령이 구현되는 방식입니다 (`pip install`은 제외합니다)! + +또한 특정 기능만 테스트하기 위한 더 작은 테스트를 지정할 수 있습니다. + +기본적으로 느린 테스트는 건너뛰지만 `RUN_SLOW` 환경 변수를 `yes`로 설정하여 실행할 수 있습니다. 이렇게 하면 많은 기가바이트 단위의 모델이 다운로드되므로 충분한 디스크 공간, 좋은 인터넷 연결과 많은 인내가 필요합니다! + + + +테스트를 실행하려면 *하위 폴더 경로 또는 테스트 파일 경로*를 지정하세요. 그렇지 않으면 `tests` 또는 `examples` 폴더의 모든 테스트를 실행하게 되어 매우 긴 시간이 걸립니다! + + + +```bash +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model +RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification +``` + +느린 테스트와 마찬가지로, 다음과 같이 테스트 중에 기본적으로 활성화되지 않는 다른 환경 변수도 있습니다: +- `RUN_CUSTOM_TOKENIZERS`: 사용자 정의 토크나이저 테스트를 활성화합니다. +- `RUN_PT_FLAX_CROSS_TESTS`: PyTorch + Flax 통합 테스트를 활성화합니다. +- `RUN_PT_TF_CROSS_TESTS`: TensorFlow + PyTorch 통합 테스트를 활성화합니다. + +더 많은 환경 변수와 추가 정보는 [testing_utils.py](src/transformers/testing_utils.py)에서 찾을 수 있습니다. + +🤗 Transformers는 테스트 실행기로 `pytest`를 사용합니다. 그러나 테스트 스위트 자체에서는 `pytest` 관련 기능을 사용하지 않습니다. + +이것은 `unittest`가 완전히 지원된다는 것을 의미합니다. 다음은 `unittest`로 테스트를 실행하는 방법입니다: + +```bash +python -m unittest discover -s tests -t . -v +python -m unittest discover -s examples -t examples -v +``` + +### 스타일 가이드 [[style-guide]] + +문서는 [Google Python 스타일 가이드](https://google.github.io/styleguide/pyguide.html)를 따릅니다. 자세한 정보는 [문서 작성 가이드](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)를 확인하세요. + +### Windows에서 개발 [[develop-on-windows]] + +Windows에서 개발할 경우([Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) 또는 WSL에서 작업하지 않는 한) Windows `CRLF` 줄 바꿈을 Linux `LF` 줄 바꿈으로 변환하도록 git을 구성해야 합니다: + +```bash +git config core.autocrlf input +``` + +Windows에서 `make` 명령을 실행하는 한 가지 방법은 MSYS2를 사용하는 것입니다: + +1. [MSYS2](https://www.msys2.org/)를 다운로드합니다. `C:\msys64`에 설치되었다고 가정합니다. +2. CLI에서 `C:\msys64\msys2.exe`를 엽니다 (시작 메뉴에서 사용 가능해야 함). +3. 쉘에서 다음을 실행하여: `pacman -Syu` 및 `pacman -S make`로 `make`를 설치합니다. +4. 환경 변수 PATH에 `C:\msys64\usr\bin`을 추가하세요. + +이제 모든 터미널 (Powershell, cmd.exe 등)에서 `make`를 사용할 수 있습니다! 🎉 + +### 포크한 저장소를 상위 원본 브랜치(main)과 동기화하기 (Hugging Face 저장소) [[sync-a-forked-repository-with-upstream-main-the-hugging-face-repository]] + +포크한 저장소의 main 브랜치를 업데이트할 때, 다음 단계를 따라 수행해주세요. 
이렇게 하면 각 upstream PR에 참조 노트가 추가되는 것을 피하고 이러한 PR에 관여하는 개발자들에게 불필요한 알림이 전송되는 것을 방지할 수 있습니다. + +1. 가능하면 포크된 저장소의 브랜치 및 PR을 사용하여 upstream과 동기화하지 마세요. 대신 포크된 main 저장소에 직접 병합하세요. +2. PR이 반드시 필요한 경우, 브랜치를 확인한 후 다음 단계를 사용하세요: + +```bash +git checkout -b your-branch-for-syncing +git pull --squash --no-commit upstream main +git commit -m '' +git push --set-upstream origin your-branch-for-syncing +``` \ No newline at end of file diff --git a/docs/source/ko/create_a_model.md b/docs/source/ko/create_a_model.md new file mode 100644 index 000000000000..8c7be3291e24 --- /dev/null +++ b/docs/source/ko/create_a_model.md @@ -0,0 +1,388 @@ + + +# 맞춤형 아키텍처 만들기[[create-a-custom-architecture]] + +[`AutoClass`](model_doc/auto)는 모델 아키텍처를 자동으로 추론하고 미리 학습된 configuration과 가중치를 다운로드합니다. 일반적으로 체크포인트에 구애받지 않는 코드를 생성하려면 `AutoClass`를 사용하는 것이 좋습니다. 하지만 특정 모델 파라미터를 보다 세밀하게 제어하고자 하는 사용자는 몇 가지 기본 클래스만으로 커스텀 🤗 Transformers 모델을 생성할 수 있습니다. 이는 🤗 Transformers 모델을 연구, 교육 또는 실험하는 데 관심이 있는 모든 사용자에게 특히 유용할 수 있습니다. 이 가이드에서는 'AutoClass'를 사용하지 않고 커스텀 모델을 만드는 방법에 대해 알아보겠습니다: + +- 모델 configuration을 가져오고 사용자 지정합니다. +- 모델 아키텍처를 생성합니다. +- 텍스트에 사용할 느리거나 빠른 토큰화기를 만듭니다. +- 비전 작업을 위한 이미지 프로세서를 생성합니다. +- 오디오 작업을 위한 특성 추출기를 생성합니다. +- 멀티모달 작업용 프로세서를 생성합니다. + +## Configuration[[configuration]] + +[configuration](main_classes/configuration)은 모델의 특정 속성을 나타냅니다. 각 모델 구성에는 서로 다른 속성이 있습니다. 예를 들어, 모든 NLP 모델에는 `hidden_size`, `num_attention_heads`, `num_hidden_layers` 및 `vocab_size` 속성이 공통으로 있습니다. 이러한 속성은 모델을 구성할 attention heads 또는 hidden layers의 수를 지정합니다. + +[DistilBERT](model_doc/distilbert) 속성을 검사하기 위해 [`DistilBertConfig`]에 접근하여 자세히 살펴봅니다: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +[`DistilBertConfig`]는 기본 [`DistilBertModel`]을 빌드하는 데 사용되는 모든 기본 속성을 표시합니다. 모든 속성은 커스터마이징이 가능하므로 실험을 위한 공간을 만들 수 있습니다. 예를 들어 기본 모델을 다음과 같이 커스터마이즈할 수 있습니다: + +- `activation` 파라미터로 다른 활성화 함수를 사용해 보세요. +- `attention_dropout` 파라미터를 사용하여 어텐션 확률에 더 높은 드롭아웃 비율을 사용하세요. + +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +사전 학습된 모델 속성은 [`~PretrainedConfig.from_pretrained`] 함수에서 수정할 수 있습니다: + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +모델 구성이 만족스러우면 [`~PretrainedConfig.save_pretrained`]로 저장할 수 있습니다. 
설정 파일은 지정된 작업 경로에 JSON 파일로 저장됩니다: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +configuration 파일을 재사용하려면 [`~PretrainedConfig.from_pretrained`]를 사용하여 가져오세요: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +``` + + + +configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configuration 속성과 기본 configuration 속성의 차이점만 저장할 수도 있습니다! 자세한 내용은 [configuration](main_classes/configuration) 문서를 참조하세요. + + + +## 모델[[model]] + +다음 단계는 [모델(model)](main_classes/models)을 만드는 것입니다. 느슨하게 아키텍처라고도 불리는 모델은 각 계층이 수행하는 동작과 발생하는 작업을 정의합니다. configuration의 `num_hidden_layers`와 같은 속성은 아키텍처를 정의하는 데 사용됩니다. 모든 모델은 기본 클래스 [`PreTrainedModel`]과 입력 임베딩 크기 조정 및 셀프 어텐션 헤드 가지 치기와 같은 몇 가지 일반적인 메소드를 공유합니다. 또한 모든 모델은 [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) 또는 [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module)의 서브클래스이기도 합니다. 즉, 모델은 각 프레임워크의 사용법과 호환됩니다. + + + +사용자 지정 configuration 속성을 모델에 가져옵니다: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> model = DistilBertModel(my_config) +``` + +이제 사전 학습된 가중치 대신 임의의 값을 가진 모델이 생성됩니다. 이 모델을 훈련하기 전까지는 유용하게 사용할 수 없습니다. 훈련은 비용과 시간이 많이 소요되는 프로세스입니다. 일반적으로 훈련에 필요한 리소스의 일부만 사용하면서 더 나은 결과를 더 빨리 얻으려면 사전 훈련된 모델을 사용하는 것이 좋습니다. + +사전 학습된 모델을 [`~PreTrainedModel.from_pretrained`]로 생성합니다: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +🤗 Transformers에서 제공한 모델의 사전 학습된 가중치를 사용하는 경우 기본 모델 configuration을 자동으로 불러옵니다. 그러나 원하는 경우 기본 모델 configuration 속성의 일부 또는 전부를 사용자 지정으로 바꿀 수 있습니다: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + +사용자 지정 configuration 속성을 모델에 불러옵니다: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +이제 사전 학습된 가중치 대신 임의의 값을 가진 모델이 생성됩니다. 이 모델을 훈련하기 전까지는 유용하게 사용할 수 없습니다. 훈련은 비용과 시간이 많이 소요되는 프로세스입니다. 일반적으로 훈련에 필요한 리소스의 일부만 사용하면서 더 나은 결과를 더 빨리 얻으려면 사전 훈련된 모델을 사용하는 것이 좋습니다. + +사전 학습된 모델을 [`~TFPreTrainedModel.from_pretrained`]로 생성합니다: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +``` + +🤗 Transformers에서 제공한 모델의 사전 학습된 가중치를 사용하는 경우 기본 모델 configuration을 자동으로 불러옵니다. 그러나 원하는 경우 기본 모델 configuration 속성의 일부 또는 전부를 사용자 지정으로 바꿀 수 있습니다: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +``` + + + +### 모델 헤드[[model-heads]] + +이 시점에서 *은닉 상태(hidden state)*를 출력하는 기본 DistilBERT 모델을 갖게 됩니다. 은닉 상태는 최종 출력을 생성하기 위해 모델 헤드에 입력으로 전달됩니다. 🤗 Transformers는 모델이 해당 작업을 지원하는 한 각 작업마다 다른 모델 헤드를 제공합니다(즉, 번역과 같은 시퀀스 간 작업에는 DistilBERT를 사용할 수 없음). + + + +예를 들어, [`DistilBertForSequenceClassification`]은 시퀀스 분류 헤드가 있는 기본 DistilBERT 모델입니다. 시퀀스 분류 헤드는 풀링된 출력 위에 있는 선형 레이어입니다. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +다른 모델 헤드로 전환하여 이 체크포인트를 다른 작업에 쉽게 재사용할 수 있습니다. 질의응답 작업의 경우, [`DistilBertForQuestionAnswering`] 모델 헤드를 사용할 수 있습니다. 질의응답 헤드는 숨겨진 상태 출력 위에 선형 레이어가 있다는 점을 제외하면 시퀀스 분류 헤드와 유사합니다. 
+ +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + +예를 들어, [`TFDistilBertForSequenceClassification`]은 시퀀스 분류 헤드가 있는 기본 DistilBERT 모델입니다. 시퀀스 분류 헤드는 풀링된 출력 위에 있는 선형 레이어입니다. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +``` + +다른 모델 헤드로 전환하여 이 체크포인트를 다른 작업에 쉽게 재사용할 수 있습니다. 질의응답 작업의 경우, [`TFDistilBertForQuestionAnswering`] 모델 헤드를 사용할 수 있습니다. 질의응답 헤드는 숨겨진 상태 출력 위에 선형 레이어가 있다는 점을 제외하면 시퀀스 분류 헤드와 유사합니다. + +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +``` + + + +## 토크나이저[[tokenizer]] + +텍스트 데이터에 모델을 사용하기 전에 마지막으로 필요한 기본 클래스는 원시 텍스트를 텐서로 변환하는 [토크나이저](main_classes/tokenizer)입니다. 🤗 Transformers에 사용할 수 있는 토크나이저는 두 가지 유형이 있습니다: + +- [`PreTrainedTokenizer`]: 파이썬으로 구현된 토크나이저입니다. +- [`PreTrainedTokenizerFast`]: Rust 기반 [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) 라이브러리로 만들어진 토크나이저입니다. 이 토크나이저는 Rust로 구현되어 배치 토큰화에서 특히 빠릅니다. 빠른 토크나이저는 토큰을 원래 단어나 문자에 매핑하는 *오프셋 매핑*과 같은 추가 메소드도 제공합니다. +두 토크나이저 모두 인코딩 및 디코딩, 새 토큰 추가, 특수 토큰 관리와 같은 일반적인 방법을 지원합니다. + + + +모든 모델이 빠른 토크나이저를 지원하는 것은 아닙니다. 이 [표](index#supported-frameworks)에서 모델의 빠른 토크나이저 지원 여부를 확인하세요. + + + +토크나이저를 직접 학습한 경우, *어휘(vocabulary)* 파일에서 토크나이저를 만들 수 있습니다: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +사용자 지정 토크나이저의 어휘는 사전 학습된 모델의 토크나이저에서 생성된 어휘와 다를 수 있다는 점을 기억하는 것이 중요합니다. 사전 학습된 모델을 사용하는 경우 사전 학습된 모델의 어휘를 사용해야 하며, 그렇지 않으면 입력이 의미를 갖지 못합니다. [`DistilBertTokenizer`] 클래스를 사용하여 사전 학습된 모델의 어휘로 토크나이저를 생성합니다: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +``` + +[`DistilBertTokenizerFast`] 클래스로 빠른 토크나이저를 생성합니다: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +``` + + + +[`AutoTokenizer`]는 기본적으로 빠른 토크나이저를 가져오려고 합니다. 이 동작을 비활성화하려면 `from_pretrained`에서 `use_fast=False`를 설정하면 됩니다. + + + +## 이미지 프로세서[[image-processor]] + +이미지 프로세서(image processor)는 비전 입력을 처리합니다. 기본 [`~image_processing_utils.ImageProcessingMixin`] 클래스에서 상속합니다. + +사용하려면 사용 중인 모델과 연결된 이미지 프로세서를 생성합니다. 예를 들어, 이미지 분류에 [ViT](model_doc/vit)를 사용하는 경우 기본 [`ViTImageProcessor`]를 생성합니다: + +```py +>>> from transformers import ViTImageProcessor + +>>> vit_extractor = ViTImageProcessor() +>>> print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +사용자 지정을 원하지 않는 경우 `from_pretrained` 메소드를 사용하여 모델의 기본 이미지 프로세서 매개변수를 불러오면 됩니다. 
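+
+예를 들어, 아래는 사전 학습된 체크포인트에서 기본 이미지 프로세서 설정을 그대로 불러오는 간단한 예시입니다. 여기서 사용한 `google/vit-base-patch16-224` 체크포인트 이름은 설명을 위한 예시이며, 실제로는 사용 중인 모델의 체크포인트로 바꿔야 합니다:
+
+```py
+>>> from transformers import ViTImageProcessor
+
+>>> # 체크포인트 이름은 예시입니다. 사용 중인 모델의 체크포인트로 바꿔주세요.
+>>> vit_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```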
+ + + +사용자 지정 이미지 프로세서를 생성하려면 [`ViTImageProcessor`] 파라미터를 수정합니다: + +```py +>>> from transformers import ViTImageProcessor + +>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "feature_extractor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` + +## 특성 추출기[[feature-extractor]] + +특성 추출기(feature extractor)는 오디오 입력을 처리합니다. 기본 [`~feature_extraction_utils.FeatureExtractionMixin`] 클래스에서 상속되며, 오디오 입력을 처리하기 위해 [`SequenceFeatureExtractor`] 클래스에서 상속할 수도 있습니다. + +사용하려면 사용 중인 모델과 연결된 특성 추출기를 생성합니다. 예를 들어, 오디오 분류에 [Wav2Vec2](model_doc/wav2vec2)를 사용하는 경우 기본 [`Wav2Vec2FeatureExtractor`]를 생성합니다: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + + + +사용자 지정이 필요하지 않은 경우 `from_pretrained` 메소드를 사용하여 모델의 기본 특성 추출기 ㅁ개변수를 불러 오면 됩니다. + + + +사용자 지정 특성 추출기를 만들려면 [`Wav2Vec2FeatureExtractor`] 매개변수를 수정합니다: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} +``` + + +## 프로세서[[processor]] + +멀티모달 작업을 지원하는 모델의 경우, 🤗 Transformers는 특성 추출기 및 토크나이저와 같은 처리 클래스를 단일 객체로 편리하게 래핑하는 프로세서 클래스를 제공합니다. 예를 들어, 자동 음성 인식 작업(Automatic Speech Recognition task (ASR))에 [`Wav2Vec2Processor`]를 사용한다고 가정해 보겠습니다. 자동 음성 인식 작업은 오디오를 텍스트로 변환하므로 특성 추출기와 토크나이저가 필요합니다. + +오디오 입력을 처리할 특성 추출기를 만듭니다: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +텍스트 입력을 처리할 토크나이저를 만듭니다: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +[`Wav2Vec2Processor`]에서 특성 추출기와 토크나이저를 결합합니다: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +configuration과 모델이라는 두 가지 기본 클래스와 추가 전처리 클래스(토크나이저, 이미지 프로세서, 특성 추출기 또는 프로세서)를 사용하면 🤗 Transformers에서 지원하는 모든 모델을 만들 수 있습니다. 이러한 각 기본 클래스는 구성이 가능하므로 원하는 특정 속성을 사용할 수 있습니다. 학습을 위해 모델을 쉽게 설정하거나 기존의 사전 학습된 모델을 수정하여 미세 조정할 수 있습니다. diff --git a/docs/source/ko/custom_models.md b/docs/source/ko/custom_models.md new file mode 100644 index 000000000000..72dad7caaff2 --- /dev/null +++ b/docs/source/ko/custom_models.md @@ -0,0 +1,346 @@ + + +# 사용자 정의 모델 공유하기[[sharing-custom-models]] + +🤗 Transformers 라이브러리는 쉽게 확장할 수 있도록 설계되었습니다. +모든 모델은 추상화 없이 저장소의 지정된 하위 폴더에 완전히 코딩되어 있으므로, 손쉽게 모델링 파일을 복사하고 필요에 따라 조정할 수 있습니다. + +완전히 새로운 모델을 만드는 경우에는 처음부터 시작하는 것이 더 쉬울 수 있습니다. +이 튜토리얼에서는 Transformers 내에서 사용할 수 있도록 사용자 정의 모델과 구성을 작성하는 방법과 +🤗 Transformers 라이브러리에 없는 경우에도 누구나 사용할 수 있도록 (의존성과 함께) 커뮤니티에 공유하는 방법을 배울 수 있습니다. 
+ +[timm 라이브러리](https://github.com/rwightman/pytorch-image-models)의 ResNet 클래스를 [`PreTrainedModel`]로 래핑한 ResNet 모델을 예로 모든 것을 설명합니다. + +## 사용자 정의 구성 작성하기[[writing-a-custom-configuration]] + +모델에 들어가기 전에 먼저 구성을 작성해보도록 하겠습니다. +모델의 `configuration`은 모델을 만들기 위해 필요한 모든 중요한 것들을 포함하고 있는 객체입니다. +다음 섹션에서 볼 수 있듯이, 모델은 `config`를 사용해서만 초기화할 수 있기 때문에 완벽한 구성이 필요합니다. + +아래 예시에서는 ResNet 클래스의 인수(argument)를 조정해보겠습니다. +다른 구성은 가능한 ResNet 중 다른 유형을 제공합니다. +그런 다음 몇 가지 유효성을 확인한 후 해당 인수를 저장합니다. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` + +사용자 정의 `configuration`을 작성할 때 기억해야 할 세 가지 중요한 사항은 다음과 같습니다: +- `PretrainedConfig`을 상속해야 합니다. +- `PretrainedConfig`의 `__init__`은 모든 kwargs를 허용해야 하고, +- 이러한 `kwargs`는 상위 클래스 `__init__`에 전달되어야 합니다. + +상속은 🤗 Transformers 라이브러리에서 모든 기능을 가져오는 것입니다. +이러한 점으로부터 비롯되는 두 가지 제약 조건은 `PretrainedConfig`에 설정하는 것보다 더 많은 필드가 있습니다. +`from_pretrained` 메서드로 구성을 다시 로드할 때 해당 필드는 구성에서 수락한 후 상위 클래스로 보내야 합니다. + +모델을 auto 클래스에 등록하지 않는 한, `configuration`에서 `model_type`을 정의(여기서 `model_type="resnet"`)하는 것은 필수 사항이 아닙니다 (마지막 섹션 참조). + +이렇게 하면 라이브러리의 다른 모델 구성과 마찬가지로 구성을 쉽게 만들고 저장할 수 있습니다. +다음은 resnet50d 구성을 생성하고 저장하는 방법입니다: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +이렇게 하면 `custom-resnet` 폴더 안에 `config.json`이라는 파일이 저장됩니다. +그런 다음 `from_pretrained` 메서드를 사용하여 구성을 다시 로드할 수 있습니다. + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +구성을 Hub에 직접 업로드하기 위해 [`PretrainedConfig`] 클래스의 [`~PretrainedConfig.push_to_hub`]와 같은 다른 메서드를 사용할 수 있습니다. + + +## 사용자 정의 모델 작성하기[[writing-a-custom-model]] + +이제 ResNet 구성이 있으므로 모델을 작성할 수 있습니다. +실제로는 두 개를 작성할 것입니다. 하나는 이미지 배치에서 hidden features를 추출하는 것([`BertModel`]과 같이), 다른 하나는 이미지 분류에 적합한 것입니다([`BertForSequenceClassification`]과 같이). + +이전에 언급했듯이 이 예제에서는 단순하게 하기 위해 모델의 느슨한 래퍼(loose wrapper)만 작성할 것입니다. +이 클래스를 작성하기 전에 블록 유형과 실제 블록 클래스 간의 매핑 작업만 하면 됩니다. 
+그런 다음 `ResNet` 클래스로 전달되어 `configuration`을 통해 모델이 선언됩니다: + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +이미지 분류 모델을 만들기 위해서는 forward 메소드만 변경하면 됩니다: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` + +두 경우 모두 `PreTrainedModel`를 상속받고, `config`를 통해 상위 클래스 초기화를 호출하다는 점을 기억하세요 (일반적인 `torch.nn.Module`을 작성할 때와 비슷함). +모델을 auto 클래스에 등록하고 싶은 경우에는 `config_class`를 설정하는 부분이 필수입니다 (마지막 섹션 참조). + + + +라이브러리에 존재하는 모델과 굉장히 유사하다면, 모델을 생성할 때 구성을 참조해 재사용할 수 있습니다. + + + +원하는 것을 모델이 반환하도록 할 수 있지만, `ResnetModelForImageClassification`에서 했던 것 처럼 +레이블을 통과시켰을 때 손실과 함께 사전 형태로 반환하는 것이 [`Trainer`] 클래스 내에서 직접 모델을 사용하기에 유용합니다. +자신만의 학습 루프 또는 다른 학습 라이브러리를 사용할 계획이라면 다른 출력 형식을 사용해도 좋습니다. + +이제 모델 클래스가 있으므로 하나 생성해 보겠습니다: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +다시 말하지만, [`~PreTrainedModel.save_pretrained`]또는 [`~PreTrainedModel.push_to_hub`]처럼 [`PreTrainedModel`]에 속하는 모든 메소드를 사용할 수 있습니다. +다음 섹션에서 두 번째 메소드를 사용해 모델 코드와 모델 가중치를 업로드하는 방법을 살펴보겠습니다. +먼저, 모델 내부에 사전 훈련된 가중치를 로드해 보겠습니다. + +이 예제를 활용할 때는, 사용자 정의 모델을 자신만의 데이터로 학습시킬 것입니다. +이 튜토리얼에서는 빠르게 진행하기 위해 사전 훈련된 resnet50d를 사용하겠습니다. +아래 모델은 resnet50d의 래퍼이기 때문에, 가중치를 쉽게 로드할 수 있습니다. + + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +이제 [`~PreTrainedModel.save_pretrained`] 또는 [`~PreTrainedModel.push_to_hub`]를 사용할 때 모델 코드가 저장되는지 확인해봅시다. + +## Hub로 코드 업로드하기[[sending-the-code-to-the-hub]] + + + +이 API는 실험적이며 다음 릴리스에서 약간의 변경 사항이 있을 수 있습니다. + + + +먼저 모델이 `.py` 파일에 완전히 정의되어 있는지 확인하세요. +모든 파일이 동일한 작업 경로에 있기 때문에 상대경로 임포트(relative import)에 의존할 수 있습니다 (transformers에서는 이 기능에 대한 하위 모듈을 지원하지 않습니다). +이 예시에서는 작업 경로 안의 `resnet_model`에서 `modeling_resnet.py` 파일과 `configuration_resnet.py` 파일을 정의합니다. +구성 파일에는 `ResnetConfig`에 대한 코드가 있고 모델링 파일에는 `ResnetModel` 및 `ResnetModelForImageClassification`에 대한 코드가 있습니다. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +Python이 `resnet_model`을 모듈로 사용할 수 있도록 감지하는 목적이기 때문에 `__init__.py`는 비어 있을 수 있습니다. 
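+
+`__init__.py`를 비워 두어도 되지만, 로컬에서 임포트를 간편하게 하기 위해 다음과 같이 클래스를 다시 내보내는(re-export) 내용을 담을 수도 있습니다. 이는 필수가 아닌 선택적인 예시입니다:
+
+```py
+# resnet_model/__init__.py (선택 사항인 예시입니다. 비워 두어도 됩니다.)
+from .configuration_resnet import ResnetConfig
+from .modeling_resnet import ResnetModel, ResnetModelForImageClassification
+```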
+ + + +라이브러리에서 모델링 파일을 복사하는 경우, +모든 파일 상단에 있는 상대 경로 임포트(relative import) 부분을 `transformers` 패키지에서 임포트 하도록 변경해야 합니다. + + + +기존 구성이나 모델을 재사용(또는 서브 클래스화)할 수 있습니다. + +커뮤니티에 모델을 공유하기 위해서는 다음 단계를 따라야 합니다: +먼저, 새로 만든 파일에 ResNet 모델과 구성을 임포트합니다: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +다음으로 `save_pretrained` 메소드를 사용해 해당 객체의 코드 파일을 복사하고, +복사한 파일을 Auto 클래스로 등록하고(모델인 경우) 실행합니다: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +`configuration`에 대한 auto 클래스를 지정할 필요는 없지만(`configuration` 관련 auto 클래스는 AutoConfig 클래스 하나만 있음), 모델의 경우에는 지정해야 합니다. +사용자 지정 모델은 다양한 작업에 적합할 수 있으므로, 모델에 맞는 auto 클래스를 지정해야 합니다. + +다음으로, 이전에 작업했던 것과 마찬가지로 구성과 모델을 작성합니다: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +이제 모델을 Hub로 업로드하기 위해 로그인 상태인지 확인하세요. +터미널에서 다음 코드를 실행해 확인할 수 있습니다: + +```bash +huggingface-cli login +``` + +주피터 노트북의 경우에는 다음과 같습니다: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +그런 다음 이렇게 자신의 네임스페이스(또는 자신이 속한 조직)에 업로드할 수 있습니다: +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +On top of the modeling weights and the configuration in json format, this also copied the modeling and +configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result +in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). +json 형식의 모델링 가중치와 구성 외에도 `custom-resnet50d` 폴더 안의 모델링과 구성 `.py` 파일을 복사하해 Hub에 업로드합니다. +[모델 저장소](https://huggingface.co/sgugger/custom-resnet50d)에서 결과를 확인할 수 있습니다. + +[sharing tutorial](model_sharing) 문서의 `push_to_hub` 메소드에서 자세한 내용을 확인할 수 있습니다. + + +## 사용자 정의 코드로 모델 사용하기[[using-a-model-with-custom-code]] + +auto 클래스와 `from_pretrained` 메소드를 사용하여 사용자 지정 코드 파일과 함께 모든 구성, 모델, 토크나이저를 사용할 수 있습니다. +Hub에 업로드된 모든 파일 및 코드는 멜웨어가 있는지 검사되지만 (자세한 내용은 [Hub 보안](https://huggingface.co/docs/hub/security#malware-scanning) 설명 참조), +자신의 컴퓨터에서 모델 코드와 작성자가 악성 코드를 실행하지 않는지 확인해야 합니다. +사용자 정의 코드로 모델을 사용하려면 `trust_remote_code=True`로 설정하세요: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +모델 작성자가 악의적으로 코드를 업데이트하지 않았다는 점을 확인하기 위해, 커밋 해시(commit hash)를 `revision`으로 전달하는 것도 강력히 권장됩니다 (모델 작성자를 완전히 신뢰하지 않는 경우). + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Hub에서 모델 저장소의 커밋 기록을 찾아볼 때, 모든 커밋의 커밋 해시를 쉽게 복사할 수 있는 버튼이 있습니다. + +## 사용자 정의 코드로 만든 모델을 auto 클래스로 등록하기[[registering-a-model-with-custom-code-to-the-auto-classes]] + +🤗 Transformers를 상속하는 라이브러리를 작성하는 경우 사용자 정의 모델을 auto 클래스에 추가할 수 있습니다. +사용자 정의 모델을 사용하기 위해 해당 라이브러리를 임포트해야 하기 때문에, 이는 Hub로 코드를 업로드하는 것과 다릅니다 (Hub에서 자동적으로 모델 코드를 다운로드 하는 것과 반대). 
+ +구성에 기존 모델 유형과 다른 `model_type` 속성이 있고 모델 클래스에 올바른 `config_class` 속성이 있는 한, +다음과 같이 auto 클래스에 추가할 수 있습니다: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +사용자 정의 구성을 [`AutoConfig`]에 등록할 때 사용되는 첫 번째 인수는 사용자 정의 구성의 `model_type`과 일치해야 합니다. +또한, 사용자 정의 모델을 auto 클래스에 등록할 때 사용되는 첫 번째 인수는 해당 모델의 `config_class`와 일치해야 합니다. \ No newline at end of file diff --git a/docs/source/ko/custom_tools.md b/docs/source/ko/custom_tools.md new file mode 100644 index 000000000000..87017a68b524 --- /dev/null +++ b/docs/source/ko/custom_tools.md @@ -0,0 +1,748 @@ + + +# 사용자 정의 도구와 프롬프트[[custom-tools-and-prompts]] + + + +Transformers와 관련하여 어떤 도구와 에이전트가 있는지 잘 모르신다면 [Transformers Agents](transformers_agents) 페이지를 먼저 읽어보시기 바랍니다. + + + + + +Transformers Agents는 실험 중인 API로 언제든지 변경될 수 있습니다. +API 또는 기반 모델이 변경되기 쉽기 때문에 에이전트가 반환하는 결과도 달라질 수 있습니다. + + + +에이전트에게 권한을 부여하고 새로운 작업을 수행하게 하려면 사용자 정의 도구와 프롬프트를 만들고 사용하는 것이 무엇보다 중요합니다. +이 가이드에서는 다음과 같은 내용을 살펴보겠습니다: + +- 프롬프트를 사용자 정의하는 방법 +- 사용자 정의 도구를 사용하는 방법 +- 사용자 정의 도구를 만드는 방법 + +## 프롬프트를 사용자 정의하기[[customizing-the-prompt]] + +[Transformers Agents](transformers_agents)에서 설명한 것처럼 에이전트는 [`~Agent.run`] 및 [`~Agent.chat`] 모드에서 실행할 수 있습니다. +`run`(실행) 모드와 `chat`(채팅) 모드 모두 동일한 로직을 기반으로 합니다. +에이전트를 구동하는 언어 모델은 긴 프롬프트에 따라 조건이 지정되고, 중지 토큰에 도달할 때까지 다음 토큰을 생성하여 프롬프트를 완수합니다. +`chat` 모드에서는 프롬프트가 이전 사용자 입력 및 모델 생성으로 연장된다는 점이 두 모드의 유일한 차이점입니다. +이를 통해 에이전트가 과거 상호작용에 접근할 수 있게 되므로 에이전트에게 일종의 메모리를 제공하는 셈입니다. + +### 프롬프트의 구조[[structure-of-the-prompt]] + +어떻게 프롬프트 사용자 정의를 잘 할 수 있는지 이해하기 위해 프롬프트의 구조를 자세히 살펴봅시다. +프롬프트는 크게 네 부분으로 구성되어 있습니다. + +- 1. 도입: 에이전트가 어떻게 행동해야 하는지, 도구의 개념에 대한 설명. +- 2. 모든 도구에 대한 설명. 이는 런타임에 사용자가 정의/선택한 도구로 동적으로 대체되는 `<>` 토큰으로 정의됩니다. +- 3. 작업 예제 및 해당 솔루션 세트. +- 4. 현재 예제 및 해결 요청. + +각 부분을 더 잘 이해할 수 있도록 짧은 버전을 통해 `run` 프롬프트가 어떻게 보이는지 살펴보겠습니다: + +````text +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. +[...] +You can print intermediate results if it makes sense to do so. + +Tools: +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English. +[...] + +Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French." + +I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + +Answer: +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +``` + +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." 
+ +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +[...] + +Task: "Draw me a picture of rivers and lakes" + +I will use the following +```` + +도입(*"도구:"* 앞의 텍스트)에서는 모델이 어떻게 작동하고 무엇을 해야 하는지 정확하게 설명합니다. +에이전트는 항상 같은 방식으로 작동해야 하므로 이 부분은 사용자 정의할 필요가 없을 가능성이 높습니다. + +두 번째 부분(*"도구"* 아래의 글머리 기호)은 `run` 또는 `chat`을 호출할 때 동적으로 추가됩니다. +정확히 `agent.toolbox`에 있는 도구 수만큼 글머리 기호가 있고, 각 글머리 기호는 도구의 이름과 설명으로 구성됩니다: + +```text +- : +``` + +문서 질의응답 도구를 가져오고 이름과 설명을 출력해서 빠르게 확인해 보겠습니다. + +```py +from transformers import load_tool + +document_qa = load_tool("document-question-answering") +print(f"- {document_qa.name}: {document_qa.description}") +``` + +그러면 다음 결과가 출력됩니다: +```text +- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +``` + +여기서 도구 이름이 짧고 정확하다는 것을 알 수 있습니다. +설명은 두 부분으로 구성되어 있는데, 첫 번째 부분에서는 도구의 기능을 설명하고 두 번째 부분에서는 예상되는 입력 인수와 반환 값을 명시합니다. + +에이전트가 도구를 올바르게 사용하려면 좋은 도구 이름과 도구 설명이 매우 중요합니다. +에이전트가 도구에 대해 알 수 있는 유일한 정보는 이름과 설명뿐이므로, 이 두 가지를 정확하게 작성하고 도구 상자에 있는 기존 도구의 스타일과 일치하는지 확인해야 합니다. +특히 이름에 따라 예상되는 모든 인수가 설명에 코드 스타일로 언급되어 있는지, 예상되는 유형과 그 유형이 무엇인지에 대한 설명이 포함되어 있는지 확인하세요. + + + +도구에 어떤 이름과 설명이 있어야 하는지 이해하려면 엄선된 Transformers 도구의 이름과 설명을 확인하세요. +[`Agent.toolbox`] 속성을 가진 모든 도구를 볼 수 있습니다. + + + +세 번째 부분에는 에이전트가 어떤 종류의 사용자 요청에 대해 어떤 코드를 생성해야 하는지 정확하게 보여주는 엄선된 예제 세트가 포함되어 있습니다. +에이전트를 지원하는 대규모 언어 모델은 프롬프트에서 패턴을 인식하고 새로운 데이터로 패턴을 반복하는 데 매우 능숙합니다. +따라서 에이전트가 실제로 올바른 실행 가능한 코드를 생성할 가능성을 극대화하는 방식으로 예제를 작성하는 것이 매우 중요합니다. + +한 가지 예를 살펴보겠습니다: + +````text +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. + +Answer: +```py +answer = document_qa(document, question="What is the oldest person?") +print(f"The answer is {answer}.") +image = image_generator("A banner showing " + answer) +``` + +```` +작업 설명, 에이전트가 수행하려는 작업에 대한 설명, 마지막으로 생성된 코드, 이 세 부분으로 구성된 프롬프트는 모델에 반복하여 제공됩니다. +프롬프트의 일부인 모든 예제는 이러한 정확한 패턴으로 되어 있으므로, 에이전트가 새 토큰을 생성할 때 정확히 동일한 패턴을 재현할 수 있습니다. + +프롬프트 예제는 Transformers 팀이 선별하고 일련의 [problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)에 따라 엄격하게 평가하여 +에이전트의 프롬프트가 에이전트의 실제 사용 사례를 최대한 잘 해결할 수 있도록 보장합니다. + +프롬프트의 마지막 부분은 다음에 해당합니다: +```text +Task: "Draw me a picture of rivers and lakes" + +I will use the following +``` + +이는 에이전트가 완료해야 할 최종적인 미완성 예제입니다. 미완성 예제는 실제 사용자 입력에 따라 동적으로 만들어집니다. +위 예시의 경우 사용자가 다음과 같이 실행했습니다: + +```py +agent.run("Draw me a picture of rivers and lakes") +``` + +사용자 입력 - *즉* Task: *"Draw me a picture of rivers and lakes"*가 프롬프트 템플릿에 맞춰 "Task: \n\n I will use the following"로 캐스팅됩니다. +이 문장은 에이전트에게 조건이 적용되는 프롬프트의 마지막 줄을 구성하므로 에이전트가 이전 예제에서 수행한 것과 정확히 동일한 방식으로 예제를 완료하도록 강력하게 영향을 미칩니다. + +너무 자세히 설명하지 않더라도 채팅 템플릿의 프롬프트 구조는 동일하지만 예제의 스타일이 약간 다릅니다. *예를 들면*: + +````text +[...] + +===== + +Human: Answer the question in the variable `question` about the image stored in the variable `image`. 
+ +Assistant: I will use the tool `image_qa` to answer the question on the input image. + +```py +answer = image_qa(text=question, image=image) +print(f"The answer is {answer}") +``` + +Human: I tried this code, it worked but didn't give me a good result. The question is in French + +Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this. + +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(text=translated_question, image=image) +print(f"The answer is {answer}") +``` + +===== + +[...] +```` + +`run` 프롬프트의 예와는 반대로, 각 `chat` 프롬프트의 예에는 *Human(사람)*과 *Assistant(어시스턴트)* 간에 하나 이상의 교환이 있습니다. 모든 교환은 `run` 프롬프트의 예와 유사한 구조로 되어 있습니다. +사용자의 입력이 *Human:* 뒤에 추가되며, 에이전트에게 코드를 생성하기 전에 수행해야 할 작업을 먼저 생성하라는 메시지가 표시됩니다. +교환은 이전 교환을 기반으로 할 수 있으므로 위와 같이 사용자가 "**이** 코드를 시도했습니다"라고 입력하면 이전에 생성된 에이전트의 코드를 참조하여 과거 교환을 참조할 수 있습니다. + +`.chat`을 실행하면 사용자의 입력 또는 *작업*이 미완성된 양식의 예시로 캐스팅됩니다: +```text +Human: \n\nAssistant: +``` +그러면 에이전트가 이를 완성합니다. `run` 명령과 달리 `chat` 명령은 완료된 예제를 프롬프트에 추가하여 에이전트에게 다음 `chat` 차례에 대한 더 많은 문맥을 제공합니다. + +이제 프롬프트가 어떻게 구성되어 있는지 알았으니 어떻게 사용자 정의할 수 있는지 살펴봅시다! + +### 좋은 사용자 입력 작성하기[[writing-good-user-inputs]] + +대규모 언어 모델이 사용자의 의도를 이해하는 능력이 점점 더 향상되고 있지만, 에이전트가 올바른 작업을 선택할 수 있도록 최대한 정확성을 유지하는 것은 큰 도움이 됩니다. +최대한 정확하다는 것은 무엇을 의미할까요? + +에이전트는 프롬프트에서 도구 이름 목록과 해당 설명을 볼 수 있습니다. +더 많은 도구가 추가될수록 에이전트가 올바른 도구를 선택하기가 더 어려워지고 실행할 도구의 올바른 순서를 선택하는 것은 더욱 어려워집니다. +일반적인 실패 사례를 살펴보겠습니다. 여기서는 분석할 코드만 반환하겠습니다. + +```py +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") + +agent.run("Show me a tree", return_code=True) +``` + +그러면 다음 결과가 출력됩니다: + +```text +==Explanation from the agent== +I will use the following tool: `image_segmenter` to create a segmentation mask for the image. + + +==Code generated by the agent== +mask = image_segmenter(image, prompt="tree") +``` + +우리가 원했던 결과가 아닐 수도 있습니다. 대신 나무 이미지가 생성되기를 원할 가능성이 더 높습니다. +따라서 에이전트가 특정 도구를 사용하도록 유도하려면 도구의 이름과 설명에 있는 중요한 키워드를 사용하는 것이 매우 유용할 수 있습니다. 한번 살펴보겠습니다. +```py +agent.toolbox["image_generator"].description +``` + +```text +'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image. +``` + +이름과 설명은 "image", "prompt", "create" 및 "generate" 키워드를 사용합니다. 이 단어들을 사용하면 더 잘 작동할 가능성이 높습니다. 프롬프트를 조금 더 구체화해 보겠습니다. + +```py +agent.run("Create an image of a tree", return_code=True) +``` + +이 코드는 다음 프롬프트를 만들어냅니다: +```text +==Explanation from the agent== +I will use the following tool `image_generator` to generate an image of a tree. + + +==Code generated by the agent== +image = image_generator(prompt="tree") +``` + +훨씬 낫네요! 저희가 원했던 것과 비슷해 보입니다. +즉, 에이전트가 작업을 올바른 도구에 올바르게 매핑하는 데 어려움을 겪고 있다면 도구 이름과 설명에서 가장 관련성이 높은 키워드를 찾아보고 이를 통해 작업 요청을 구체화해 보세요. + +### 도구 설명 사용자 정의하기[[customizing-the-tool-descriptions]] + +앞서 살펴본 것처럼 에이전트는 각 도구의 이름과 설명에 액세스할 수 있습니다. +기본 도구에는 매우 정확한 이름과 설명이 있어야 하지만 특정 사용 사례에 맞게 도구의 설명이나 이름을 변경하는 것이 도움이 될 수도 있습니다. +이는 매우 유사한 여러 도구를 추가했거나 특정 도메인(*예*: 이미지 생성 및 변환)에만 에이전트를 사용하려는 경우에 특히 중요해질 수 있습니다. + +일반적인 문제는 이미지 생성 작업에 많이 사용되는 경우 에이전트가 이미지 생성과 이미지 변환/수정을 혼동하는 것입니다. 
*예를 들어,* +```py +agent.run("Make an image of a house and a car", return_code=True) +``` +그러면 다음 결과가 출력됩니다: +```text +==Explanation from the agent== +I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house. + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +house_car_image = image_transformer(image=car_image, prompt="A house") +``` + +결과물이 우리가 여기서 원하는 것과 정확히 일치하지 않을 수 있습니다. 에이전트가 `image_generator`와 `image_transformer`의 차이점을 이해하기 어려워서 두 가지를 함께 사용하는 경우가 많은 것 같습니다. + +여기서 `image_transformer`의 도구 이름과 설명을 변경하여 에이전트가 도울 수 있습니다. +"image" 및 "prompt"와 약간 분리하기 위해 `modifier`라고 대신 부르겠습니다: +```py +agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer") +agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace( + "transforms an image according to a prompt", "modifies an image" +) +``` + +이제 "modify"은 새 이미지 프로세서를 사용하라는 강력한 신호이므로 위의 프롬프트에 도움이 될 것입니다. 다시 실행해 봅시다. + +```py +agent.run("Make an image of a house and a car", return_code=True) +``` + +여기서 다음과 같은 결과를 얻게 됩니다: +```text +==Explanation from the agent== +I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car. + + +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +``` + +우리가 염두에 두었던 것과 확실히 더 가까워졌습니다! 하지만 집과 자동차가 모두 같은 이미지에 포함되면 좋겠습니다. 작업을 단일 이미지 생성에 더 집중하면 도움이 될 것입니다: + +```py +agent.run("Create image: 'A house and car'", return_code=True) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_generator` to generate an image. + + +==Code generated by the agent== +image = image_generator(prompt="A house and car") +``` + + + +에이전트는 여전히 특히 여러 개체의 이미지를 생성하는 것과 같이 약간 더 복잡한 사용 사례에서 취약한 경우가 많습니다. +앞으로 몇 달 안에 에이전트 자체와 기본 프롬프트가 더욱 개선되어 에이전트가 다양한 사용자 입력에 더욱 강력하게 대응할 수 있도록 할 예정입니다. + + + +### 전체 프롬프트 사용자 정의하기[[customizing-the-whole-prompt]] + +사용자에게 최대한의 유연성을 제공하기 위해 [위](#structure-of-the-prompt)에 설명된 전체 프롬프트 템플릿을 사용자가 덮어쓸 수 있습니다. +이 경우 사용자 정의 프롬프트에 소개 섹션, 도구 섹션, 예제 섹션 및 미완성 예제 섹션이 포함되어 있는지 확인하세요. +`run` 프롬프트 템플릿을 덮어쓰려면 다음과 같이 하면 됩니다: + +```py +template = """ [...] """ + +agent = HfAgent(your_endpoint, run_prompt_template=template) +``` + + + +에이전트가 사용 가능한 도구를 인식하고 사용자의 프롬프트를 올바르게 삽입할 수 있도록 `<>` 문자열과 `<>`를 `template` 어딘가에 정의해야 합니다. + + + +마찬가지로 `chat` 프롬프트 템플릿을 덮어쓸 수 있습니다. `chat` 모드에서는 항상 다음과 같은 교환 형식을 사용한다는 점에 유의하세요: + +```text +Human: <> + +Assistant: +``` + +따라서 사용자 정의 `chat` 프롬프트 템플릿의 예제에서도 이 형식을 사용하는 것이 중요합니다. +다음과 같이 인스턴스화 할 때 `chat` 템플릿을 덮어쓸 수 있습니다. + +``` +template = """ [...] """ + +agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template) +``` + + + +에이전트가 사용 가능한 도구를 인식할 수 있도록 `<>` 문자열을 `template` 어딘가에 정의해야 합니다. + + + +두 경우 모두 커뮤니티의 누군가가 호스팅하는 템플릿을 사용하려는 경우 프롬프트 템플릿 대신 저장소 ID를 전달할 수 있습니다. +기본 프롬프트는 [이 저장소](https://huggingface.co/datasets/huggingface-tools/default-prompts)를 예로 들 수 있습니다. + +Hub의 저장소에 사용자 정의 프롬프트를 업로드하여 커뮤니티와 공유하려면 다음을 확인하세요: +- 데이터 세트 저장소를 사용하세요. +- `run` 명령에 대한 프롬프트 템플릿을 `run_prompt_template.txt`라는 파일에 넣으세요. +- `chat` 명령에 대한 프롬프트 템플릿을 `chat_prompt_template.txt`라는 파일에 넣으세요. 
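+
+예를 들어, 아래는 위에서 설명한 프롬프트 템플릿 파일을 `huggingface_hub` 라이브러리로 데이터 세트 저장소에 업로드하는 간단한 스케치입니다. 저장소 이름 `your-username/my-agent-prompts`와 파일 경로는 설명을 위해 가정한 값입니다:
+
+```py
+from huggingface_hub import HfApi
+
+api = HfApi()
+
+# 데이터 세트 저장소를 만듭니다 (저장소 이름은 예시입니다).
+api.create_repo("your-username/my-agent-prompts", repo_type="dataset", exist_ok=True)
+
+# 로컬에 저장해 둔 프롬프트 템플릿 파일을 업로드합니다.
+for filename in ["run_prompt_template.txt", "chat_prompt_template.txt"]:
+    api.upload_file(
+        path_or_fileobj=filename,
+        path_in_repo=filename,
+        repo_id="your-username/my-agent-prompts",
+        repo_type="dataset",
+    )
+```
+
+이렇게 업로드한 뒤에는 앞서 설명한 것처럼 프롬프트 템플릿 문자열 대신 이 저장소 ID를 전달하면 됩니다.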
+ +## 사용자 정의 도구 사용하기[[using-custom-tools]] + +이 섹션에서는 이미지 생성에 특화된 두 가지 기존 사용자 정의 도구를 활용하겠습니다: + +- 더 많은 이미지 수정을 허용하기 위해 [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation)을 + [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)로 대체합니다. +- 기본 도구 상자에 이미지 업스케일링을 위한 새로운 도구가 추가되었습니다: + [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool)가 기존 이미지 변환 도구를 대체합니다. + +편리한 [`load_tool`] 함수를 사용하여 사용자 정의 도구를 가져오는 것으로 시작하겠습니다: + +```py +from transformers import load_tool + +controlnet_transformer = load_tool("diffusers/controlnet-canny-tool") +upscaler = load_tool("diffusers/latent-upscaler-tool") +``` + +에이전트에게 사용자 정의 도구를 추가하면 도구의 설명과 이름이 에이전트의 프롬프트에 자동으로 포함됩니다. +따라서 에이전트가 사용 방법을 이해할 수 있도록 사용자 정의 도구의 설명과 이름을 잘 작성해야 합니다. +`controlnet_transformer`의 설명과 이름을 살펴보겠습니다: + +```py +print(f"Description: '{controlnet_transformer.description}'") +print(f"Name: '{controlnet_transformer.name}'") +``` + +그러면 다음 결과가 출력됩니다: +```text +Description: 'This is a tool that transforms an image with ControlNet according to a prompt. +It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.' +Name: 'image_transformer' +``` + +이름과 설명이 정확하고 [큐레이팅 된 도구 세트(curated set of tools)](./transformers_agents#a-curated-set-of-tools)의 스타일에 맞습니다. +다음으로, `controlnet_transformer`와 `upscaler`로 에이전트를 인스턴스화해 봅시다: +```py +tools = [controlnet_transformer, upscaler] +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools) +``` + +이 명령을 실행하면 다음 정보가 표시됩니다: + +```text +image_transformer has been replaced by as provided in `additional_tools` +``` + +큐레이팅된 도구 세트에는 이미 'image_transformer' 도구가 있으며, 이 도구는 사용자 정의 도구로 대체됩니다. + + + +기존 도구와 똑같은 작업에 사용자 정의 도구를 사용하려는 경우 기존 도구를 덮어쓰는 것이 유용할 수 있습니다. +에이전트가 해당 작업에 능숙하기 때문입니다. +이 경우 사용자 정의 도구가 덮어쓴 도구와 정확히 동일한 API를 따라야 하며, 그렇지 않으면 해당 도구를 사용하는 모든 예제가 업데이트되도록 프롬프트 템플릿을 조정해야 한다는 점에 유의하세요. + + + +업스케일러 도구에 지정된 'image_upscaler'라는 이름 아직 기본 도구 상자에는 존재하지 않기 때문에, 도구 목록에 해당 이름이 간단히 추가되었습니다. +에이전트가 현재 사용할 수 있는 도구 상자는 언제든지 `agent.toolbox` 속성을 통해 확인할 수 있습니다: + +```py +print("\n".join([f"- {a}" for a in agent.toolbox.keys()])) +``` + +```text +- document_qa +- image_captioner +- image_qa +- image_segmenter +- transcriber +- summarizer +- text_classifier +- text_qa +- text_reader +- translator +- image_transformer +- text_downloader +- image_generator +- video_generator +- image_upscaler +``` + +에이전트의 도구 상자에 `image_upscaler`가 추가된 점을 주목하세요. + +이제 새로운 도구를 사용해봅시다! [Transformers Agents Quickstart](./transformers_agents#single-execution-run)에서 생성한 이미지를 다시 사용하겠습니다. + +```py +from diffusers.utils import load_image + +image = load_image( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" +) +``` + + + +이미지를 아름다운 겨울 풍경으로 바꿔 봅시다: + +```py +image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image) +``` + +```text +==Explanation from the agent== +I will use the following tool: `image_transformer` to transform the image. + + +==Code generated by the agent== +image = image_transformer(image, prompt="A frozen lake and snowy forest") +``` + + + +새로운 이미지 처리 도구는 이미지를 매우 강력하게 수정할 수 있는 ControlNet을 기반으로 합니다. +기본적으로 이미지 처리 도구는 512x512 픽셀 크기의 이미지를 반환합니다. 이를 업스케일링할 수 있는지 살펴봅시다. 
+
+```py
+image = agent.run("Upscale the image", image)
+```
+
+```text
+==Explanation from the agent==
+I will use the following tool: `image_upscaler` to upscale the image.
+
+
+==Code generated by the agent==
+upscaled_image = image_upscaler(image)
+```
+
+The agent automatically mapped the prompt "Upscale the image" to the upscaler tool we just added, based solely on the tool's description and name, and was able to run it correctly.
+
+Next, let's take a look at how you can create a new custom tool.
+
+### Adding new tools[[adding-new-tools]]
+
+In this section, we show how to create a new tool that can be added to the agent.
+
+#### Creating a new tool[[creating-a-new-tool]]
+
+We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face Hub with the most downloads for a given task.
+
+We can do that with the following code:
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+For the task `text-classification`, this returns `'facebook/bart-large-mnli'`; for `translation`, it returns `'t5-base'`.
+
+How do we convert this to a tool that the agent can leverage?
+All tools depend on the superclass `Tool`, which holds the main attributes they need. Let's create a class that inherits from it:
+
+```python
+from transformers import Tool
+
+
+class HFModelDownloadsTool(Tool):
+    pass
+```
+
+This class has a few requirements:
+- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools that have a performative name, we'll name it `model_download_counter`.
+- An attribute `description`, which is used to populate the agent's prompt.
+- `inputs` and `outputs` attributes. Defining these helps the Python interpreter make informed choices about types,
+  and allows a gradio demo to be spawned when the tool is pushed to the Hub.
+  Both are a list of expected values, which can be `text`, `image`, or `audio`.
+- A `__call__` method that contains the inference code. This is the code we played with above!
+
+Here's what the class looks like now:
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+
+class HFModelDownloadsTool(Tool):
+    name = "model_download_counter"
+    description = (
+        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+        "It takes the name of the category (such as text-classification, depth-estimation, etc), and "
+        "returns the name of the checkpoint."
+    )
+
+    inputs = ["text"]
+    outputs = ["text"]
+
+    def __call__(self, task: str):
+        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+        return model.id
+```
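+
+As a quick, illustrative sanity check, you can instantiate the class and call it directly before involving any agent; this simply exercises the `__call__` method defined above and should print the same checkpoint as the earlier snippet (`'facebook/bart-large-mnli'` at the time of writing):
+
+```python
+tool = HFModelDownloadsTool()
+
+# Calling the tool directly runs its __call__ method.
+print(tool("text-classification"))
+```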
+
+We now have our tool handy. Save it in a file and import it from your main script. Let's name this file `model_downloads.py`, so the resulting import code looks like this:
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your namespace. To do so, just call `push_to_hub` on the `tool` variable:
+
+```python
+tool.push_to_hub("hf-model-downloads")
+```
+
+You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it.
+
+#### Having the agent use the tool[[Having-the-agent-use-the-tool]]
+
+We now have our tool that lives on the Hub, which can be instantiated as such (change the user name for your tool):
+
+```python
+from transformers import load_tool
+
+tool = load_tool("lysandre/hf-model-downloads")
+```
+
+In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method:
+
+```python
+from transformers import HfAgent
+
+agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
+
+agent.run(
+    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+which outputs the following:
+```text
+==Code generated by the agent==
+model = model_download_counter(task="text-to-video")
+print(f"The model with the most downloads is {model}.")
+audio_model = text_reader(model)
+
+
+==Result==
+The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
+```
+
+and generates the following audio.
+
+| **Audio** |
+|------------------------------------------------------------------------------------------------------------------------------------------------------|
+|