Remove more torchtext references

ludwig-ai · Nov 25, 2024 · b2a1454 · b2a1454
1 parent 01533ef
commit b2a1454
Show file tree

Hide file tree

Showing 8 changed files with 47 additions and 148 deletions.
diff --git a/.github/workflows/pytest_slow.yml b/.github/workflows/pytest_slow.yml
@@ -50,7 +50,7 @@ jobs:
           python --version
           pip --version
           python -m pip install -U pip
-          pip install torch==2.1.0 torchtext torchvision torchaudio
+          pip install torch==2.1.0 torchvision torchaudio
           pip install ray==2.3.1
           pip install '.[test]'
 

diff --git a/docker/ludwig-ray-gpu/Dockerfile b/docker/ludwig-ray-gpu/Dockerfile
@@ -50,7 +50,7 @@ RUN pip install -U pip
 
 WORKDIR /ludwig
 
-RUN pip install --no-cache-dir torch==2.1.0 torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN pip install --no-cache-dir torch==2.1.0  torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 
 COPY . .
 RUN pip install --no-cache-dir '.[full]' --extra-index-url https://download.pytorch.org/whl/cu118
diff --git a/docker/ludwig-ray/Dockerfile b/docker/ludwig-ray/Dockerfile
@@ -36,7 +36,7 @@ RUN pip install -U pip
 
 WORKDIR /ludwig
 
-RUN pip install --no-cache-dir torch==2.1.0 torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir torch==2.1.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
 
 COPY . .
 RUN pip install --no-cache-dir '.[full]' --extra-index-url https://download.pytorch.org/whl/cpu
diff --git a/docker/ludwig/Dockerfile b/docker/ludwig/Dockerfile
@@ -24,7 +24,7 @@ RUN pip install -U pip
 
 WORKDIR /ludwig
 
-RUN pip install --no-cache-dir torch==2.0.0 torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir torch==2.0.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
 
 COPY . .
 RUN pip install --no-cache-dir '.[full]'

diff --git a/ludwig/decoders/llm_decoders.py b/ludwig/decoders/llm_decoders.py
@@ -1,3 +1,4 @@
+# flake8: noqa: E501
 import logging
 import re
 from typing import Any, Dict, List, Union
@@ -91,7 +92,6 @@ def __init__(
             # Transformer Tokenizers
             self.tokenizer_vocab_size = self.tokenizer.tokenizer.vocab_size
         else:
-            # TorchText Tokenizers
             self.tokenizer_vocab_size = len(self.tokenizer.vocab)
 
         # Maximum number of new tokens that will be generated

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ keywords = [
     "processing",
     "vision",
 ]
+
 dependencies = [
     "absl-py",
     "bitsandbytes<0.41.0",
@@ -69,7 +70,6 @@ dependencies = [
     "torchaudio==2.4.1",
     "torchinfo",
     "torchmetrics>=0.11.0",
-    #"torchtext==0.17.2",
     "torchvision==0.19.1",
     "tqdm",
     "transformers>=4.42.3",
@@ -78,48 +78,42 @@ dependencies = [
     "xlsxwriter>=1.4.3",
     "xlwt",
     "tifffile==2024.9.20",
+    "onnx",
 ]
 
-# Optional Dependencies
 [project.optional-dependencies]
-full = [
-    "accelerate",
-    "awscli",
-    "captum",
-    "cartonml-nightly",
-    "dask[dataframe]<2023.4.0",
-    "deepspeed!=0.11.0,<0.13.0",
-    "faiss-cpu",
-    "fastapi",
-    "getdaft[ray]==0.1.20",
-    "GPUtil",
-    "hiplot",
-    "httpx",
-    "hummingbird-ml>=0.4.8",
-    "hyperopt",
-    "lightgbm",
-    "lightgbm-ray",
-    "loralib",
-    "matplotlib>3.4,<3.9.0; python_version > '3.6'",
-    "matplotlib>=3.0,<3.4; python_version <= '3.6'",
-    "neuropod==0.3.0rc6 ; platform_system != \"Windows\" and python_version < '3.9'",
-    "peft>=0.10.0",
-    "ptitprince",
-    "pyarrow",
-    "python-multipart",
-    "ray[default,data,serve,tune]==2.3.1",
-    "ray[default,tune]>=2.0.0",
-    "s3fs",
-    "seaborn>=0.7,<0.12",
-    "sentence-transformers",
-    "tblib",
-    "tensorboardX<2.3",
-    "uvicorn",
-]
-
 dev = ["flake8", "flake8-pyproject", "pre-commit", "setuptools"]
+test = [
+    # Core testing
+    "pytest",
+    "pytest-timeout",
+    "pytest-cov",
+    "tifffile",
+    "wget",
+    "six>=1.13.0",
 
-test = ["pytest", "pytest-timeout", "wget", "six>=1.13.0", "cloudpickle"]
+    # Logging and experiment tracking
+    "aim",
+    "wandb<0.12.11",
+    "comet_ml",
+    "mlflow",
+    "sqlalchemy<2",  # Pinned for aimstack compatibility
+
+    # Ray Tune Search Algorithms
+    "hpbandster",            # BOHB algorithm
+    "ConfigSpace==0.7.1",
+    "ax-platform",           # AX algorithm
+    "bayesian-optimization", # Bayesian optimization
+    "flaml[blendsearch]",    # CFO and blendsearch
+    "HEBO",                  # HEBO algorithm
+    "nevergrad",             # Nevergrad algorithm
+    "optuna",                # Optuna algorithm
+    "scikit-optimize",       # SKopt algorithm
+    "zoopt",                 # ZOOpt algorithm
+
+    # Storage
+    "s3fs>=2022.8.2",
+]
 benchmarking = ["s3fs"]
 distributed = [
     "awscli",
@@ -153,6 +147,7 @@ serve = [
     "neuropod==0.3.0rc6 ; platform_system != \"Windows\" and python_version < '3.9'",
     "python-multipart",
     "uvicorn",
+    "starlette",
 ]
 tree = ["hummingbird-ml>=0.4.8", "lightgbm", "lightgbm-ray"]
 viz = [
@@ -171,69 +166,6 @@ Website = "https://ludwig.ai/latest/"
 [project.scripts]
 ludwig = "ludwig.cli:main"
 
-[tool.hatch.envs.hatch-test]
-dependencies = [
-    "gpy >=1.10.0",
-    "accelerate",
-    "aim",
-    "awscli",
-    "ax-platform",
-    "bayesian-optimization",
-    "captum",
-    "cartonml-nightly",
-    "comet_ml",
-    "ConfigSpace==0.7.1",
-    "dask[dataframe]<2023.4.0",
-    "deepspeed!=0.11.0,<0.13.0",
-    "faiss-cpu",
-    "fastapi",
-    "flaml[blendsearch]",
-    "getdaft[ray]==0.1.20",
-    "GPUtil",
-    "HEBO",
-    "hiplot",
-    "hpbandster",
-    "httpx",
-    "hummingbird-ml>=0.4.8",
-    "hyperopt",
-    "lightgbm",
-    "lightgbm-ray",
-    "loralib",
-    "matplotlib>3.4,<3.9.0; python_version > '3.6'",
-    "matplotlib>=3.0,<3.4; python_version <= '3.6'",
-    "mlflow",
-    "neuropod==0.3.0rc6 ; platform_system != \"Windows\" and python_version < '3.9'",
-    "nevergrad",
-    "optuna",
-    "peft>=0.10.0",
-    "ptitprince",
-    "pyarrow",
-    "pytest",
-    "pytest-timeout",
-    "pytest-cov",
-    "python-multipart",
-    #"ray[default,data,serve,tune]==2.3.1",
-    #"ray[default,tune]>=2.0.0",
-    "s3fs",
-    "s3fs>=2022.8.2",
-    "scikit-optimize",
-    "seaborn>=0.7,<0.12",
-    "sentence-transformers",
-    "six>=1.13.0",
-    "sqlalchemy<2",
-    "tblib",
-    "tensorboardX<2.3",
-    "uvicorn",
-    "wandb",
-    #"wandb<0.12.11",
-    "wget",
-    "zoopt",
-]
-
-#[tool.hatch.envs.hatch-test]
-#setup = "pip install -e .[test]"
-#run = "pytest {args:test}"
-
 [tool.hatch.version]
 path = "ludwig/__about__.py"
 
@@ -270,11 +202,11 @@ dependencies = ["flake8", "flake8-pyproject"]
 style = "flake8 ."
 
 [tool.hatch.envs.default]
-python = "3.12"
+python = "3.11"
 dependencies = ["setuptools>=65.0"]
 
 [tool.hatch.envs.dev]
-python = "3.12"
+python = "3.11"
 dependencies = [".[dev]"]
 
 

diff --git a/tests/integration_tests/test_torchscript.py b/tests/integration_tests/test_torchscript.py
@@ -1,3 +1,4 @@
+# flake8: noqa: E501
 # Copyright (c) 2023 Predibase, Inc., 2019 Uber Technologies, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,7 +22,6 @@
 import pandas as pd
 import pytest
 import torch
-import torchtext
 
 from ludwig.api import LudwigModel
 from ludwig.backend import RAY
@@ -408,32 +408,7 @@ def test_torchscript_e2e_text(tmpdir, csv_filename):
     validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path)
 
 
-@pytest.mark.skipif(
-    torch.torch_version.TorchVersion(torchtext.__version__) < (0, 14, 0),
-    reason="requires torchtext 0.14.0 or higher",
-)
-@pytest.mark.integration_tests_e
-def test_torchscript_e2e_text_hf_tokenizer(tmpdir, csv_filename):
-    data_csv_path = os.path.join(tmpdir, csv_filename)
-    input_features = [text_feature(encoder={"vocab_size": 3, "type": "bert"})]
-    output_features = [
-        category_feature(),
-    ]
-    backend = LocalTestBackend()
-    config = {
-        "input_features": input_features,
-        "output_features": output_features,
-        TRAINER: {"epochs": 2, BATCH_SIZE: 128, EVAL_BATCH_SIZE: 128},
-    }
-    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
-
-    validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path)
-
-
-@pytest.mark.skipif(
-    torch.torch_version.TorchVersion(torchtext.__version__) < (0, 14, 0),
-    reason="requires torchtext 0.14.0 or higher",
-)
+@pytest.mark.skip()
 @pytest.mark.integration_tests_e
 def test_torchscript_e2e_text_hf_tokenizer_truncated_sequence(tmpdir, csv_filename):
     data_csv_path = os.path.join(tmpdir, csv_filename)

diff --git a/tests/ludwig/features/test_sequence_features.py b/tests/ludwig/features/test_sequence_features.py
@@ -1,9 +1,9 @@
+# flake8: noqa: E501
 from typing import List, Tuple
 
 import numpy as np
 import pytest
 import torch
-import torchtext
 
 from ludwig.constants import ENCODER_OUTPUT, LAST_HIDDEN, LOGITS, SEQUENCE, TEXT, TYPE
 from ludwig.features.sequence_feature import _SequencePreprocessing, SequenceInputFeature, SequenceOutputFeature
@@ -192,9 +192,7 @@ def test_text_preproc_module_space_punct_tokenizer():
     )
 
 
-@pytest.mark.skipif(
-    torch.torch_version.TorchVersion(torchtext.__version__) < (0, 12, 0), reason="requires torchtext 0.12.0 or higher"
-)
+@pytest.mark.skip()
 def test_sequence_preproc_module_sentencepiece_tokenizer():
     metadata = {
         "preprocessing": {
@@ -227,9 +225,7 @@ def test_sequence_preproc_module_sentencepiece_tokenizer():
     )
 
 
-@pytest.mark.skipif(
-    torch.torch_version.TorchVersion(torchtext.__version__) < (0, 12, 0), reason="requires torchtext 0.12.0 or higher"
-)
+@pytest.mark.skip()
 def test_sequence_preproc_module_clip_tokenizer():
     metadata = {
         "preprocessing": {
@@ -260,9 +256,7 @@ def test_sequence_preproc_module_clip_tokenizer():
     )
 
 
-@pytest.mark.skipif(
-    torch.torch_version.TorchVersion(torchtext.__version__) < (0, 12, 0), reason="requires torchtext 0.12.0 or higher"
-)
+@pytest.mark.skip()
 def test_sequence_preproc_module_gpt2bpe_tokenizer():
     metadata = {
         "preprocessing": {
@@ -296,9 +290,7 @@ def test_sequence_preproc_module_gpt2bpe_tokenizer():
     )
 
 
-@pytest.mark.skipif(
-    torch.torch_version.TorchVersion(torchtext.__version__) < (0, 13, 0), reason="requires torchtext 0.13.0 or higher"
-)
+@pytest.mark.skip()
 def test_sequence_preproc_module_bert_tokenizer():
     metadata = {
         "preprocessing": {