Adding HF CI (deepchem#4173)
* Added HF workflow

* Moved prot bert tests to HF marker

* Resolved??

* Moved all HF tests to new CI
Shiva-sankaran authored Nov 12, 2024
1 parent be83685 commit f44e45b
Showing 7 changed files with 172 additions and 33 deletions.
139 changes: 139 additions & 0 deletions .github/workflows/hf_setup.yml
@@ -0,0 +1,139 @@
name: Test for DeepChem HuggingFace
on:
  push: # run CI on pushes to the master branch
    branches:
      - master
  pull_request: # run CI on pull requests targeting the master branch
    branches:
      - master
jobs:
  HF-build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ['3.8', '3.10', '3.11']
        include:
          - os: windows-latest
            python-version: '3.9'
    steps:
      - uses: actions/checkout@v4
      - name: Cache pip modules for Linux
        if: runner.os == 'Linux'
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('requirements/torch/**') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Build DeepChem
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[torch]'
      - name: Import checking
        run: python -c "import deepchem; import torch;"

  HF-tests:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ['3.8', '3.10', '3.11']
        include:
          - os: windows-latest
            python-version: '3.9'
    env:
      OS: ${{ matrix.os }}
      PYTHON_VERSION: ${{ matrix.python-version }}
    steps:
      - name: Maximize build space
        if: matrix.os == 'ubuntu-latest'
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/local/lib/android
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # https://github.com/galaxyproject/tools-iuc/blob/master/.github/workflows/pr.yaml
      # The range of commits to check for changes is:
      # - for events on the master branch, compare against the sha before the event
      #   (this does not work for feature-branch events, since we want all commits
      #   on the feature branch, not just the commits of the last event)
      # - for pull requests, compare against the first ancestor, given that the
      #   current HEAD is the merge of the PR branch and the base branch
      - name: Set commit range (push to the master branch, e.g. merge)
        if: github.ref == 'refs/heads/master' && github.event_name == 'push'
        run: echo "COMMIT_RANGE=${{ github.event.before }}.." >> $GITHUB_ENV
      - name: Set commit range (pull request)
        if: github.event_name == 'pull_request'
        run: |
          git fetch origin master
          echo "COMMIT_RANGE=origin/master..." >> $GITHUB_ENV
      - name: Cache pip packages for Linux
        if: runner.os == 'Linux'
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('requirements/torch/**') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Cache pip packages for macOS
        if: runner.os == 'macOS'
        uses: actions/cache@v4
        with:
          path: ~/Library/Caches/pip
          key: ${{ matrix.os }}-pip-${{ hashFiles('requirements/torch/**') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Cache pip packages for Windows
        if: runner.os == 'Windows'
        uses: actions/cache@v4
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{ matrix.os }}-pip-${{ hashFiles('requirements/torch/**') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Create env.yml for Python 3.9+
        shell: bash
        if: ${{ matrix.python-version != '3.8' }}
        run: |
          python -m pip install --upgrade pip
          pip install conda-merge
          conda-merge requirements/env_common.yml requirements/torch/env_torch.cpu.yml requirements/env_test.yml > env.yml
      - name: Create env.yml for Python 3.8
        shell: bash
        # A special case of environment creation for Python 3.8, which includes an older version of matminer
        if: ${{ matrix.python-version == '3.8' }}
        run: |
          python -m pip install --upgrade pip
          pip install conda-merge
          conda-merge requirements/env_common_3_8.yml requirements/torch/env_torch.cpu.yml requirements/env_test.yml > env.yml
      - name: Install all dependencies
        uses: conda-incubator/setup-miniconda@v3
        with:
          miniconda-version: "latest"
          auto-update-conda: true
          activate-environment: deepchem
          channels: conda-forge,defaults
          python-version: ${{ matrix.python-version }}
          environment-file: env.yml
      - name: Install DeepChem
        id: install
        shell: bash -l {0}
        run: pip install -e .
      - name: PyTest
        # Run even if an earlier step failed, as long as the install step actually ran
        if: ${{ (success() || failure()) && (steps.install.outcome == 'failure' || steps.install.outcome == 'success') }}
        shell: bash -l {0}
        run: pytest -v -m 'hf' deepchem
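
The workflow's final step selects tests by marker (pytest -v -m 'hf' deepchem). For that expression to run without a PytestUnknownMarkWarning, the hf marker has to be registered with pytest; the registration presumably lives in one of the changed files not shown in this view. The snippet below is only a minimal sketch of such a registration, assuming a conftest.py hook rather than whatever file the commit actually touches.

# conftest.py -- minimal sketch only; the commit's actual marker
# registration is in a changed file not shown in this view.
def pytest_configure(config):
    # Append to pytest's 'markers' ini option so that `pytest -m hf`
    # selects HuggingFace-backed tests without unknown-marker warnings.
    config.addinivalue_line(
        "markers",
        "hf: tests that exercise HuggingFace-backed DeepChem models")

With the marker registered, the CI selection can be reproduced locally by running the same command as the workflow's PyTest step.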
14 changes: 7 additions & 7 deletions deepchem/models/torch_models/tests/test_antibody_modeling.py
@@ -15,7 +15,7 @@ def igbert_tokenizer():
return tokenizer


-@pytest.mark.torch
+@pytest.mark.hf
def test_init(igbert_tokenizer):
from deepchem.models.torch_models.antibody_modeling import DeepAbLLM
from deepchem.models.torch_models.hf_model import HuggingFaceModel
@@ -25,7 +25,7 @@ def test_init(igbert_tokenizer):
assert anti_model.n_tasks == 1


-@pytest.mark.torch
+@pytest.mark.hf
def test_load_from_pretrained(tmpdir):
pretrain_model_dir = os.path.join(tmpdir, 'pretrain')
finetune_model_dir = os.path.join(tmpdir, 'finetune')
@@ -57,7 +57,7 @@ def test_load_from_pretrained(tmpdir):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_initialize_new_config():
model_path = 'Rostlab/prot_bert'
config = {"num_attention_heads": 8, "num_hidden_layers": 6}
@@ -72,7 +72,7 @@ def test_initialize_new_config():
assert model.model.config['num_hidden_layers'] == 6


-@pytest.mark.torch
+@pytest.mark.hf
def test_save_reload(tmpdir):
model_path = 'Exscientia/IgBert'
anti_model = DeepAbLLM(task='mlm',
@@ -99,7 +99,7 @@ def test_save_reload(tmpdir):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_mask_seq_pos(igbert_tokenizer):
from deepchem.models.torch_models.antibody_modeling import DeepAbLLM
anti_model = DeepAbLLM(model_path='facebook/esm2_t6_8M_UR50D',
@@ -114,7 +114,7 @@ def test_mask_seq_pos(igbert_tokenizer):
assert masked_test_string.split(' ')[10] == anti_model.tokenizer.mask_token


-@pytest.mark.torch
+@pytest.mark.hf
def test_redesign_residue():
from Levenshtein import distance
from deepchem.models.torch_models.antibody_modeling import DeepAbLLM
@@ -139,7 +139,7 @@ def test_redesign_residue():
assert abs(item[2]) <= 1


-@pytest.mark.torch
+@pytest.mark.hf
def test_optimize_sequence():
from Levenshtein import distance
from deepchem.models.torch_models.antibody_modeling import DeepAbLLM
10 changes: 5 additions & 5 deletions deepchem/models/torch_models/tests/test_chemberta.py
@@ -10,7 +10,7 @@
pass


-@pytest.mark.torch
+@pytest.mark.hf
def test_chemberta_pretraining(smiles_regression_dataset,
smiles_multitask_regression_dataset):
# Pretraining in MLM mode
@@ -28,7 +28,7 @@ def test_chemberta_pretraining(smiles_regression_dataset,
assert loss


-@pytest.mark.torch
+@pytest.mark.hf
def test_chemberta_finetuning(smiles_regression_dataset,
smiles_multitask_regression_dataset):
# test regression
@@ -69,7 +69,7 @@ def test_chemberta_finetuning(smiles_regression_dataset,
assert prediction.shape == (dataset.y.shape[0], 2)


-@pytest.mark.torch
+@pytest.mark.hf
def test_chemberta_load_from_pretrained(tmpdir, smiles_regression_dataset):
pretrain_model_dir = os.path.join(tmpdir, 'pretrain')
finetune_model_dir = os.path.join(tmpdir, 'finetune')
@@ -100,7 +100,7 @@ def test_chemberta_load_from_pretrained(tmpdir, smiles_regression_dataset):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_chemberta_save_reload(tmpdir):
tokenizer_path = 'seyonec/PubChem10M_SMILES_BPE_60k'
model = Chemberta(task='regression',
@@ -125,7 +125,7 @@ def test_chemberta_save_reload(tmpdir):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_chemberta_load_weights_from_hf_hub():
pretrained_model_path = 'DeepChem/ChemBERTa-77M-MLM'
tokenizer_path = 'DeepChem/ChemBERTa-77M-MLM'
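
The first hunk of this file (the bare pass under @@ -10,7 +10,7 @@) is the tail of a module-level try/except that guards the optional transformers-backed import, per the convention in these test files. A hedged alternative for new hf-marked test modules, not something this commit does, is pytest.importorskip, which skips the whole module at collection time when the dependency is missing rather than leaving the imported name undefined:

# Sketch only: skip an hf test module cleanly when the optional
# HuggingFace dependency is absent (an alternative to a bare try/except).
import pytest

transformers = pytest.importorskip("transformers")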
18 changes: 9 additions & 9 deletions deepchem/models/torch_models/tests/test_hf_models.py
@@ -38,7 +38,7 @@ def hf_tokenizer(tmpdir):
return tokenizer


-@pytest.mark.torch
+@pytest.mark.hf
def test_pretraining(hf_tokenizer, smiles_regression_dataset):
from deepchem.models.torch_models.hf_models import HuggingFaceModel
from transformers.models.roberta import RobertaConfig, RobertaForMaskedLM
@@ -55,7 +55,7 @@ def test_pretraining(hf_tokenizer, smiles_regression_dataset):
assert loss


-@pytest.mark.torch
+@pytest.mark.hf
def test_hf_model_regression(hf_tokenizer, smiles_regression_dataset):
from transformers.models.roberta import (RobertaConfig,
RobertaForSequenceClassification)
@@ -77,7 +77,7 @@ def test_hf_model_regression(hf_tokenizer, smiles_regression_dataset):
assert score


-@pytest.mark.torch
+@pytest.mark.hf
def test_hf_model_classification(hf_tokenizer, smiles_regression_dataset):
y = np.random.choice([0, 1], size=smiles_regression_dataset.y.shape)
dataset = dc.data.NumpyDataset(X=smiles_regression_dataset.X,
@@ -102,7 +102,7 @@ def test_hf_model_classification(hf_tokenizer, smiles_regression_dataset):
assert score


-@pytest.mark.torch
+@pytest.mark.hf
def test_load_from_pretrained(tmpdir, hf_tokenizer):
# Create pretrained model
from transformers.models.roberta import (RobertaConfig, RobertaForMaskedLM,
@@ -147,7 +147,7 @@ def test_load_from_pretrained(tmpdir, hf_tokenizer):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_model_save_reload(tmpdir, hf_tokenizer):
from transformers.models.roberta import (RobertaConfig,
RobertaForSequenceClassification)
@@ -181,7 +181,7 @@ def test_model_save_reload(tmpdir, hf_tokenizer):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_load_from_hf_checkpoint():
from transformers.models.t5 import T5Config, T5Model
config = T5Config()
@@ -203,7 +203,7 @@ def test_load_from_hf_checkpoint():
assert all(not_matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_fill_mask_IO(tmpdir, hf_tokenizer):
from transformers import (RobertaConfig, RobertaForMaskedLM)

@@ -232,7 +232,7 @@ def test_fill_mask_IO(tmpdir, hf_tokenizer):
assert isinstance(results[0][0], dict)


-@pytest.mark.torch
+@pytest.mark.hf
def test_fill_mask_fidelity(tmpdir, hf_tokenizer):
from transformers import (RobertaConfig, RobertaForMaskedLM)

@@ -268,7 +268,7 @@ def test_fill_mask_fidelity(tmpdir, hf_tokenizer):
assert filled['sequence'].startswith(f'<s>{filled["token_str"]}')


-@pytest.mark.torch
+@pytest.mark.hf
def test_load_from_pretrained_with_diff_task(tmpdir):
# Tests loading a pretrained model where the weight shape in last layer
# (the final projection layer) of the pretrained model does not match
8 changes: 4 additions & 4 deletions deepchem/models/torch_models/tests/test_molformer.py
@@ -10,7 +10,7 @@
pass


-@pytest.mark.torch
+@pytest.mark.hf
def test_molformer_pretraining(smiles_regression_dataset,
smiles_multitask_regression_dataset):
# Pretraining in MLM mode
@@ -26,7 +26,7 @@ def test_molformer_pretraining(smiles_regression_dataset,
assert loss


-@pytest.mark.torch
+@pytest.mark.hf
def test_molformer_finetuning(smiles_regression_dataset,
smiles_multitask_regression_dataset):

@@ -70,7 +70,7 @@ def test_molformer_finetuning(smiles_regression_dataset,
assert prediction.shape == (dataset.y.shape[0], 2)


-@pytest.mark.torch
+@pytest.mark.hf
def test_molformer_load_from_pretrained(tmpdir, smiles_regression_dataset):
pretrain_model_dir = os.path.join(tmpdir, 'pretrain')
finetune_model_dir = os.path.join(tmpdir, 'finetune')
@@ -95,7 +95,7 @@ def test_molformer_load_from_pretrained(tmpdir, smiles_regression_dataset):
assert all(matches)


-@pytest.mark.torch
+@pytest.mark.hf
def test_molformer_save_reload(tmpdir):
model = MoLFormer(task='regression', model_dir=tmpdir)
model._ensure_built()
(The remaining two changed files are not shown in this view.)
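
Across the four test files shown, the change is mechanical: every HuggingFace-backed test swaps @pytest.mark.torch for @pytest.mark.hf, moving it out of the torch CI lane and into the new HF lane. A new test would opt in the same way. The sketch below is illustrative only: the constructor and fit calls mirror the MoLFormer usage visible above, but the test itself is not part of this commit.

# Sketch: how a new HuggingFace-backed test would opt into the HF lane.
import pytest


@pytest.mark.hf
def test_molformer_fit_smoke(smiles_regression_dataset):
    # Construction and training mirror the MoLFormer calls shown above;
    # a single epoch keeps CI runtime low.
    from deepchem.models.torch_models import MoLFormer
    model = MoLFormer(task='regression')
    loss = model.fit(smiles_regression_dataset, nb_epoch=1)
    assert loss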
