truera · epinzur · Jun 6, 2024 · Jun 6, 2024 · Jun 6, 2024 · Jun 6, 2024
diff --git a/.github/workflows/ci-eval-pr.yml b/.github/workflows/ci-eval-pr.yml
@@ -0,0 +1,99 @@
+name: CI
+# Pull-request gate: runs the trulens_eval unit tests for PRs that target main.
+on:
+  pull_request:
+    branches:
+      - main
+    paths:  # only trigger when the package or this workflow file changes
+      - trulens_eval/**/*
+      - .github/workflows/ci-eval-pr.yml
+
+jobs:
+  PRBranchProtect:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:  # full tests/unit on 3.12; only tests/unit/static on 3.9-3.11
+          - python-version: "3.12"
+            tests-folder: tests/unit
+          - python-version: "3.9"
+            tests-folder: tests/unit/static
+          - python-version: "3.10"
+            tests-folder: tests/unit/static
+          - python-version: "3.11"
+            tests-folder: tests/unit/static
+    timeout-minutes: 30
+
+    steps:
+      - uses: actions/checkout@v4  # v2 targets the retired Node 12 action runtime
+        with:
+          fetch-depth: 0  # fetch full history rather than a shallow clone
+          clean: true
+
+      - name: Setup Conda
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+          activate-environment: myenv
+      # Disabled formatter check. NOTE: as written it only reports changed files; it never fails the job.
+      # - name: Install and run formatters
+      #   if: ${{ matrix.python-version == '3.11' }}
+      #   run: |
+      #     conda activate myenv
+      #     pip install yapf==0.32.0 isort==5.10.1 --verbose
+      #     ./format.sh --eval
+      #     num_changed_files=`git ls-files --others -m --exclude-standard ./trulens_eval | wc -l`
+      #     if [ $num_changed_files -ne 0 ]; then
+      #       echo "The following files have changed after running format.sh. Please format your code and update the PR."
+      #       git ls-files --others -m --exclude-standard ./trulens_eval
+      #       git diff
+      #     fi
+      #   shell: bash -el {0}
+
+      - name: Install trulens
+        run: |
+          conda activate myenv
+          cd ./trulens_eval
+          pip install -e . --verbose
+        shell: bash -el {0}  # login shell (-l) so the conda setup from the profile is loaded
+
+      - name: Install testing packages  # pytest-azurepipelines provides --test-run-title used below
+        run: |
+          conda activate myenv
+          pip install pytest==7.0.1 pytest-subtests pytest-azurepipelines --verbose
+        shell: bash -el {0}
+
+      - name: Describe python env  # diagnostic output only
+        run: |
+          conda activate myenv
+          python --version
+          pip --version
+          pip list --verbose
+        shell: bash -el {0}
+
+      - name: Unit tests with required packages
+        run: |
+          conda activate myenv
+          cd ./trulens_eval
+          python -m pytest --test-run-title="Required ${{ matrix.python-version }} unit tests" ${{ matrix.tests-folder }}
+        shell: bash -el {0}
+        env:  # API keys are read from repository secrets
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
+          PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
+          PINECONE_ENV: "us-west1-gcp-free"
+
+    #   - name: Install optional packages and run tests
+    #     run: |
+    #       conda activate myenv
+    #       cd ./trulens_eval
+    #       pip install --verbose -r trulens_eval/requirements.optional.txt
+    #       python -m pytest --test-run-title="Optional ${{ matrix.python-version }} unit tests" ${{ matrix.tests-folder }}
+    #     shell: bash -el {0}
+    #     env:
+    #       TEST_OPTIONAL: true
+    #       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+    #       HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
+    #       PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
+    #       PINECONE_ENV: ${{ secrets.PINECONE_ENV }}
+    #       HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
diff --git a/trulens_eval/trulens_eval/feedback/provider/base.py b/trulens_eval/trulens_eval/feedback/provider/base.py
@@ -5,7 +5,6 @@
 import nltk
 from nltk.tokenize import sent_tokenize
 import numpy as np
-from tqdm.auto import tqdm
 
 from trulens_eval.feedback import prompts
 from trulens_eval.feedback.provider.endpoint import base as mod_endpoint
@@ -1198,14 +1197,13 @@ def groundedness_measure_with_cot_reasons(
         Returns:
             Tuple[float, str]: A tuple containing a value between 0.0 (not grounded) and 1.0 (grounded) and a string containing the reasons for the evaluation.
         """
-        nltk.download('punkt')
+        nltk.download('punkt', quiet=True)
         groundedness_scores = {}
         reasons_str = ""
 
         hypotheses = sent_tokenize(statement)
         system_prompt = prompts.LLM_GROUNDEDNESS_SYSTEM
-        for i, hypothesis in enumerate(tqdm(
-                hypotheses, desc="Groundedness per statement in source")):
+        for i, hypothesis in enumerate(hypotheses):
             user_prompt = prompts.LLM_GROUNDEDNESS_USER.format(
                 premise=f"{source}", hypothesis=f"{hypothesis}"
             )

diff --git a/trulens_eval/trulens_eval/feedback/provider/hugs.py b/trulens_eval/trulens_eval/feedback/provider/hugs.py
@@ -218,7 +218,7 @@ def groundedness_measure_with_nli(self, source: str,
         Returns:
             Tuple[float, str]: A tuple containing a value between 0.0 (not grounded) and 1.0 (grounded) and a string containing the reasons for the evaluation.
         """
-        nltk.download('punkt')
+        nltk.download('punkt', quiet=True)
         groundedness_scores = {}
 
         reasons_str = ""

diff --git a/trulens_eval/trulens_eval/tru.py b/trulens_eval/trulens_eval/tru.py
@@ -749,6 +749,8 @@ def start_evaluator(self,
             fork: If set, will start the evaluator in a new process instead of a
                 thread. NOT CURRENTLY SUPPORTED.
 
+            disable_tqdm: If set, will disable progress bar logging from the evaluator.
+
         Returns:
             The started process or thread that is executing the deferred feedback
                 evaluator.
@@ -816,14 +818,15 @@ def runloop():
                 total=queue_total,
                 postfix={
                     status.name: count for status, count in queue_stats.items()
-                }
+                },
+                disable=disable_tqdm
             )
 
             # Show the status of the results so far.
-            tqdm_total = tqdm(desc="Done Runs", initial=0, unit="runs")
+            tqdm_total = tqdm(desc="Done Runs", initial=0, unit="runs", disable=disable_tqdm)
 
             # Show what is being waited for right now.
-            tqdm_waiting = tqdm(desc="Waiting for Runs", initial=0, unit="runs")
+            tqdm_waiting = tqdm(desc="Waiting for Runs", initial=0, unit="runs", disable=disable_tqdm)
 
             runs_stats = defaultdict(int)