diff --git a/.github/workflows/ci-eval-pr.yml b/.github/workflows/ci-eval-pr.yml new file mode 100644 index 000000000..7b908f996 --- /dev/null +++ b/.github/workflows/ci-eval-pr.yml @@ -0,0 +1,99 @@ +name: CI + +on: + pull_request: + branches: + - main + paths: + - trulens_eval/**/* + - .github/workflows/ci-eval-pr.yml + +jobs: + PRBranchProtect: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - python-version: "3.12" + tests-folder: tests/unit + - python-version: "3.9" + tests-folder: tests/unit/static + - python-version: "3.10" + tests-folder: tests/unit/static + - python-version: "3.11" + tests-folder: tests/unit/static + timeout-minutes: 30 + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + clean: true + + - name: Setup Conda + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: ${{ matrix.python-version }} + activate-environment: myenv + + # - name: Install and run formatters + # if: ${{ matrix.python-version }} == '3.11' + # run: | + # conda activate myenv + # pip install yapf==0.32.0 isort==5.10.1 --verbose + # ./format.sh --eval + # num_changed_files=`git ls-files --others -m --exclude-standard ./trulens_eval | wc -l` + # if [ $num_changed_files -ne 0 ]; then + # echo "The following files have changed after running format.sh. Please format your code and update the PR." + # git ls-files --others -m --exclude-standard ./trulens_eval + # git diff + # fi + # shell: bash -el {0} + + - name: Install trulens + run: | + conda activate myenv + cd ./trulens_eval + pip install -e . --verbose + shell: bash -el {0} + + - name: Install testing packages + run: | + conda activate myenv + pip install pytest==7.0.1 pytest-subtests pytest-azurepipelines --verbose + shell: bash -el {0} + + - name: Describe python env + run: | + conda activate myenv + python --version + pip --version + pip list --verbose + shell: bash -el {0} + + - name: Unit tests with required packages + run: | + conda activate myenv + cd ./trulens_eval + python -m pytest --test-run-title="Required ${{ matrix.python-version }} unit tests" ${{ matrix.tests-folder }} + shell: bash -el {0} + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }} + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} + PINECONE_ENV: "us-west1-gcp-free" + + # - name: Install optional packages and run tests + # run: | + # conda activate myenv + # cd ./trulens_eval + # pip install --verbose -r trulens_eval/requirements.optional.txt + # python -m pytest --test-run-title="Optional ${{ matrix.python-version }} unit tests" ${{ matrix.tests-folder }} + # shell: bash -el {0} + # env: + # TEST_OPTIONAL: true + # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }} + # PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} + # PINECONE_ENV: ${{ secrets.PINECONE_ENV }} + # HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }} diff --git a/trulens_eval/trulens_eval/feedback/provider/base.py b/trulens_eval/trulens_eval/feedback/provider/base.py index 2ca01fe73..0fa63a25a 100644 --- a/trulens_eval/trulens_eval/feedback/provider/base.py +++ b/trulens_eval/trulens_eval/feedback/provider/base.py @@ -5,7 +5,6 @@ import nltk from nltk.tokenize import sent_tokenize import numpy as np -from tqdm.auto import tqdm from trulens_eval.feedback import prompts from trulens_eval.feedback.provider.endpoint import base as mod_endpoint @@ -1198,14 +1197,13 @@ def groundedness_measure_with_cot_reasons( Returns: Tuple[float, str]: A tuple containing a value between 0.0 (not grounded) and 1.0 (grounded) and a string containing the reasons for the evaluation. """ - nltk.download('punkt') + nltk.download('punkt', quiet=True) groundedness_scores = {} reasons_str = "" hypotheses = sent_tokenize(statement) system_prompt = prompts.LLM_GROUNDEDNESS_SYSTEM - for i, hypothesis in enumerate(tqdm( - hypotheses, desc="Groundedness per statement in source")): + for i, hypothesis in enumerate(hypotheses): user_prompt = prompts.LLM_GROUNDEDNESS_USER.format( premise=f"{source}", hypothesis=f"{hypothesis}" ) diff --git a/trulens_eval/trulens_eval/feedback/provider/hugs.py b/trulens_eval/trulens_eval/feedback/provider/hugs.py index 6cd4835c7..a3c7594d2 100644 --- a/trulens_eval/trulens_eval/feedback/provider/hugs.py +++ b/trulens_eval/trulens_eval/feedback/provider/hugs.py @@ -218,7 +218,7 @@ def groundedness_measure_with_nli(self, source: str, Returns: Tuple[float, str]: A tuple containing a value between 0.0 (not grounded) and 1.0 (grounded) and a string containing the reasons for the evaluation. """ - nltk.download('punkt') + nltk.download('punkt', quiet=True) groundedness_scores = {} reasons_str = "" diff --git a/trulens_eval/trulens_eval/tru.py b/trulens_eval/trulens_eval/tru.py index 85edac997..de6da1e07 100644 --- a/trulens_eval/trulens_eval/tru.py +++ b/trulens_eval/trulens_eval/tru.py @@ -749,6 +749,8 @@ def start_evaluator(self, fork: If set, will start the evaluator in a new process instead of a thread. NOT CURRENTLY SUPPORTED. + disable_tqdm: If set, will disable progress bar logging from the evaluator. + Returns: The started process or thread that is executing the deferred feedback evaluator. @@ -816,14 +818,15 @@ def runloop(): total=queue_total, postfix={ status.name: count for status, count in queue_stats.items() - } + }, + disable=disable_tqdm ) # Show the status of the results so far. - tqdm_total = tqdm(desc="Done Runs", initial=0, unit="runs") + tqdm_total = tqdm(desc="Done Runs", initial=0, unit="runs", disable=disable_tqdm) # Show what is being waited for right now. - tqdm_waiting = tqdm(desc="Waiting for Runs", initial=0, unit="runs") + tqdm_waiting = tqdm(desc="Waiting for Runs", initial=0, unit="runs", disable=disable_tqdm) runs_stats = defaultdict(int)