feat(weave): Add xtra Scorers to Weave #3006

Open · wants to merge 279 commits into base: master
Changes from 128 commits
Commits (279 total)
c83cf16
set temp to 0
tcapelle Nov 25, 2024
8d051fd
roll back
tcapelle Nov 25, 2024
f24e128
another typo
tcapelle Nov 25, 2024
520bc1e
multi cat
tcapelle Nov 25, 2024
bf47b87
unset temp
tcapelle Nov 25, 2024
b3c08e3
make generate_config a param
tcapelle Nov 25, 2024
a19d62c
fix conversation and default
tcapelle Nov 25, 2024
77b7603
typing
tcapelle Nov 25, 2024
128eb8a
remove print
tcapelle Nov 25, 2024
39a544b
argument name change + interpretation string
ayulockin Nov 26, 2024
5f5833c
make imports optional
ayulockin Nov 26, 2024
24ac09e
lint
ayulockin Nov 26, 2024
2038d52
Merge branch 'master' into xtra-scorers
ayulockin Nov 26, 2024
450d72c
default to latest moderation
tcapelle Nov 26, 2024
5325460
renaming and fixing
tcapelle Nov 26, 2024
9e03cc5
fix private attr
tcapelle Nov 26, 2024
7684e53
missing priv
tcapelle Nov 26, 2024
55dc69e
update robustness scorer with more control knobs
ayulockin Nov 27, 2024
55091cb
collapse interpretation + return abs and sign
ayulockin Nov 27, 2024
45ed447
Merge branch 'master' into xtra-scorers
ayulockin Nov 28, 2024
538d149
add bleu scorer
ayulockin Nov 28, 2024
f68f007
docstrings and unifications
tcapelle Nov 28, 2024
9ed0d7d
lint + formatting
ayulockin Nov 28, 2024
e304e9c
rouge scorer init
ayulockin Nov 29, 2024
24bd665
add tests + formatting
ayulockin Nov 29, 2024
075a0a3
formatting tests
ayulockin Nov 29, 2024
eaccca1
feat: add relevance scorer impl
parambharat Nov 29, 2024
f4fbfe0
fix: move system prompt to upper level
parambharat Nov 29, 2024
62210a9
add relevance scorer to scorers init
morganmcg1 Nov 30, 2024
ed1760b
remove async from relevance score func
morganmcg1 Nov 30, 2024
dc341ce
change relevance scorer output to is_relevant
morganmcg1 Nov 30, 2024
54ae8d7
remove async from coherence scorer
morganmcg1 Dec 1, 2024
8e4b73a
change coherent to is_coherent in scorer
morganmcg1 Dec 1, 2024
6d6b87c
return_all_scores
tcapelle Dec 1, 2024
e41a8e9
fix: remove sampling params from relevance scorer to make it determin…
parambharat Dec 2, 2024
4cfe205
feat: try Contrastive search sampling
parambharat Dec 2, 2024
a1370e4
perplexity scorer finalize
ayulockin Dec 2, 2024
8647bdb
add tests for perplexity
ayulockin Dec 2, 2024
5767207
formatting
ayulockin Dec 2, 2024
00626f2
minor format
ayulockin Dec 2, 2024
43d87c7
name_or_path
tcapelle Dec 2, 2024
6865378
name_or_path
tcapelle Dec 2, 2024
d6b44a6
accuracy scorer -- binary
ayulockin Dec 2, 2024
bf22114
add base_url
tcapelle Dec 2, 2024
a961c9c
update __init__.py
ayulockin Dec 2, 2024
8aaa083
add hallucination scorer local
morganmcg1 Dec 2, 2024
8472976
Hallu scorer fixes
morganmcg1 Dec 2, 2024
f1c7a4e
fix hallu scoroer
morganmcg1 Dec 2, 2024
6dab784
fix hallu scorer
morganmcg1 Dec 2, 2024
7d7f153
add artifactss download to hallu
morganmcg1 Dec 2, 2024
b0eedbb
fix hallu download
morganmcg1 Dec 2, 2024
c031a66
fix hallu download
morganmcg1 Dec 2, 2024
7c3b5d6
fix hallu download
morganmcg1 Dec 2, 2024
acede5a
fix hallu download
morganmcg1 Dec 2, 2024
67c941b
fix hallu download
morganmcg1 Dec 2, 2024
c2cd2ac
add api bias scorer
tcapelle Dec 2, 2024
d6a6f7a
rename bias checkpoint
tcapelle Dec 2, 2024
90d2bea
fix scorer
morganmcg1 Dec 2, 2024
2781494
add model paths for scorers
morganmcg1 Dec 3, 2024
a978da5
remove unused _download
tcapelle Dec 3, 2024
bb9416b
update bias
tcapelle Dec 3, 2024
920f2dd
add hhem hallucination
morganmcg1 Dec 3, 2024
ad15579
fix hhem
morganmcg1 Dec 3, 2024
c18edb2
fix tokenizer
morganmcg1 Dec 3, 2024
3bad7c7
faithful
tcapelle Dec 3, 2024
e27cd75
lint and small fixes
tcapelle Dec 3, 2024
233b8b8
release
tcapelle Dec 3, 2024
259a40d
correct Hallucination base
tcapelle Dec 3, 2024
9c296f9
typo
tcapelle Dec 3, 2024
8de823c
cohen's d threshold
ayulockin Dec 3, 2024
cbedb98
fix paths
morganmcg1 Dec 3, 2024
4d9defe
fix sig
morganmcg1 Dec 3, 2024
7b84e33
fix sig
morganmcg1 Dec 3, 2024
458b8f6
Merge branch 'master' into xtra-scorers
morganmcg1 Dec 10, 2024
d06804b
Align output to "flagged"
morganmcg1 Dec 10, 2024
9877d56
hallu
morganmcg1 Dec 10, 2024
517a901
remove predictions print from rollingwindowscoroer
morganmcg1 Dec 10, 2024
2d908a7
remove print
morganmcg1 Dec 10, 2024
30f5f7f
relevance scorer
tcapelle Dec 10, 2024
5194992
return_all_scores
tcapelle Dec 10, 2024
3b954ff
add output
tcapelle Dec 10, 2024
7af240c
remove torch
tcapelle Dec 10, 2024
5bc7920
tupo
tcapelle Dec 10, 2024
ad044fa
add context relevance scorer artifact
morganmcg1 Dec 10, 2024
e773622
Add artifcacts download for relevance scorer
morganmcg1 Dec 10, 2024
8c80e8d
relevance scorer
morganmcg1 Dec 10, 2024
37c6b44
add ContextRelevanceScorer
morganmcg1 Dec 10, 2024
3b0a133
hide query
tcapelle Dec 10, 2024
19707f6
rename relevance to context relevance
morganmcg1 Dec 10, 2024
5457b3c
update relevance scorer
morganmcg1 Dec 10, 2024
95f266a
update output to res
morganmcg1 Dec 10, 2024
d7e3210
rename documents to context in relevance scorer
morganmcg1 Dec 10, 2024
4e65547
mask query tokens on the combined mask
tcapelle Dec 10, 2024
a17bf51
flat
tcapelle Dec 10, 2024
5b13022
add verbose param
tcapelle Dec 10, 2024
85feae9
add verbose param
tcapelle Dec 10, 2024
39dd40e
more readable
tcapelle Dec 10, 2024
b37a385
return_all_scores -> verbose
tcapelle Dec 10, 2024
60bf3e0
Fix truncation for hallu, coherence, cont rele
morganmcg1 Dec 11, 2024
68c1b5e
test(weave): Add large input tests for scorers
devin-ai-integration[bot] Dec 13, 2024
87820b4
test(weave): Add large input tests for scorers
devin-ai-integration[bot] Dec 13, 2024
6d6e068
test(weave): Add flagging tests for scorers
devin-ai-integration[bot] Dec 13, 2024
4683cd2
Update hallucination scorer tests to use HHEM model and improve test …
openhands-agent Dec 17, 2024
7007d41
Simplify hallucination scorer tests to use actual HHEM model
morganmcg1 Dec 17, 2024
583e175
test(weave): Update scorer tests to use actual ML models
devin-ai-integration[bot] Dec 17, 2024
434c4e8
feat(weave): Add test model configurations
devin-ai-integration[bot] Dec 17, 2024
9fdbf54
chore(weave): Update gitignore for test model files
devin-ai-integration[bot] Dec 17, 2024
b51573d
feat(weave): Add model download scripts for test setup
devin-ai-integration[bot] Dec 17, 2024
4098f0c
update hallu scorer tests
morganmcg1 Dec 17, 2024
ce11237
Merge branch 'xtra-scorers' of https://github.com/wandb/weave into xt…
morganmcg1 Dec 17, 2024
e194034
Merge branch 'master' into xtra-scorers
tcapelle Dec 19, 2024
f6aac2e
Merge branch 'master' into xtra-scorers
tcapelle Dec 19, 2024
295c2f0
fix imports, remove unused code
tcapelle Dec 19, 2024
7a4be18
extract HF and rolling window stuff, add tests
tcapelle Dec 19, 2024
8e88178
refactor: rename scorer classes for clarity and consistency
tcapelle Dec 19, 2024
f7ac57a
add tiny model
tcapelle Dec 19, 2024
72cb919
refactor: remove unused RollingWindowScorer from imports and __all__ …
tcapelle Dec 19, 2024
f45bc27
refactor(tests): simplify ToxicityScorer tests and update model path
tcapelle Dec 19, 2024
e7a97c0
fixed flan T5 packing and tests!
tcapelle Dec 19, 2024
2907663
refactor(tests): update LlamaGuard tests and rename scorer class
tcapelle Dec 19, 2024
8fb46eb
remove unused scorer
tcapelle Dec 19, 2024
8fe3f83
typo
tcapelle Dec 19, 2024
d6b7382
refactor(tests): update CoherenceScorer tests and model path
tcapelle Dec 19, 2024
00c3198
Add bias tests
tcapelle Dec 19, 2024
5871d75
lint
tcapelle Dec 19, 2024
fd7077b
lint
tcapelle Dec 19, 2024
8fc9a55
remove unused code
tcapelle Dec 19, 2024
43c0f91
lazy import
tcapelle Dec 19, 2024
b0401de
lazy import
tcapelle Dec 19, 2024
2408d0f
hf scorers and set_device
tcapelle Dec 19, 2024
e68e507
remove unused tests
tcapelle Dec 19, 2024
3694fdb
remove not useful test
tcapelle Dec 20, 2024
0057879
undo Devin
tcapelle Dec 20, 2024
a9d036a
rework model/tokenizer logic
tcapelle Dec 20, 2024
29bf188
mypy
tcapelle Dec 20, 2024
d5bcc4c
Merge branch 'master' into xtra-scorers
tcapelle Dec 20, 2024
c15033c
more MyPy
tcapelle Dec 20, 2024
9a58f1f
Union
tcapelle Dec 20, 2024
2e95abb
fix test
tcapelle Dec 20, 2024
674adb7
no more lint!!
tcapelle Dec 20, 2024
aaff52b
parasite files
tcapelle Dec 20, 2024
d1dfafe
typo
tcapelle Dec 20, 2024
5512ef4
missing TYPE_CHECKING
tcapelle Dec 20, 2024
7b985a4
same...
tcapelle Dec 20, 2024
0c17734
Trigger CI
tcapelle Dec 20, 2024
c294469
Added WANDB_API_KEY to GitHub Actions workflow for enhanced integrati…
tcapelle Dec 20, 2024
a4ae657
unused import
tcapelle Dec 20, 2024
0d4a419
make sync
tcapelle Jan 13, 2025
49b901c
Merge branch 'master' into xtra-scorers
tcapelle Jan 13, 2025
77c6166
remove verbose stuff
tcapelle Jan 13, 2025
725ead0
make test sync
tcapelle Jan 13, 2025
d9d2444
Merge branch 'master' into xtra-scorers
tcapelle Jan 13, 2025
5b0e55d
typo
tcapelle Jan 14, 2025
7712d89
Merge branch 'master' into xtra-scorers
morganmcg1 Jan 27, 2025
8a1badd
Reverse logic of "flagged" in ContextRelevance scorer
morganmcg1 Jan 27, 2025
186edc0
Adjust threshold for ContextRelevance scorer
morganmcg1 Jan 27, 2025
6828cbc
Remove "output" arg from ContextRelevance inputs
morganmcg1 Jan 27, 2025
21294c3
Rename "coherence_score" key to "score" for consistency
morganmcg1 Jan 27, 2025
a1b01f8
add fluency
tcapelle Jan 27, 2025
0023919
eager import
tcapelle Jan 27, 2025
4239cf5
udpate device vaqlue
tcapelle Jan 27, 2025
bec2ad3
import on top
tcapelle Jan 27, 2025
3826609
wrong improt
tcapelle Jan 27, 2025
58aff6d
annotate
tcapelle Jan 27, 2025
d7eafe5
re-work logic
tcapelle Jan 27, 2025
50ed2a0
missing task and clenup
tcapelle Jan 27, 2025
5663157
restore deleted methods
tcapelle Jan 27, 2025
58a1c16
handle cuda not available correctly
tcapelle Jan 27, 2025
f95d9b3
make sync
tcapelle Jan 27, 2025
9ec65b5
invert logic
tcapelle Jan 27, 2025
bc72bfd
make non-fluent default
tcapelle Jan 27, 2025
0058035
add trust scorer
tcapelle Jan 28, 2025
053d6bf
Accept custom artifact path for ContextRelevance
morganmcg1 Feb 5, 2025
f739ba6
Use custom artifact download path for HallucinationScorer
morganmcg1 Feb 5, 2025
197839c
make them run in parallel
tcapelle Feb 5, 2025
6b97d79
rename trust_scorer (missing "r")
tcapelle Feb 5, 2025
bd0adeb
Remove non-HHEM code and flip the threshold logic in HallucinationSco…
morganmcg1 Feb 5, 2025
61d3316
Merge branch 'xtra-scorers' of https://github.com/wandb/weave into xt…
morganmcg1 Feb 5, 2025
f73b7a7
Re-add required load_tokenizer method to HallucinatrionScorer
morganmcg1 Feb 5, 2025
bb824c7
Increase HallucinationScorer recall, threshold from 0.5 to 0.35
morganmcg1 Feb 5, 2025
e7d705d
Add download from custom artifact path to BiasScorer
morganmcg1 Feb 5, 2025
cfe2a0e
Update BiasScorer threshold from 0.5 to 0.65
morganmcg1 Feb 5, 2025
dd13e86
Add custom artifact download ability to ToxicityScorer
morganmcg1 Feb 5, 2025
c97963d
Update HallucinationScorer to accept list of contexts
morganmcg1 Feb 5, 2025
a2a8cf4
Add custom artifact path to CoherenceScorer download
morganmcg1 Feb 5, 2025
c6e981b
Upload embedding model weights for RobustnessScorer
morganmcg1 Feb 5, 2025
9cd5680
Add required load_tokenizer method to RobustnessScorer
morganmcg1 Feb 5, 2025
537f0a8
Modify RobustnessScorer parameters
morganmcg1 Feb 5, 2025
8cc75e9
Update score parameter naming in RobustnessScorer
morganmcg1 Feb 5, 2025
dfbc3b1
fix RobustnessScorer params
morganmcg1 Feb 5, 2025
038ae8e
fix RobustnessScorer
morganmcg1 Feb 5, 2025
ddbcfa6
Add FluencyScorer artifact
morganmcg1 Feb 5, 2025
e343b27
Add custom artifact path option for TrustScorer
morganmcg1 Feb 5, 2025
e5bb561
TrustworthinessScorer args fix
morganmcg1 Feb 5, 2025
a8cbcb8
Update threshold arg for TrustScorer
morganmcg1 Feb 5, 2025
644d312
Output raw scores for Fluency and Toxicity in TrustScore
morganmcg1 Feb 5, 2025
fbbaec7
TrustScorer formating and raise exception on failed scorer
morganmcg1 Feb 5, 2025
915b008
add debugging prints to TrustScorer
morganmcg1 Feb 5, 2025
6ca080c
Update CoherenceScorer to use query not input
morganmcg1 Feb 5, 2025
9c193d7
add TrustScore debug
morganmcg1 Feb 5, 2025
02e09bc
add logging of Fluency score in TrustScorer
morganmcg1 Feb 6, 2025
d91221c
Modify FluencyScorer and CoherenceScorer so that high score is high t…
morganmcg1 Feb 6, 2025
e18e2f0
Use scorer threshold contstants in TrustScore instead of duplicate th…
morganmcg1 Feb 6, 2025
2a3edaa
Clean up TrustScore print
morganmcg1 Feb 6, 2025
1958abc
TrustScore uses direct thresholds, rename robustness weights to embed…
morganmcg1 Feb 6, 2025
c6664a6
add score to FluencyScorer output
morganmcg1 Feb 6, 2025
0d61df3
simlify FluencyScorer score output
morganmcg1 Feb 6, 2025
b549b43
Add critical and adbivsory issues to the TrustScorer output
morganmcg1 Feb 6, 2025
4af281a
In local Scorers: flip 'flagged' to 'pass', clean up base_url
morganmcg1 Feb 6, 2025
6aa8277
modify BiasScorer output
morganmcg1 Feb 6, 2025
eddfbb8
Fix CoherenceScorer score
morganmcg1 Feb 6, 2025
e77c9cf
Rename scorers: local models use Weave*, LLM powered ones LLM*. Delet…
morganmcg1 Feb 6, 2025
d6f6002
Add Weave* to scorer class names, Part 2
morganmcg1 Feb 6, 2025
d27c529
Make naming consistent
tcapelle Feb 6, 2025
16e2701
Merge branch 'master' into xtra-scorers
tcapelle Feb 6, 2025
ebfd32e
fix imports and merge
tcapelle Feb 6, 2025
8cf05ba
remove base_scorer stuff
tcapelle Feb 6, 2025
a7c7bbb
llm_utils -> utils
tcapelle Feb 6, 2025
eaeef28
fix some last imports
tcapelle Feb 6, 2025
54aea81
ignore for the meantime
tcapelle Feb 6, 2025
c44ec0c
lint
tcapelle Feb 6, 2025
3f112a6
delete guardrails
tcapelle Feb 6, 2025
fa6bea0
Update scorers deps, transformers and torch
morganmcg1 Feb 6, 2025
9e0e322
Merge branch 'xtra-scorers' of https://github.com/wandb/weave into xt…
morganmcg1 Feb 6, 2025
342b5af
Try change BiasScorer .score arg to text
morganmcg1 Feb 6, 2025
aa55492
add noop 'output' arg to biasscorer for evals
morganmcg1 Feb 6, 2025
4330385
Add type checking for `score` params to weave models, update docstrings
morganmcg1 Feb 7, 2025
b9a01aa
refactor model download and hf checks
tcapelle Feb 7, 2025
58d450b
raise custom exception
tcapelle Feb 7, 2025
5bae0d8
use find_spec again
tcapelle Feb 7, 2025
358dd5b
Merge remote-tracking branch 'origin/xtra-scorers' into xtra-scorers
tcapelle Feb 7, 2025
d025a19
Re-add misspelled tokenzier
morganmcg1 Feb 7, 2025
d326134
Add misspelled tokenzier comment
morganmcg1 Feb 7, 2025
ddd2aee
Refactor scorer imports and method implementations
morganmcg1 Feb 7, 2025
6368236
remove _validate_input from TrustScorer
morganmcg1 Feb 7, 2025
102c2be
Restore input filtering method in WeaveTrustScorer
morganmcg1 Feb 7, 2025
a6d2bd6
fix Toxicity
tcapelle Feb 7, 2025
266c010
Add ScorerResult type for consistent scorer return values
tcapelle Feb 7, 2025
5e976f3
ruff
tcapelle Feb 7, 2025
24917b1
mypy happy
tcapelle Feb 7, 2025
2b94965
Rename ScorerResult to WeaveScorerResult
tcapelle Feb 7, 2025
68a0bd4
Update scorers to use WeaveScorerResult consistently
morganmcg1 Feb 7, 2025
d94e86f
update loading
tcapelle Feb 7, 2025
fe64df8
mypy happy
tcapelle Feb 7, 2025
bcf334d
Add to_dict to WeaveScorerResult
morganmcg1 Feb 7, 2025
3f77c21
TrustScorer fix
morganmcg1 Feb 7, 2025
4ab47fb
fix PerplexityScorer return type
morganmcg1 Feb 7, 2025
e9caa22
use new hemm model
tcapelle Feb 11, 2025
0d891bb
make loading methods public
tcapelle Feb 11, 2025
2cbc51f
doesn't support description
tcapelle Feb 11, 2025
1 change: 1 addition & 0 deletions .github/workflows/test.yaml
@@ -297,6 +297,7 @@ jobs:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
HF_API_TOKEN: ${{ secrets.HF_API_TOKEN }}
run: |
nox -e "tests-${{ matrix.python-version-major }}.${{ matrix.python-version-minor }}(shard='${{ matrix.nox-shard }}')"
trace-tests-matrix-check: # This job does nothing and is only used for the branch protection
1 change: 1 addition & 0 deletions noxfile.py
@@ -80,6 +80,7 @@ def tests(session, shard):
env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY")
env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY")
env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY")
env["HF_API_TOKEN"] = session.env.get("HF_API_TOKEN")

default_test_dirs = [f"integrations/{shard}/"]
test_dirs_dict = {
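Both the GitHub Actions workflow and the nox test session above now forward `HF_API_TOKEN`, so the new scorer tests can download gated or W&B-hosted model weights from the Hugging Face Hub. A minimal sketch of how test setup code might consume that variable; the `login` call and helper name are assumptions, not part of this diff:

```python
import os

from huggingface_hub import login  # assumed available; transformers depends on huggingface_hub


def authenticate_hf() -> None:
    """Log in to the Hugging Face Hub if CI or nox forwarded a token."""
    token = os.environ.get("HF_API_TOKEN")
    if token:
        login(token=token)
```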
7 changes: 7 additions & 0 deletions pyproject.toml
@@ -78,6 +78,13 @@ scorers_tests = [
"google-generativeai>=0.8.0",
"mistralai>=1.0.3",
"anthropic>=0.30.0",
"sentence-transformers>=3.3.1",
"scikit-learn>=1.5.2",
"transformers>=4.35.0",
"accelerate>=1.0.0",
"torch>=2.2.0",
"sacrebleu>=2.4.2",
"rouge>=1.0.1",
]
notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"]
openai = ["openai>=1.0.0"]
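The `scorers_tests` extra pulls in heavy optional dependencies (torch, transformers, sentence-transformers, sacrebleu, rouge). Several commits in this branch make scorer imports optional or lazy; a sketch of that guard pattern using `importlib.util.find_spec`, with an illustrative helper name and error message:

```python
import importlib.util


def ensure_scorer_deps(*packages: str) -> None:
    """Raise a clear error if optional scorer dependencies are not installed."""
    missing = [pkg for pkg in packages if importlib.util.find_spec(pkg) is None]
    if missing:
        raise ImportError(
            f"Missing optional scorer dependencies: {missing}. "
            "Install them via the 'scorers_tests' extra."
        )


# A transformers-backed scorer would call this before loading any model:
ensure_scorer_deps("torch", "transformers")
```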
184 changes: 184 additions & 0 deletions tests/scorers/test_bleu_scorer.py
@@ -0,0 +1,184 @@
import math

import pytest # type: ignore

from weave.scorers import BLEUScorer


def truncate(number, decimals=0):
"""Truncates a number to the specified number of decimal places without rounding."""
factor = 10.0**decimals
return math.trunc(number * factor) / factor


def test_bleu_scorer_initialization():
# Test default initialization
scorer = BLEUScorer()
assert scorer.lowercase == False
assert scorer.tokenize is None
assert scorer.smooth_method == "exp"
assert scorer.smooth_value is None
assert scorer.max_ngram_order == 4
assert scorer.effective_order == True
assert scorer.bleu is not None

# Test initialization with custom parameters
scorer = BLEUScorer(
lowercase=True,
tokenize="13a",
smooth_method="add-k",
smooth_value=1.0,
max_ngram_order=2,
effective_order=False,
)
assert scorer.lowercase == True
assert scorer.tokenize == "13a"
assert scorer.smooth_method == "add-k"
assert scorer.smooth_value == 1.0
assert scorer.max_ngram_order == 2
assert scorer.effective_order == False
assert scorer.bleu is not None


def test_bleu_scorer_score_method():
scorer = BLEUScorer()
output = "The cat is on the mat."
ground_truths = ["The cat is on the mat.", "There is a cat on the mat."]

# Test score method with exact match
result = scorer.score(ground_truths=ground_truths, output=output)
assert isinstance(result, dict)
assert truncate(result["sentence_bleu"], 1) == 100.0
assert truncate(result["sentence_bp"], 1) == 1.0
assert result["output_pred"] == output
assert result["output_refs"] == ground_truths

# Test score method with partial match
output = "The cat sat on the mat."
result = scorer.score(ground_truths=ground_truths, output=output)
assert result["sentence_bleu"] < 100.0
assert result["output_pred"] == output

# Test with single reference
output = "The dog is in the house."
ground_truths = "The dog is outside."
result = scorer.score(ground_truths=ground_truths, output=output)
assert isinstance(result["output_refs"], list)
assert result["output_refs"] == [ground_truths]


def test_bleu_scorer_score_method_invalid_input():
scorer = BLEUScorer()
output = "Sample output"

# Test with invalid ground_truths type
with pytest.raises(
AssertionError, match="`ground_truths` must be a list of strings."
):
scorer.score(ground_truths=123, output=output)


def test_bleu_scorer_summarize_method():
scorer = BLEUScorer()
score_rows = [
{
"sentence_bleu": 100.0,
"sentence_bp": 1.0,
"output_pred": "The cat is on the mat.",
"output_refs": ["The cat is on the mat."],
},
{
"sentence_bleu": 50.0,
"sentence_bp": 0.8,
"output_pred": "A dog is in the yard.",
"output_refs": ["The dog is in the yard."],
},
{
"sentence_bleu": 0.0,
"sentence_bp": 0.5,
"output_pred": "Completely different sentence.",
"output_refs": ["No match here."],
},
]

result = scorer.summarize(score_rows)
assert isinstance(result, dict)
assert "corpus_level" in result
assert "sentence_level" in result
assert truncate(result["sentence_level"]["bleu"], 1) == 50.0

# Verify corpus-level BLEU score
corpus_bleu = result["corpus_level"]["bleu"]
assert truncate(corpus_bleu, 1) >= 0.0 and truncate(corpus_bleu, 1) <= 100.0


def test_bleu_scorer_summarize_method_empty_input():
scorer = BLEUScorer()
score_rows = []
result = scorer.summarize(score_rows)
assert result == {}


def test_bleu_scorer_summarize_method_invalid_score_rows():
scorer = BLEUScorer()
score_rows = ["invalid", 123, None]
with pytest.raises(AssertionError):
scorer.summarize(score_rows)


def test_bleu_scorer_corpus_score():
scorer = BLEUScorer()
score_rows = [
{
"sentence_bleu": 100.0,
"sentence_bp": 1.0,
"output_pred": "The cat is on the mat.",
"output_refs": ["The cat is on the mat."],
},
{
"sentence_bleu": 50.0,
"sentence_bp": 0.8,
"output_pred": "A dog is in the yard.",
"output_refs": ["The dog is in the yard.", "A dog is outside."],
},
]

result = scorer.summarize(score_rows)
print(result)
corpus_bleu = result["corpus_level"]["bleu"]
assert truncate(corpus_bleu, 1) == 100.0


def test_bleu_scorer_with_different_tokenizer():
# Test BLEUScorer with a different tokenizer
scorer = BLEUScorer(tokenize="char")
output = "abcd"
ground_truths = ["abcf"]

result = scorer.score(ground_truths=ground_truths, output=output)
assert result["sentence_bleu"] < 100.0


def test_bleu_scorer_effective_order():
# Test BLEUScorer with effective_order set to False
scorer = BLEUScorer(effective_order=False)
output = "The cat"
ground_truths = ["The cat is on the mat."]

result = scorer.score(ground_truths=ground_truths, output=output)
# With effective_order=False, the score might be lower due to missing higher-order n-grams
assert result["sentence_bleu"] < 100.0


def test_bleu_scorer_smooth_method():
# Test BLEUScorer with different smoothing methods
scorer = BLEUScorer(smooth_method="floor", smooth_value=0.1)
output = "The cat sat on the mat."
ground_truths = ["The cat is on the mat."]

result = scorer.score(ground_truths=ground_truths, output=output)
assert result["sentence_bleu"] > 0.0

# Test with invalid smoothing method
with pytest.raises(ValueError):
BLEUScorer(smooth_method="invalid_method")
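For reference, a usage sketch built from the API these tests exercise; the inputs are illustrative, and scores are on sacrebleu's 0-100 scale:

```python
from weave.scorers import BLEUScorer

scorer = BLEUScorer(lowercase=True, max_ngram_order=4)
result = scorer.score(
    ground_truths=["The cat is on the mat.", "There is a cat on the mat."],
    output="The cat sat on the mat.",
)
print(result["sentence_bleu"])  # sentence-level BLEU, 0-100
print(result["sentence_bp"])    # brevity penalty

# Corpus-level aggregation over previously scored rows:
summary = scorer.summarize([result])
print(summary["corpus_level"]["bleu"])
```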
95 changes: 95 additions & 0 deletions tests/scorers/test_coherence_scorer.py
@@ -0,0 +1,95 @@
import pytest

import weave
from weave.scorers.coherence_scorer import CoherenceScorer


@pytest.fixture
def coherence_scorer(monkeypatch):
scorer = CoherenceScorer(
model_name="wandb/coherence_scorer",
device="cpu",
)

def mock_pipeline(*args, **kwargs):
def inner(inputs):
if "incoherent" in inputs["text_pair"] or "incoherent" in inputs["text"]:
return {
"label": "incoherent",
"score": 0.2,
}
return {
"label": "coherent",
"score": 0.95,
}

return inner

monkeypatch.setattr(scorer, "_classifier", mock_pipeline())
return scorer


def test_score_messages_with_coherent_output(coherence_scorer):
prompt = "This is a test prompt."
output = "This is a coherent response."
result = coherence_scorer.score_messages(prompt, output)
assert result["coherent"]
assert result["coherence"] == "coherent"
assert result["coherence_score"] == pytest.approx(0.95)


def test_score_messages_with_incoherent_output(coherence_scorer):
prompt = "This is a test prompt."
output = "This is an incoherent response."
result = coherence_scorer.score_messages(prompt, output)
assert not result["coherent"]
assert result["coherence"] == "incoherent"
assert result["coherence_score"] == pytest.approx(0.2)


@pytest.mark.asyncio
async def test_score_with_chat_history(coherence_scorer):
prompt = "This is a test prompt."
output = "This is a coherent response."
chat_history = [
{"role": "user", "text": "Hello"},
{"role": "assistant", "text": "Hi"},
]
result = await coherence_scorer.score(prompt, output, chat_history=chat_history)
assert result["coherent"]
assert result["coherence"] == "coherent"
assert result["coherence_score"] == pytest.approx(0.95)


@pytest.mark.asyncio
async def test_score_with_context(coherence_scorer):
prompt = "This is a test prompt."
output = "This is a coherent response."
context = "This is additional context."
result = await coherence_scorer.score(prompt, output, context=context)
assert result["coherent"]
assert result["coherence"] == "coherent"
assert result["coherence_score"] == pytest.approx(0.95)


@pytest.mark.asyncio
async def test_coherence_scorer_evaluation(coherence_scorer):
dataset = [
{"input": "This is a coherent text."},
{"input": "This is an incoherent text."},
]

@weave.op
def model(input: str):
return input

evaluation = weave.Evaluation(
dataset=dataset,
scorers=[coherence_scorer],
)
result = await evaluation.evaluate(model)

assert "CoherenceScorer" in result
assert "coherent" in result["CoherenceScorer"]
assert result["CoherenceScorer"]["coherent"]["true_count"] == 1
assert result["CoherenceScorer"]["coherent"]["true_fraction"] == pytest.approx(0.5)
74 changes: 74 additions & 0 deletions tests/scorers/test_llamaguard_scorer.py
@@ -0,0 +1,74 @@
import pytest
from transformers import AutoTokenizer

import weave
from weave.scorers import LlamaGuard

_TINY_MODEL_NAME = "HuggingFaceM4/tiny-random-LlamaForCausalLM"
_LLAMAGUARD_MODEL_NAME = "meta-llama/Llama-Guard-3-1B"


@pytest.fixture
def llamaguard_scorer(monkeypatch):
scorer = LlamaGuard(
model_name=_TINY_MODEL_NAME,
device="cpu",
)
scorer._tokenizer = AutoTokenizer.from_pretrained(_LLAMAGUARD_MODEL_NAME)

# Mock the _generate method to return predictable outputs with unsafe_score
def mock_generate(*args, **kwargs):
return "unsafe\nS10: Hate<|eot_id|>", 0.85 # Added mock unsafe_score

monkeypatch.setattr(scorer, "_generate", mock_generate)
return scorer


def test_llamaguard_postprocess(llamaguard_scorer):
# Test safe content
safe_output = ("safe", 0.1) # Added mock unsafe_score
result = llamaguard_scorer.postprocess(*safe_output)
assert result["safe"]
assert result["category"] is None
assert result["unsafe_score"] == 0.1 # Test unsafe_score

# Test unsafe content with category
unsafe_output = ("unsafe\nS5<|eot_id|>", 0.9) # Added mock unsafe_score
result = llamaguard_scorer.postprocess(*unsafe_output)
assert not result["safe"]
assert result["category"] == "S5: Defamation"
assert result["unsafe_score"] == 0.9 # Test unsafe_score


@pytest.mark.asyncio
async def test_llamaguard_score(llamaguard_scorer):
output = "Test content for scoring"
result = await llamaguard_scorer.score(output=output)
assert isinstance(result, dict)
assert "safe" in result
assert "category" in result
assert "unsafe_score" in result # Test presence of unsafe_score
assert result["safe"] is False
assert result["category"] == "S10: Hate"
assert result["unsafe_score"] == 0.85 # Test unsafe_score matches mock value


@pytest.mark.asyncio
async def test_llamaguard_evaluation(llamaguard_scorer):
dataset = [
{"input": "This is a unsafe text."},
{"input": "This is also bad text"},
]

@weave.op
def model(input: str):
return input

evaluation = weave.Evaluation(
dataset=dataset,
scorers=[llamaguard_scorer],
)
result = await evaluation.evaluate(model)

assert "LlamaGuard" in result
assert "safe" in result["LlamaGuard"]