diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py b/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py
index eeaddd90dd..1d03c3cd68 100644
--- a/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py
+++ b/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py
@@ -1,165 +1,80 @@
-import math
-from collections import Counter
 from typing import Any, List, Union, Optional
 
+try:
+    import nltk
+    from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
+except ImportError:
+    nltk = None
+
 from opik.evaluation.metrics import base_metric, score_result
-###############################################################################
-# SmoothingFunction
-###############################################################################
-class SmoothingFunction:
+
+
+class BLEU(base_metric.BaseMetric):
     """
-    A collection of smoothing methods for sentence-level BLEU,
-    adapted from Boxing Chen & Collin Cherry (2014), and some
-    NLTK references.
+    BLEU metric relying on the `nltk.translate.bleu_score` implementation.
+    If `nltk` is not installed, this class will raise an ImportError upon instantiation.
     """
-    def __init__(self, epsilon: float = 0.1, alpha: float = 5.0, k: float = 5.0):
-        """
-        :param epsilon: the small constant added to 0-precision n-grams (method1, etc.)
-        :param alpha: for method6 interpolation factor
-        :param k: for method4 or other references
-        """
-        self.epsilon = epsilon
-        self.alpha = alpha
-        self.k = k
-
-    def method0(self, p_n: List[float]) -> List[float]:
-        """
-        No smoothing. If there's a zero precision, BLEU can become 0.
-        """
-        return p_n
-
-    def method1(self, p_n: List[float]) -> List[float]:
-        """
-        Add epsilon to each precision if it is 0.
-        e.g., max(p_i, epsilon)
-        """
-        return [p if p != 0.0 else self.epsilon for p in p_n]
-
-    def method2(self, p_n: List[float]) -> List[float]:
-        """
-        Add 1 to both numerator and denominator for n>1, as in NLTK:
-            p_n = (count + 1) / (total + 1) for n>1
-
-        Because we're only storing float p_i = count/total, we do a simplified shift:
-        for i>0 => p_i = (p_i * total + 1)/(total+1)
-        Here i is the index in p_n (0=unigram, 1=bigram, etc.)
-        """
-        p_n_new = []
-        for i, val in enumerate(p_n):
-            if i == 0:
-                # unigrams => no shift
-                p_n_new.append(val)
-            else:
-                # interpret val ~ (count/total)
-                if val == 0:
-                    # if there's no overlap, treat as 1/(total+1) ~ 1/(something)
-                    # simpler to revert to "some small fraction"
-                    shift_val = 1.0 / 2.0
-                else:
-                    # if val>0, interpret val ~ c/t => let c=val, t=1 => c+1=val+1, t+1=2 => ~ (val+1)/2
-                    shift_val = (val + 1.0) / 2.0
-                p_n_new.append(shift_val)
-        return p_n_new
-
-    def method3(self, p_n: List[float]) -> List[float]:
-        """
-        NIST geometric sequence smoothing (example).
-        """
-        if len(p_n) == 0:
-            return p_n
-        return [max(pi, self.epsilon) for pi in p_n]
-
-    def apply(self, method_name: str, p_n: List[float]) -> List[float]:
-        method = getattr(self, method_name, None)
-        if not method:
-            raise ValueError(f"Unknown smoothing method: {method_name}")
-        return method(p_n)
-
-###############################################################################
-# BLEU Metric
-###############################################################################
-class BLEU(base_metric.BaseMetric):
     def __init__(
         self,
         name: str = "bleu_metric",
         track: bool = True,
         n_grams: int = 4,
         smoothing_method: str = "method1",
-        epsilon: float = 0.1,
-        alpha: float = 5.0,
-        k: float = 5.0,
         weights: Optional[List[float]] = None,
     ):
+        """
+        :param name: Name for this metric instance.
+        :param track: Whether or not this metric is tracked (depends on your system).
+        :param n_grams: Up to which n-gram order to use (1 through n_grams).
+        :param smoothing_method: One of NLTK's SmoothingFunction methods (e.g., "method0", "method1", "method2", etc.).
+        :param weights: Optional manual weighting for n-gram orders. If None, defaults to uniform across n_grams.
+        """
         super().__init__(name=name, track=track)
+
+        # Ensure nltk is installed; if not, raise an ImportError now.
+        if nltk is None:
+            raise ImportError(
+                "`nltk` library is required for BLEU score calculation. "
+                "Please install it via `pip install nltk`."
+            )
+
         self.n_grams = n_grams
         self.smoothing_method = smoothing_method
-        self.smoother = SmoothingFunction(epsilon=epsilon, alpha=alpha, k=k)
 
-        # If no weights provided, default to uniform across n_grams
+        # Set up weights: if not provided, default to uniform among the up to n_grams orders
         if weights is None:
-            self.weights = [1.0 / self.n_grams] * self.n_grams
+            self.weights = [1.0 / n_grams] * n_grams
         else:
-            if len(weights) != self.n_grams:
+            if len(weights) != n_grams:
                 raise ValueError(
-                    f"Length of weights ({len(weights)}) != n_grams ({self.n_grams})"
+                    f"Length of weights ({len(weights)}) != n_grams ({n_grams})"
                 )
             if abs(sum(weights) - 1.0) > 1e-6:
                 raise ValueError("Weights must sum to 1.0")
             self.weights = weights
 
-    def _get_ngrams(self, tokens: List[str], n: int) -> Counter:
-        """Return counts for nth-order n-grams."""
-        return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
+        self._nltk_smoother = SmoothingFunction()
 
-    def _modified_precision(
-        self,
-        references: List[List[str]],
-        candidate: List[str],
-        n: int
-    ) -> tuple[int, int]:
+    def _get_smoothing_func(self):
         """
-        Clipped count (numerator) & total candidate n-grams (denominator).
+        Retrieve the corresponding smoothing function from nltk's SmoothingFunction
+        based on the self.smoothing_method name, e.g. "method0", "method1", etc.
+        Fallback to method0 if not found.
         """
-        cand_ngrams = self._get_ngrams(candidate, n)
-        if not cand_ngrams:
-            return 0, 0
-
-        # Build up max ref n-gram counts
-        max_ref_counts = {}
-        for ref in references:
-            ref_ngrams = self._get_ngrams(ref, n)
-            for ng, cnt in ref_ngrams.items():
-                if ng in max_ref_counts:
-                    max_ref_counts[ng] = max(max_ref_counts[ng], cnt)
-                else:
-                    max_ref_counts[ng] = cnt
-
-        clipped = 0
-        total = 0
-        for ng, cnt in cand_ngrams.items():
-            clipped += min(cnt, max_ref_counts.get(ng, 0))
-            total += cnt
+        return getattr(self._nltk_smoother, self.smoothing_method, self._nltk_smoother.method0)
 
-        return clipped, total
-
-    def _closest_ref_length(self, references: List[List[str]], c_len: int) -> int:
-        """
-        Return the reference length closest to c_len (ties => shorter).
+    def _truncate_weights(self, candidate_len: int) -> tuple:
         """
-        ref_lens = [len(r) for r in references]
-        return min(ref_lens, key=lambda r_len: (abs(r_len - c_len), r_len))
-
-    def _brevity_penalty(self, c_len: int, r_len: int) -> float:
+        Truncate the n-gram weights to min(self.n_grams, candidate_len),
+        then re-normalize them so that they sum to 1.0.
         """
-        BP = exp(1 - r/c) if c < r, else 1.0 (and 0 if c == 0)
-        """
-        if c_len == 0:
-            return 0.0
-        if c_len > r_len:
-            return 1.0
-        return math.exp(1.0 - float(r_len) / c_len)
+        max_order = min(self.n_grams, candidate_len)
+        used_weights = self.weights[:max_order]
+        w_sum = sum(used_weights) or 1.0
+        # Re-normalize to sum to 1.0
+        normalized = [w / w_sum for w in used_weights]
+        return tuple(normalized)
 
     ###########################################################################
     # SINGLE-SENTENCE BLEU
@@ -167,8 +82,12 @@ def _brevity_penalty(self, c_len: int, r_len: int) -> float:
     def score(
         self, output: str, reference: Union[str, List[str]], **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
-
-        # 1) Check for empty candidate => test expects reason="Candidate is empty"
+        """
+        Computes a single-sentence BLEU score using nltk.translate.bleu_score.sentence_bleu.
+        If reference is a single string, it will be treated as one reference.
+        If reference is a list of strings, multiple references are used.
+        """
+        # 1) Handle empty candidate
         if not output.strip():
             return score_result.ScoreResult(
                 value=0.0,
@@ -176,7 +95,7 @@ def score(
                 reason="Candidate is empty"
             )
 
-        # 2) If reference is a single string, turn it into a list
+        # 2) Process references
        if isinstance(reference, str):
            if not reference.strip():
                return score_result.ScoreResult(
@@ -186,7 +105,6 @@ def score(
                )
            references = [reference.lower().split()]
        else:
-            # List of strings
            references = []
            for ref in reference:
                if not ref.strip():
@@ -199,63 +117,26 @@ def score(
 
        candidate = output.lower().split()
 
-        # We'll compute up to min(self.n_grams, len(candidate)) n-grams
-        max_order = min(self.n_grams, len(candidate))
+        # Truncate & normalize weights to the candidate length
+        used_weights = self._truncate_weights(len(candidate))
 
-        # Compute p_n
-        p_n: List[float] = []
-        for n in range(1, max_order + 1):
-            clipped, total = self._modified_precision(references, candidate, n)
-            if total == 0:
-                p_n.append(0.0)
-            else:
-                p_n.append(float(clipped) / float(total))
-
-        used_weights = self.weights[:max_order]
-        # renormalize
-        w_sum = sum(used_weights) or 1.0
-        used_weights = [w / w_sum for w in used_weights]
-
-        if all(val == 0.0 for val in p_n):
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="All n-gram precisions are zero prior to smoothing"
-            )
-
-        # Apply smoothing
-        p_n_smoothed = self.smoother.apply(self.smoothing_method, p_n)
+        smoothing_func = self._get_smoothing_func()
 
-        # Geometric mean of p_n_smoothed using used_weights
        try:
-            log_precisions = []
-            for w, val in zip(used_weights, p_n_smoothed):
-                # if val=0 after smoothing => log(0) => BLEU=0
-                if val <= 0:
-                    return score_result.ScoreResult(
-                        value=0.0,
-                        name=self.name,
-                        reason=f"Precision is zero even after smoothing"
-                    )
-                log_precisions.append(w * math.log(val))
-            geo_mean = math.exp(sum(log_precisions))
-        except ValueError:
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="log(0) encountered even after smoothing"
+            bleu_value = sentence_bleu(
+                references,
+                candidate,
+                weights=used_weights,
+                smoothing_function=smoothing_func
            )
+        except ZeroDivisionError:
+            # edge case if references or candidate is basically empty after splitting
+            bleu_value = 0.0
 
-        # Compute brevity penalty
-        c_len = len(candidate)
-        r_len = self._closest_ref_length(references, c_len)
-        bp = self._brevity_penalty(c_len, r_len)
-
-        bleu_score = bp * geo_mean
        return score_result.ScoreResult(
-            value=bleu_score,
+            value=bleu_value,
            name=self.name,
-            reason=f"Sentence-level BLEU (method={self.smoothing_method}): {bleu_score:.4f}",
+            reason=f"Sentence-level BLEU (nltk, method={self.smoothing_method}): {bleu_value:.4f}",
        )
 
    ###########################################################################
@@ -267,6 +148,9 @@ def score_corpus(
        self,
        outputs: List[str],
        references_list: List[Union[str, List[str]]],
        **ignored_kwargs: Any
    ) -> score_result.ScoreResult:
+        """
+        Computes a corpus-level BLEU score using nltk.translate.bleu_score.corpus_bleu.
+        """
 
        if len(outputs) != len(references_list):
            return score_result.ScoreResult(
@@ -275,114 +159,60 @@ def score_corpus(
                reason="Mismatch: number of candidates != number of references.",
            )
 
-        total_candidate_length = 0
-        total_reference_length = 0
-        total_clipped = [0] * self.n_grams
-        total_counts = [0] * self.n_grams
+        all_candidates = []
+        all_references = []
 
        for output, ref_item in zip(outputs, references_list):
            if not output.strip():
+                # If candidate is empty, skip it (leading to zero or ignoring).
                continue
-            candidate = output.lower().split()
-            c_len = len(candidate)
-            total_candidate_length += c_len
+
+            candidate_tokens = output.lower().split()
 
            if isinstance(ref_item, str):
                if not ref_item.strip():
                    continue
-                references = [ref_item.lower().split()]
+                refs = [ref_item.lower().split()]
            else:
-                references = []
-                skipit = False
+                refs = []
+                skip_this = False
                for r in ref_item:
                    if not r.strip():
-                        skipit = True
+                        skip_this = True
                        break
-                    references.append(r.lower().split())
-                if skipit:
+                    refs.append(r.lower().split())
+                if skip_this or not refs:
                    continue
 
-            r_len = self._closest_ref_length(references, c_len)
-            total_reference_length += r_len
-
-            max_order = min(self.n_grams, c_len)
-            for n in range(1, max_order + 1):
-                clipped, count = self._modified_precision(references, candidate, n)
-                total_clipped[n - 1] += clipped
-                total_counts[n - 1] += count
-            # If c_len < self.n_grams => we skip the higher-order n-grams entirely
-
-        # Convert to float-based precisions
-        # We'll see how many "active" n-gram orders had any counts
-        active_orders = 0
-        p_n: List[float] = []
-        for i in range(self.n_grams):
-            if total_counts[i] > 0:
-                p_n.append(total_clipped[i] / float(total_counts[i]))
-                active_orders += 1
-            else:
-                # no counts => skip or treat as 0
-                p_n.append(0.0)
+            all_candidates.append(candidate_tokens)
+            all_references.append(refs)
 
-        # If no active orders had any overlap, return 0
-        if all(val == 0.0 for val in p_n):
+        if not all_candidates:
            return score_result.ScoreResult(
                value=0.0,
                name=self.name,
-                reason="All corpus-level n-gram precisions are zero prior to smoothing.",
+                reason="No valid candidate/reference pairs"
            )
 
-        # Now we only want to compute the geometric mean over the n orders that actually had candidate n-grams
-        # But the test suite lumps them together as if all are in play. We'll do a simpler approach:
-        # We'll figure out the largest order that had total_counts>0
-        # e.g. if the largest candidate length across the corpus is 3 => up to trigram
-        largest_order = 0
-        for i in range(self.n_grams):
-            if total_counts[i] > 0:
-                largest_order = i + 1
-        # We'll apply smoothing only on p_n[:largest_order]
-        # And re-normalize self.weights among those that are in use
-        used_p = p_n[:largest_order]
-        used_weights = self.weights[:largest_order]
-        w_sum = sum(used_weights) or 1.0
-        used_weights = [w / w_sum for w in used_weights]
+        # Determine the largest candidate length
+        max_len = max(len(c) for c in all_candidates)
+        # Truncate & normalize weights to this largest order
+        used_weights = self._truncate_weights(max_len)
 
-        # Apply smoothing
-        used_p_smoothed = self.smoother.apply(self.smoothing_method, used_p)
+        smoothing_func = self._get_smoothing_func()
 
-        # If all zero after smoothing => 0
-        if all(x == 0.0 for x in used_p_smoothed):
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="All corpus-level n-gram precisions are zero (after smoothing).",
-            )
-
-        # brevity penalty
-        bp = self._brevity_penalty(total_candidate_length, total_reference_length)
-
-        # Weighted geometric mean
        try:
-            log_sum = 0.0
-            for w, val in zip(used_weights, used_p_smoothed):
-                if val <= 0.0:
-                    return score_result.ScoreResult(
-                        value=0.0,
-                        name=self.name,
-                        reason="Zero precision even after smoothing in corpus BLEU"
-                    )
-                log_sum += w * math.log(val)
-            geo_mean = math.exp(log_sum)
-        except ValueError:
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="log(0) encountered in corpus BLEU"
+            bleu_value = corpus_bleu(
+                all_references,
+                all_candidates,
+                weights=used_weights,
+                smoothing_function=smoothing_func
            )
+        except ZeroDivisionError:
+            bleu_value = 0.0
 
-        bleu_score = bp * geo_mean
        return score_result.ScoreResult(
-            value=bleu_score,
+            value=bleu_value,
            name=self.name,
-            reason=f"Corpus-level BLEU (method={self.smoothing_method}): {bleu_score:.4f}",
+            reason=f"Corpus-level BLEU (nltk, method={self.smoothing_method}): {bleu_value:.4f}",
        )
diff --git a/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py b/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py
index d782e58225..a1563856a5 100644
--- a/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py
+++ b/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py
@@ -1,5 +1,9 @@
 import pytest
-from opik.evaluation.metrics.heuristics import equals, levenshtein_ratio, regex_match
+from opik.evaluation.metrics.heuristics import (
+    equals,
+    levenshtein_ratio,
+    regex_match
+)
 from opik.evaluation.metrics.score_result import ScoreResult
 from opik.evaluation.metrics.heuristics.bleu import BLEU
 
@@ -15,7 +19,6 @@ def test_evaluation__equals():
        name=metric.name, value=0.0, reason=None, metadata=None
    )
 
-
 def test_evaluation__regex_match():
    # everything that ends with 'metric'
    metric_param = ".+metric$"
@@ -28,7 +31,6 @@ def test_evaluation__regex_match():
        name=metric.name, value=0.0, reason=None, metadata=None
    )
 
-
 def test_evaluation__levenshtein_ratio():
    metric_param = "apple"
    metric = levenshtein_ratio.LevenshteinRatio()
@@ -43,39 +45,31 @@ def test_evaluation__levenshtein_ratio():
        name=metric.name, value=0.0, reason=None, metadata=None
    )
 
-
 @pytest.mark.parametrize(
-    "candidate,reference,expected",
+    "candidate,reference,expected_min,expected_max",
    [
-        # Perfect match => BLEU=1.0
-        ("The quick brown fox jumps", "The quick brown fox jumps", 1.0),
-        # Partial overlap (shorter candidate).
-        # Standard BLEU with a reference length of 9 vs. candidate length 4
-        # yields ~0.2865 if there's near-perfect n-gram match on those 4 tokens.
-        # We'll approximate that to 0.29, within ±0.05 for leniency.
-        ("The quick brown fox", "The quick brown fox jumps over the lazy dog", 0.29),
-        # Full mismatch => BLEU ~ 0.0
-        ("apple", "orange", 0.0),
-        # Single token partial => e.g. "hello" vs. "hello world"
-        # Typically ~0.3679 with standard brevity penalty => ~0.37
-        ("hello", "hello world", 0.37),
+        ("The quick brown fox jumps over the lazy dog",
+         "The quick brown fox jumps over the lazy dog", 0.99, 1.01),
+
+        ("The quick brown fox", "The quick green fox jumps over something", 0.05, 0.2),
+
+        ("apple", "orange", -0.01, 0.01),
+
+        ("hello", "hello world", 0.05, 0.5),
+
+        ("", "non-empty reference", -0.01, 0.01),
+        ("non-empty candidate", "", -0.01, 0.01),
    ],
 )
-def test_bleu_score_sentence_level(candidate, reference, expected):
-    """
-    Tests BLEU in more standard scenarios, using approximate checks.
-    We rely on approximate comparison since BLEU can differ slightly
-    depending on smoothing details. By default, BLEU uses method1 smoothing.
-    """
-    metric = BLEU()  # default n_grams=4, smoothing_method="method1"
+def test_bleu_score_sentence_level(candidate, reference, expected_min, expected_max):
+
+    metric = BLEU()
    result = metric.score(output=candidate, reference=reference)
    assert isinstance(result, ScoreResult)
-    # For approximate matching, we allow a small tolerance
-    # e.g. ±0.05 around our target
-    assert result.value == pytest.approx(
-        expected, abs=0.05
-    ), f"Got {result.value:.4f}, expected ~{expected} ± 0.05"
-
+    assert expected_min <= result.value <= expected_max, (
+        f"For candidate='{candidate}' vs reference='{reference}', "
+        f"expected BLEU in [{expected_min}, {expected_max}], got {result.value:.4f}"
+    )
 
 @pytest.mark.parametrize(
    "candidate,reference",
    [
@@ -99,7 +93,6 @@ def test_bleu_score_empty_cases(candidate, reference):
    elif not reference.strip():
        assert "Reference is empty" in res.reason
 
-
 @pytest.mark.parametrize(
    "candidate,reference,method",
    [
@@ -107,12 +100,14 @@ def test_bleu_score_empty_cases(candidate, reference):
        ("cat", "dog", "method0"),
        ("cat", "dog", "method1"),
        ("cat", "dog", "method2"),
+        # Partial overlap => might see differences among smoothing methods
        ("The cat", "cat The", "method0"),
        ("The cat", "cat The", "method1"),
        ("The cat", "cat The", "method2"),
    ],
 )
 
+
 def test_bleu_score_different_smoothing(candidate, reference, method):
    """
    Check that different smoothing yields different non-negative values.
@@ -126,27 +121,50 @@
 @pytest.mark.parametrize(
-    "candidates,references",
+    "candidates,references,expected_min,expected_max",
    [
-        # Perfect match => corpus-level BLEU=1
        (
-            ["Hello world", "The quick brown fox"],
-            [["Hello world"], ["The quick brown fox"]],
+            ["The quick brown fox jumps over the lazy dog"],
+            [["The quick brown fox jumps over the lazy dog"]],
+            0.99,
+            1.01,
+        ),
+        (
+            ["The quick brown fox", "Hello world"],
+            [
+                ["The quick green fox jumps over something"],
+                ["Hello there big world"],
+            ],
+            0.0,
+            1.0,
+        ),
+        (
+            [
+                "The quick brown fox jumps over the lazy dog",
+                "I love apples and oranges"
+            ],
+            [
+                ["The quick brown fox jumps over the lazy dog"],
+                ["I love apples and oranges so much!"]
+            ],
+            0.8,
+            1.01,
        ),
-        # Partial overlap => expect 0 < BLEU < 1
        (
-            ["Hello planet", "The quick brown cat"],
-            [["Hello world"], ["The quick brown fox"]],
+            ["", "Some text here"],
+            [["non-empty reference"], [""]],
+            -0.01,
+            0.01,
        ),
    ],
 )
-def test_bleu_score_corpus(candidates, references):
+
+def test_bleu_score_corpus(candidates, references, expected_min, expected_max):
    metric = BLEU()
    res = metric.score_corpus(outputs=candidates, references_list=references)
    assert isinstance(res, ScoreResult)
-    if candidates[0] == references[0][0] and candidates[1] == references[1][0]:
-        # perfect match => 1.0
-        assert res.value == pytest.approx(1.0, abs=1e-6)
-    else:
-        # partial => between 0 and 1
-        assert 0 < res.value < 1.0
+
+    assert expected_min <= res.value <= expected_max, (
+        f"For corpus outputs={candidates} vs references={references}, "
+        f"expected BLEU in [{expected_min}, {expected_max}], got {res.value:.4f}"
+    )
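
For reference, a minimal usage sketch of the nltk-backed BLEU metric introduced by this patch (not part of the diff itself; it assumes `nltk` is installed alongside the SDK, otherwise the constructor raises ImportError):

    # Sketch only: exercises the public API added in this patch.
    from opik.evaluation.metrics.heuristics.bleu import BLEU

    # Defaults: up to 4-grams, uniform weights, NLTK "method1" smoothing.
    metric = BLEU()

    # Single-sentence BLEU: one candidate against one (or a list of) reference(s).
    result = metric.score(
        output="The quick brown fox jumps over the lazy dog",
        reference="The quick brown fox jumps over the lazy dog",
    )
    print(result.value, result.reason)  # close to 1.0 for an exact match

    # Corpus-level BLEU: parallel lists of candidates and per-candidate reference lists.
    corpus_result = metric.score_corpus(
        outputs=["The quick brown fox", "Hello world"],
        references_list=[
            ["The quick green fox jumps over something"],
            ["Hello there big world"],
        ],
    )
    print(corpus_result.value, corpus_result.reason)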