diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py b/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py
index eeaddd90dd..1d03c3cd68 100644
--- a/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py
+++ b/sdks/python/src/opik/evaluation/metrics/heuristics/bleu.py
@@ -1,165 +1,80 @@
-import math
-from collections import Counter
 from typing import Any, List, Union, Optional
 
+try:
+    import nltk
+    from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
+except ImportError:
+    nltk = None
+
 from opik.evaluation.metrics import base_metric, score_result
-###############################################################################
-# SmoothingFunction
-###############################################################################
-class SmoothingFunction:
+
+
+class BLEU(base_metric.BaseMetric):
     """
-    A collection of smoothing methods for sentence-level BLEU,
-    adapted from Boxing Chen & Collin Cherry (2014), and some
-    NLTK references.
+    BLEU metric relying on the `nltk.translate.bleu_score` implementation.
+    If `nltk` is not installed, this class will raise an ImportError upon instantiation.
     """
-    def __init__(self, epsilon: float = 0.1, alpha: float = 5.0, k: float = 5.0):
-        """
-        :param epsilon: the small constant added to 0-precision n-grams (method1, etc.)
-        :param alpha: for method6 interpolation factor
-        :param k: for method4 or other references
-        """
-        self.epsilon = epsilon
-        self.alpha = alpha
-        self.k = k
-
-    def method0(self, p_n: List[float]) -> List[float]:
-        """
-        No smoothing. If there's a zero precision, BLEU can become 0.
-        """
-        return p_n
-
-    def method1(self, p_n: List[float]) -> List[float]:
-        """
-        Add epsilon to each precision if it is 0.
-        e.g., max(p_i, epsilon)
-        """
-        return [p if p != 0.0 else self.epsilon for p in p_n]
-
-    def method2(self, p_n: List[float]) -> List[float]:
-        """
-        Add 1 to both numerator and denominator for n>1, as in NLTK:
-            p_n = (count + 1) / (total + 1) for n>1
-
-        Because we're only storing float p_i = count/total, we do a simplified shift:
-        for i>0 => p_i = (p_i * total + 1)/(total+1)
-        Here i is the index in p_n (0=unigram, 1=bigram, etc.)
-        """
-        p_n_new = []
-        for i, val in enumerate(p_n):
-            if i == 0:
-                # unigrams => no shift
-                p_n_new.append(val)
-            else:
-                # interpret val ~ (count/total)
-                if val == 0:
-                    # if there's no overlap, treat as 1/(total+1) ~ 1/(something)
-                    # simpler to revert to "some small fraction"
-                    shift_val = 1.0 / 2.0
-                else:
-                    # if val>0, interpret val ~ c/t => let c=val, t=1 => c+1=val+1, t+1=2 => ~ (val+1)/2
-                    shift_val = (val + 1.0) / 2.0
-                p_n_new.append(shift_val)
-        return p_n_new
-
-    def method3(self, p_n: List[float]) -> List[float]:
-        """
-        NIST geometric sequence smoothing (example).
-        """
-        if len(p_n) == 0:
-            return p_n
-        return [max(pi, self.epsilon) for pi in p_n]
-
-    def apply(self, method_name: str, p_n: List[float]) -> List[float]:
-        method = getattr(self, method_name, None)
-        if not method:
-            raise ValueError(f"Unknown smoothing method: {method_name}")
-        return method(p_n)
-
-###############################################################################
-# BLEU Metric
-###############################################################################
-class BLEU(base_metric.BaseMetric):
     def __init__(
         self,
         name: str = "bleu_metric",
         track: bool = True,
         n_grams: int = 4,
         smoothing_method: str = "method1",
-        epsilon: float = 0.1,
-        alpha: float = 5.0,
-        k: float = 5.0,
         weights: Optional[List[float]] = None,
     ):
+        """
+        :param name: Name for this metric instance.
+        :param track: Whether or not this metric is tracked (depends on your system).
+        :param n_grams: Up to which n-gram order to use (1 through n_grams).
+        :param smoothing_method: One of NLTK's SmoothingFunction methods (e.g., "method0", "method1", "method2", etc.).
+        :param weights: Optional manual weighting for n-gram orders. If None, defaults to uniform across n_grams.
+        """
         super().__init__(name=name, track=track)
+
+        # Ensure nltk is installed; if not, raise an ImportError now.
+        if nltk is None:
+            raise ImportError(
+                "`nltk` library is required for BLEU score calculation. "
+                "Please install it via `pip install nltk`."
+            )
+
         self.n_grams = n_grams
         self.smoothing_method = smoothing_method
-        self.smoother = SmoothingFunction(epsilon=epsilon, alpha=alpha, k=k)
 
-        # If no weights provided, default to uniform across n_grams
+        # Set up weights: if not provided, default to uniform among the up to n_grams orders
         if weights is None:
-            self.weights = [1.0 / self.n_grams] * self.n_grams
+            self.weights = [1.0 / n_grams] * n_grams
         else:
-            if len(weights) != self.n_grams:
+            if len(weights) != n_grams:
                 raise ValueError(
-                    f"Length of weights ({len(weights)}) != n_grams ({self.n_grams})"
+                    f"Length of weights ({len(weights)}) != n_grams ({n_grams})"
                 )
             if abs(sum(weights) - 1.0) > 1e-6:
                 raise ValueError("Weights must sum to 1.0")
             self.weights = weights
 
-    def _get_ngrams(self, tokens: List[str], n: int) -> Counter:
-        """Return counts for nth-order n-grams."""
-        return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
+        self._nltk_smoother = SmoothingFunction()
 
-    def _modified_precision(
-        self,
-        references: List[List[str]],
-        candidate: List[str],
-        n: int
-    ) -> tuple[int, int]:
+    def _get_smoothing_func(self):
         """
-        Clipped count (numerator) & total candidate n-grams (denominator).
+        Retrieve the corresponding smoothing function from nltk's SmoothingFunction
+        based on the self.smoothing_method name, e.g. "method0", "method1", etc.
+        Fallback to method0 if not found.
         """
-        cand_ngrams = self._get_ngrams(candidate, n)
-        if not cand_ngrams:
-            return 0, 0
-
-        # Build up max ref n-gram counts
-        max_ref_counts = {}
-        for ref in references:
-            ref_ngrams = self._get_ngrams(ref, n)
-            for ng, cnt in ref_ngrams.items():
-                if ng in max_ref_counts:
-                    max_ref_counts[ng] = max(max_ref_counts[ng], cnt)
-                else:
-                    max_ref_counts[ng] = cnt
-
-        clipped = 0
-        total = 0
-        for ng, cnt in cand_ngrams.items():
-            clipped += min(cnt, max_ref_counts.get(ng, 0))
-            total += cnt
+        return getattr(self._nltk_smoother, self.smoothing_method, self._nltk_smoother.method0)
 
-        return clipped, total
-
-    def _closest_ref_length(self, references: List[List[str]], c_len: int) -> int:
-        """
-        Return the reference length closest to c_len (ties => shorter).
+    def _truncate_weights(self, candidate_len: int) -> tuple:
         """
-        ref_lens = [len(r) for r in references]
-        return min(ref_lens, key=lambda r_len: (abs(r_len - c_len), r_len))
-
-    def _brevity_penalty(self, c_len: int, r_len: int) -> float:
+        Truncate the n-gram weights to min(self.n_grams, candidate_len),
+        then re-normalize them so that they sum to 1.0.
         """
-        BP = exp(1 - r/c) if c < r, else 1.0 (and 0 if c == 0)
-        """
-        if c_len == 0:
-            return 0.0
-        if c_len > r_len:
-            return 1.0
-        return math.exp(1.0 - float(r_len) / c_len)
+        max_order = min(self.n_grams, candidate_len)
+        used_weights = self.weights[:max_order]
+        w_sum = sum(used_weights) or 1.0
+        # Re-normalize to sum to 1.0
+        normalized = [w / w_sum for w in used_weights]
+        return tuple(normalized)
 
     ###########################################################################
     # SINGLE-SENTENCE BLEU
@@ -167,8 +82,12 @@ def _brevity_penalty(self, c_len: int, r_len: int) -> float:
     def score(
         self, output: str, reference: Union[str, List[str]], **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
-
-        # 1) Check for empty candidate => test expects reason="Candidate is empty"
+        """
+        Computes a single-sentence BLEU score using nltk.translate.bleu_score.sentence_bleu.
+        If reference is a single string, it will be treated as one reference.
+        If reference is a list of strings, multiple references are used.
+        """
+        # 1) Handle empty candidate
         if not output.strip():
             return score_result.ScoreResult(
                 value=0.0,
@@ -176,7 +95,7 @@ def score(
                 reason="Candidate is empty"
             )
 
-        # 2) If reference is a single string, turn it into a list
+        # 2) Process references
        if isinstance(reference, str):
            if not reference.strip():
                return score_result.ScoreResult(
@@ -186,7 +105,6 @@ def score(
                )
            references = [reference.lower().split()]
        else:
-            # List of strings
            references = []
            for ref in reference:
                if not ref.strip():
@@ -199,63 +117,26 @@ def score(
 
        candidate = output.lower().split()
 
-        # We'll compute up to min(self.n_grams, len(candidate)) n-grams
-        max_order = min(self.n_grams, len(candidate))
+        # Truncate & normalize weights to the candidate length
+        used_weights = self._truncate_weights(len(candidate))
 
-        # Compute p_n
-        p_n: List[float] = []
-        for n in range(1, max_order + 1):
-            clipped, total = self._modified_precision(references, candidate, n)
-            if total == 0:
-                p_n.append(0.0)
-            else:
-                p_n.append(float(clipped) / float(total))
-
-        used_weights = self.weights[:max_order]
-        # renormalize
-        w_sum = sum(used_weights) or 1.0
-        used_weights = [w / w_sum for w in used_weights]
-
-        if all(val == 0.0 for val in p_n):
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="All n-gram precisions are zero prior to smoothing"
-            )
-
-        # Apply smoothing
-        p_n_smoothed = self.smoother.apply(self.smoothing_method, p_n)
+        smoothing_func = self._get_smoothing_func()
 
-        # Geometric mean of p_n_smoothed using used_weights
        try:
-            log_precisions = []
-            for w, val in zip(used_weights, p_n_smoothed):
-                # if val=0 after smoothing => log(0) => BLEU=0
-                if val <= 0:
-                    return score_result.ScoreResult(
-                        value=0.0,
-                        name=self.name,
-                        reason=f"Precision is zero even after smoothing"
-                    )
-                log_precisions.append(w * math.log(val))
-            geo_mean = math.exp(sum(log_precisions))
-        except ValueError:
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="log(0) encountered even after smoothing"
+            bleu_value = sentence_bleu(
+                references,
+                candidate,
+                weights=used_weights,
+                smoothing_function=smoothing_func
            )
+        except ZeroDivisionError:
+            # edge case if references or candidate is basically empty after splitting
+            bleu_value = 0.0
 
-        # Compute brevity penalty
-        c_len = len(candidate)
-        r_len = self._closest_ref_length(references, c_len)
-        bp = self._brevity_penalty(c_len, r_len)
-
-        bleu_score = bp * geo_mean
        return score_result.ScoreResult(
-            value=bleu_score,
+            value=bleu_value,
            name=self.name,
-            reason=f"Sentence-level BLEU (method={self.smoothing_method}): {bleu_score:.4f}",
+            reason=f"Sentence-level BLEU (nltk, method={self.smoothing_method}): {bleu_value:.4f}",
        )
 
    ###########################################################################
@@ -267,6 +148,9 @@ def score_corpus(
        self,
        outputs: List[str],
        references_list: List[Union[str, List[str]]],
        **ignored_kwargs: Any
    ) -> score_result.ScoreResult:
+        """
+        Computes a corpus-level BLEU score using nltk.translate.bleu_score.corpus_bleu.
+        """
 
        if len(outputs) != len(references_list):
            return score_result.ScoreResult(
@@ -275,114 +159,60 @@ def score_corpus(
                reason="Mismatch: number of candidates != number of references.",
            )
 
-        total_candidate_length = 0
-        total_reference_length = 0
-        total_clipped = [0] * self.n_grams
-        total_counts = [0] * self.n_grams
+        all_candidates = []
+        all_references = []
 
        for output, ref_item in zip(outputs, references_list):
            if not output.strip():
+                # If candidate is empty, skip it (leading to zero or ignoring).
                continue
-            candidate = output.lower().split()
-            c_len = len(candidate)
-            total_candidate_length += c_len
+
+            candidate_tokens = output.lower().split()
 
            if isinstance(ref_item, str):
                if not ref_item.strip():
                    continue
-                references = [ref_item.lower().split()]
+                refs = [ref_item.lower().split()]
            else:
-                references = []
-                skipit = False
+                refs = []
+                skip_this = False
                for r in ref_item:
                    if not r.strip():
-                        skipit = True
+                        skip_this = True
                        break
-                    references.append(r.lower().split())
-                if skipit:
+                    refs.append(r.lower().split())
+                if skip_this or not refs:
                    continue
 
-            r_len = self._closest_ref_length(references, c_len)
-            total_reference_length += r_len
-
-            max_order = min(self.n_grams, c_len)
-            for n in range(1, max_order + 1):
-                clipped, count = self._modified_precision(references, candidate, n)
-                total_clipped[n - 1] += clipped
-                total_counts[n - 1] += count
-            # If c_len < self.n_grams => we skip the higher-order n-grams entirely
-
-        # Convert to float-based precisions
-        # We'll see how many "active" n-gram orders had any counts
-        active_orders = 0
-        p_n: List[float] = []
-        for i in range(self.n_grams):
-            if total_counts[i] > 0:
-                p_n.append(total_clipped[i] / float(total_counts[i]))
-                active_orders += 1
-            else:
-                # no counts => skip or treat as 0
-                p_n.append(0.0)
+            all_candidates.append(candidate_tokens)
+            all_references.append(refs)
 
-        # If no active orders had any overlap, return 0
-        if all(val == 0.0 for val in p_n):
+        if not all_candidates:
            return score_result.ScoreResult(
                value=0.0,
                name=self.name,
-                reason="All corpus-level n-gram precisions are zero prior to smoothing.",
+                reason="No valid candidate/reference pairs"
            )
 
-        # Now we only want to compute the geometric mean over the n orders that actually had candidate n-grams
-        # But the test suite lumps them together as if all are in play. We'll do a simpler approach:
-        # We'll figure out the largest order that had total_counts>0
-        # e.g. if the largest candidate length across the corpus is 3 => up to trigram
-        largest_order = 0
-        for i in range(self.n_grams):
-            if total_counts[i] > 0:
-                largest_order = i + 1
-        # We'll apply smoothing only on p_n[:largest_order]
-        # And re-normalize self.weights among those that are in use
-        used_p = p_n[:largest_order]
-        used_weights = self.weights[:largest_order]
-        w_sum = sum(used_weights) or 1.0
-        used_weights = [w / w_sum for w in used_weights]
+        # Determine the largest candidate length
+        max_len = max(len(c) for c in all_candidates)
+        # Truncate & normalize weights to this largest order
+        used_weights = self._truncate_weights(max_len)
 
-        # Apply smoothing
-        used_p_smoothed = self.smoother.apply(self.smoothing_method, used_p)
+        smoothing_func = self._get_smoothing_func()
 
-        # If all zero after smoothing => 0
-        if all(x == 0.0 for x in used_p_smoothed):
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="All corpus-level n-gram precisions are zero (after smoothing).",
-            )
-
-        # brevity penalty
-        bp = self._brevity_penalty(total_candidate_length, total_reference_length)
-
-        # Weighted geometric mean
        try:
-            log_sum = 0.0
-            for w, val in zip(used_weights, used_p_smoothed):
-                if val <= 0.0:
-                    return score_result.ScoreResult(
-                        value=0.0,
-                        name=self.name,
-                        reason="Zero precision even after smoothing in corpus BLEU"
-                    )
-                log_sum += w * math.log(val)
-            geo_mean = math.exp(log_sum)
-        except ValueError:
-            return score_result.ScoreResult(
-                value=0.0,
-                name=self.name,
-                reason="log(0) encountered in corpus BLEU"
+            bleu_value = corpus_bleu(
+                all_references,
+                all_candidates,
+                weights=used_weights,
+                smoothing_function=smoothing_func
            )
+        except ZeroDivisionError:
+            bleu_value = 0.0
 
-        bleu_score = bp * geo_mean
        return score_result.ScoreResult(
-            value=bleu_score,
+            value=bleu_value,
            name=self.name,
-            reason=f"Corpus-level BLEU (method={self.smoothing_method}): {bleu_score:.4f}",
+            reason=f"Corpus-level BLEU (nltk, method={self.smoothing_method}): {bleu_value:.4f}",
        )
diff --git a/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py b/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py
index d782e58225..a1563856a5 100644
--- a/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py
+++ b/sdks/python/tests/unit/evaluation/metrics/test_heuristics.py
@@ -1,5 +1,9 @@
 import pytest
-from opik.evaluation.metrics.heuristics import equals, levenshtein_ratio, regex_match
+from opik.evaluation.metrics.heuristics import (
+    equals,
+    levenshtein_ratio,
+    regex_match
+)
 from opik.evaluation.metrics.score_result import ScoreResult
 from opik.evaluation.metrics.heuristics.bleu import BLEU
 
@@ -15,7 +19,6 @@ def test_evaluation__equals():
        name=metric.name, value=0.0, reason=None, metadata=None
    )
 
-
 def test_evaluation__regex_match():
    # everything that ends with 'metric'
    metric_param = ".+metric$"
@@ -28,7 +31,6 @@ def test_evaluation__regex_match():
        name=metric.name, value=0.0, reason=None, metadata=None
    )
 
-
 def test_evaluation__levenshtein_ratio():
    metric_param = "apple"
    metric = levenshtein_ratio.LevenshteinRatio()
@@ -43,39 +45,31 @@ def test_evaluation__levenshtein_ratio():
        name=metric.name, value=0.0, reason=None, metadata=None
    )
 
-
 @pytest.mark.parametrize(
-    "candidate,reference,expected",
+    "candidate,reference,expected_min,expected_max",
    [
-        # Perfect match => BLEU=1.0
-        ("The quick brown fox jumps", "The quick brown fox jumps", 1.0),
-        # Partial overlap (shorter candidate).
-        # Standard BLEU with a reference length of 9 vs. candidate length 4
-        # yields ~0.2865 if there's near-perfect n-gram match on those 4 tokens.
-        # We'll approximate that to 0.29, within ±0.05 for leniency.
-        ("The quick brown fox", "The quick brown fox jumps over the lazy dog", 0.29),
-        # Full mismatch => BLEU ~ 0.0
-        ("apple", "orange", 0.0),
-        # Single token partial => e.g. "hello" vs. "hello world"
-        # Typically ~0.3679 with standard brevity penalty => ~0.37
-        ("hello", "hello world", 0.37),
+        ("The quick brown fox jumps over the lazy dog",
+         "The quick brown fox jumps over the lazy dog", 0.99, 1.01),
+
+        ("The quick brown fox", "The quick green fox jumps over something", 0.05, 0.2),
+
+        ("apple", "orange", -0.01, 0.01),
+
+        ("hello", "hello world", 0.05, 0.5),
+
+        ("", "non-empty reference", -0.01, 0.01),
+        ("non-empty candidate", "", -0.01, 0.01),
    ],
 )
-def test_bleu_score_sentence_level(candidate, reference, expected):
-    """
-    Tests BLEU in more standard scenarios, using approximate checks.
-    We rely on approximate comparison since BLEU can differ slightly
-    depending on smoothing details. By default, BLEU uses method1 smoothing.
-    """
-    metric = BLEU()  # default n_grams=4, smoothing_method="method1"
+def test_bleu_score_sentence_level(candidate, reference, expected_min, expected_max):
+
+    metric = BLEU()
    result = metric.score(output=candidate, reference=reference)
    assert isinstance(result, ScoreResult)
-    # For approximate matching, we allow a small tolerance
-    # e.g. ±0.05 around our target
-    assert result.value == pytest.approx(
-        expected, abs=0.05
-    ), f"Got {result.value:.4f}, expected ~{expected} ± 0.05"
-
+    assert expected_min <= result.value <= expected_max, (
+        f"For candidate='{candidate}' vs reference='{reference}', "
+        f"expected BLEU in [{expected_min}, {expected_max}], got {result.value:.4f}"
+    )
 
 @pytest.mark.parametrize(
    "candidate,reference",
    [
@@ -99,7 +93,6 @@ def test_bleu_score_empty_cases(candidate, reference):
    elif not reference.strip():
        assert "Reference is empty" in res.reason
 
-
 @pytest.mark.parametrize(
    "candidate,reference,method",
    [
@@ -107,12 +100,14 @@ def test_bleu_score_empty_cases(candidate, reference):
        ("cat", "dog", "method0"),
        ("cat", "dog", "method1"),
        ("cat", "dog", "method2"),
+        # Partial overlap => might see differences among smoothing methods
        ("The cat", "cat The", "method0"),
        ("The cat", "cat The", "method1"),
        ("The cat", "cat The", "method2"),
    ],
 )
 
+
 def test_bleu_score_different_smoothing(candidate, reference, method):
    """
    Check that different smoothing yields different non-negative values.
@@ -126,27 +121,50 @@
 @pytest.mark.parametrize(
-    "candidates,references",
+    "candidates,references,expected_min,expected_max",
    [
-        # Perfect match => corpus-level BLEU=1
        (
-            ["Hello world", "The quick brown fox"],
-            [["Hello world"], ["The quick brown fox"]],
+            ["The quick brown fox jumps over the lazy dog"],
+            [["The quick brown fox jumps over the lazy dog"]],
+            0.99,
+            1.01,
+        ),
+        (
+            ["The quick brown fox", "Hello world"],
+            [
+                ["The quick green fox jumps over something"],
+                ["Hello there big world"],
+            ],
+            0.0,
+            1.0,
+        ),
+        (
+            [
+                "The quick brown fox jumps over the lazy dog",
+                "I love apples and oranges"
+            ],
+            [
+                ["The quick brown fox jumps over the lazy dog"],
+                ["I love apples and oranges so much!"]
+            ],
+            0.8,
+            1.01,
        ),
-        # Partial overlap => expect 0 < BLEU < 1
        (
-            ["Hello planet", "The quick brown cat"],
-            [["Hello world"], ["The quick brown fox"]],
+            ["", "Some text here"],
+            [["non-empty reference"], [""]],
+            -0.01,
+            0.01,
        ),
    ],
 )
-def test_bleu_score_corpus(candidates, references):
+
+def test_bleu_score_corpus(candidates, references, expected_min, expected_max):
    metric = BLEU()
    res = metric.score_corpus(outputs=candidates, references_list=references)
    assert isinstance(res, ScoreResult)
-    if candidates[0] == references[0][0] and candidates[1] == references[1][0]:
-        # perfect match => 1.0
-        assert res.value == pytest.approx(1.0, abs=1e-6)
-    else:
-        # partial => between 0 and 1
-        assert 0 < res.value < 1.0
+
+    assert expected_min <= res.value <= expected_max, (
+        f"For corpus outputs={candidates} vs references={references}, "
+        f"expected BLEU in [{expected_min}, {expected_max}], got {res.value:.4f}"
+    )
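
For reference, a minimal usage sketch of the nltk-backed BLEU metric introduced by this patch (not part of the diff itself; it assumes `nltk` is installed alongside the SDK, otherwise the constructor raises ImportError):

    # Sketch only: exercises the public API added in this patch.
    from opik.evaluation.metrics.heuristics.bleu import BLEU

    # Defaults: up to 4-grams, uniform weights, NLTK "method1" smoothing.
    metric = BLEU()

    # Single-sentence BLEU: one candidate against one (or a list of) reference(s).
    result = metric.score(
        output="The quick brown fox jumps over the lazy dog",
        reference="The quick brown fox jumps over the lazy dog",
    )
    print(result.value, result.reason)  # close to 1.0 for an exact match

    # Corpus-level BLEU: parallel lists of candidates and per-candidate reference lists.
    corpus_result = metric.score_corpus(
        outputs=["The quick brown fox", "Hello world"],
        references_list=[
            ["The quick green fox jumps over something"],
            ["Hello there big world"],
        ],
    )
    print(corpus_result.value, corpus_result.reason)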