From 9af1c77cb68a31258a476e234a67d3f5e9587f53 Mon Sep 17 00:00:00 2001
From: Paroma Varma
Date: Tue, 10 Sep 2019 11:30:08 -0700
Subject: [PATCH] Ignore abstains in Scorer, change LabelModel default tie break policy (#1450)

---
 snorkel/analysis/scorer.py              |  6 +++++-
 snorkel/labeling/model/label_model.py   | 11 ++++++++---
 test/analysis/test_scorer.py            |  8 +++++++-
 test/labeling/model/test_label_model.py | 20 ++++++++++++++++++++
 4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/snorkel/analysis/scorer.py b/snorkel/analysis/scorer.py
index 131e0e86c..b8d8f43e5 100644
--- a/snorkel/analysis/scorer.py
+++ b/snorkel/analysis/scorer.py
@@ -49,7 +49,11 @@ def __init__(
             if metric not in METRICS:
                 raise ValueError(f"Unrecognized metric: {metric}")
 
-        filter_dict = {} if abstain_label is None else {"golds": [abstain_label]}
+        filter_dict = (
+            {}
+            if abstain_label is None
+            else {"golds": [abstain_label], "preds": [abstain_label]}
+        )
         self.metrics = {
             m: partial(metric_score, metric=m, filter_dict=filter_dict)
             for m in metrics
diff --git a/snorkel/labeling/model/label_model.py b/snorkel/labeling/model/label_model.py
index bbe743e51..1519066e2 100644
--- a/snorkel/labeling/model/label_model.py
+++ b/snorkel/labeling/model/label_model.py
@@ -396,7 +396,7 @@ def predict(
         self,
         L: np.ndarray,
         return_probs: Optional[bool] = False,
-        tie_break_policy: str = "random",
+        tie_break_policy: str = "abstain",
     ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
         """Return predicted labels, with ties broken according to policy.
 
@@ -446,7 +446,7 @@ def score(
         L: np.ndarray,
         Y: np.ndarray,
         metrics: Optional[List[str]] = ["accuracy"],
-        tie_break_policy: str = "random",
+        tie_break_policy: str = "abstain",
     ) -> Dict[str, float]:
         """Calculate one or more scores from user-specified and/or user-defined metrics.
 
@@ -455,7 +455,7 @@ def score(
         L
             An [n,m] matrix with values in {-1,0,1,...,k-1}
         Y
-            Gold labels associated with datapoints in L
+            Gold labels associated with data points in L
         metrics
             A list of metric names
         tie_break_policy
@@ -477,6 +477,11 @@ def score(
        >>> label_model.score(L, Y=np.array([1, 1, 1]), metrics=["f1"])
        {'f1': 0.8}
        """
+        if tie_break_policy == "abstain":  # pragma: no cover
+            logging.warning(
+                "Metrics calculated over data points with non-abstain labels only"
+            )
+
         Y_pred, Y_prob = self.predict(
             L, return_probs=True, tie_break_policy=tie_break_policy
         )
diff --git a/test/analysis/test_scorer.py b/test/analysis/test_scorer.py
index 3cf40d7c4..2de9ae4e2 100644
--- a/test/analysis/test_scorer.py
+++ b/test/analysis/test_scorer.py
@@ -68,12 +68,18 @@ def test_abstain_labels(self) -> None:
         results_expected = dict(accuracy=0.6)
         self.assertEqual(results, results_expected)
 
-        # Test abstain=-1
+        # Test abstain=-1 for gold
         scorer = Scorer(metrics=["accuracy"], abstain_label=-1)
         results = scorer.score(golds, preds, probs)
         results_expected = dict(accuracy=0.75)
         self.assertEqual(results, results_expected)
 
+        # Test abstain=-1 for preds and gold
+        abstain_preds = np.array([-1, -1, 1, 1, 0])
+        results = scorer.score(golds, abstain_preds)
+        results_expected = dict(accuracy=0.5)
+        self.assertEqual(results, results_expected)
+
         # Test abstain set to different value
         scorer = Scorer(metrics=["accuracy"], abstain_label=10)
         results = scorer.score(golds, preds, probs)
diff --git a/test/labeling/model/test_label_model.py b/test/labeling/model/test_label_model.py
index 2f9a24e2f..ede1bbc8b 100644
--- a/test/labeling/model/test_label_model.py
+++ b/test/labeling/model/test_label_model.py
@@ -240,6 +240,14 @@ def test_predict_proba(self):
         np.testing.assert_array_almost_equal(probs, true_probs)
 
     def test_predict(self):
+        # 3 LFs that always disagree/abstain leads to all abstains
+        L = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
+        label_model = LabelModel(cardinality=2, verbose=False)
+        label_model.fit(L, n_epochs=100)
+        np.testing.assert_array_almost_equal(
+            label_model.predict(L), np.array([-1, -1, -1])
+        )
+
         L = np.array([[0, 1, 0], [0, 1, 0]])
         label_model = self._set_up_model(L)
 
@@ -254,6 +262,18 @@ def test_predict(self):
         np.testing.assert_array_almost_equal(probs, true_probs)
 
     def test_score(self):
+        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
+        Y = np.array([1, 0, 1])
+        label_model = LabelModel(cardinality=2, verbose=False)
+        label_model.fit(L, n_epochs=100)
+        results = label_model.score(L, Y)
+        np.testing.assert_array_almost_equal(
+            label_model.predict(L), np.array([1, -1, 1])
+        )
+
+        results_expected = dict(accuracy=1.0)
+        self.assertEqual(results, results_expected)
+
         L = np.array([[1, 0, 1], [1, 0, 1]])
         label_model = self._set_up_model(L)
         label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))
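
A minimal usage sketch of the behavior this patch introduces (not part of the commit): with abstain_label=-1 the Scorer now filters abstains from both golds and preds before computing any metric, and LabelModel.predict/score default to tie_break_policy="abstain", so tied data points come back as -1 and score logs a warning that metrics cover non-abstain points only. The example data below is illustrative, not taken from the test suite; the import uses the module path shown in the diff.

    import numpy as np
    from snorkel.analysis.scorer import Scorer  # module path from the patched file

    golds = np.array([0, 1, 1, 0])
    preds = np.array([-1, 1, 0, 0])  # the first prediction abstains (-1)

    # abstain_label=-1 now drops points whose gold OR predicted label is -1
    scorer = Scorer(metrics=["accuracy"], abstain_label=-1)
    results = scorer.score(golds, preds)
    # accuracy is computed over the 3 remaining points: 2 correct -> ~0.667
    print(results)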