Add quality control property for batch processing over genotypes (#76)

* add `qc_fail` property to `Series` class
* add `qc_fail` property to dictionaries returned with traits by `MultipleDicotPipeline`
* add tests for `qc_fail` property
* test quality control in notebook for multiple dicots
* look at dtype of qc_fail
* fix typing annotation for qc_fail property
talmolab · Apr 23, 2024 · 035b153 · 035b153
1 parent f8baa7b
commit 035b153
Show file tree

Hide file tree

Showing 5 changed files with 295 additions and 107 deletions.
diff --git a/notebooks/MultipleDicotPipeline.ipynb b/notebooks/MultipleDicotPipeline.ipynb
diff --git a/sleap_roots/series.py b/sleap_roots/series.py
@@ -39,6 +39,7 @@ class Series:
         series_name: Name of the series derived from the HDF5 filename.
         expected_count: Fetch the expected plant count for this series from the CSV.
         group: Group name for the series from the CSV.
+        qc_fail: Flag to indicate if the series failed QC from the CSV.
     """
 
     h5_path: Optional[str] = None
@@ -161,6 +162,20 @@ def group(self) -> str:
             print(f"No group found for series {self.series_name} in CSV.")
             return np.nan
 
+    @property
+    def qc_fail(self) -> Union[int, float]:
+        """Flag to indicate if the series failed QC from the CSV."""
+        if not self.csv_path or not Path(self.csv_path).exists():
+            print("CSV path is not set or the file does not exist.")
+            return np.nan
+        df = pd.read_csv(self.csv_path)
+        try:
+            # Match the series_name (or plant_qr_code in the CSV) to fetch the QC flag
+            return df[df["plant_qr_code"] == self.series_name]["qc_cylinder"].iloc[0]
+        except IndexError:
+            print(f"No QC flag found for series {self.series_name} in CSV.")
+            return np.nan
+
     def __len__(self) -> int:
         """Length of the series (number of images)."""
         return len(self.video)

diff --git a/sleap_roots/trait_pipelines.py b/sleap_roots/trait_pipelines.py
@@ -401,12 +401,13 @@ def compute_multiple_dicots_traits(
             csv_suffix: The suffix to append to the CSV file name. Default is ".all_frames_summary.csv".
 
         Returns:
-            A dictionary containing the series name, group, aggregated traits, and summary statistics.
+            A dictionary containing the series name, group, qc_fail, aggregated traits, and summary statistics.
         """
         # Initialize the return structure with the series name and group
         result = {
             "series": str(series.series_name),
             "group": str(series.group),
+            "qc_fail": series.qc_fail,
             "traits": {},
             "summary_stats": {},
         }
@@ -528,6 +529,11 @@ def compute_multiple_dicots_traits_for_groups(
         # Group series by their group property
         series_groups = {}
         for series in series_list:
+            # Exclude series with qc_fail flag set to 1
+            if int(series.qc_fail) == 1:
+                print(f"Skipping series '{series.series_name}' due to qc_fail flag.")
+                continue
+            # Get the group name from the series object
             group_name = str(series.group)
             if group_name not in series_groups:
                 series_groups[group_name] = {"names": [], "series": []}
@@ -733,7 +739,7 @@ def compute_batch_multiple_dicots_traits_for_groups(
 
         Returns:
             A pandas DataFrame of computed traits summarized over all frames of each
-            series. The resulting dataframe will have a row for each series and a column
+            group. The resulting dataframe will have a row for each series and a column
             for each series-level summarized trait.
 
             Summarized traits are prefixed with the trait name and an underscore,

diff --git a/tests/data/multiple_arabidopsis_11do/merged_proofread_samples_03122024.csv b/tests/data/multiple_arabidopsis_11do/merged_proofread_samples_03122024.csv
@@ -15,7 +15,7 @@
 6030_2,6030,2,h5s_predictions\ES\6030_2.h5,0,,3,,,,,,,
 6035_1,6035,1,h5s_predictions\ES\6035_1.h5,0,,3,,,,,,,
 6035_2,6035,2,h5s_predictions\ES\6035_2.h5,0,,3,,,,,,,
-6039_1,6039,1,h5s_predictions\ES\6039_1.h5,0,,2,1,1,,,,,
+6039_1,6039,1,h5s_predictions\ES\6039_1.h5,1,,2,1,1,,,,,
 6039_2,6039,2,h5s_predictions\ES\6039_2.h5,0,,3,,,,,,,
 6042_1,6042,1,h5s_predictions\ES\6042_1.h5,0,,3,,,,,,,
 6042_2,6042,2,h5s_predictions\ES\6042_2.h5,0,,3,,,,,,,

diff --git a/tests/test_series.py b/tests/test_series.py
@@ -52,7 +52,7 @@ def csv_path(tmp_path):
     # Create a dummy CSV file
     csv_path = tmp_path / "dummy.csv"
     csv_path.write_text(
-        "plant_qr_code,number_of_plants_cylinder,genotype\ndummy,10,1100\nseries2,15,Kitaake-X\n"
+        "plant_qr_code,number_of_plants_cylinder,genotype,qc_cylinder\ndummy,10,1100,0\nseries2,15,Kitaake-X,1\n"
     )
     return csv_path
 
@@ -85,6 +85,11 @@ def test_expected_count(series_instance, csv_path):
     assert series_instance.expected_count == 10
 
 
+def test_qc_cylinder(series_instance, csv_path):
+    series_instance.csv_path = csv_path
+    assert series_instance.qc_fail == 0
+
+
 def test_len():
     series = Series(video=["frame1", "frame2"])
     assert len(series) == 2