From f5dab2b5a0a427cf2b4a628048e9af2b5b9d8bf6 Mon Sep 17 00:00:00 2001
From: eberrigan <106755962+eberrigan@users.noreply.github.com>
Date: Wed, 5 Jul 2023 13:24:00 -0700
Subject: [PATCH] Add main function for high level pipeline (#41)

* Update base_length_ratio results to scalar and return nan if length=0.

* convert the data in scalar column to the value without [].

* Modify stem width and base-related functions for rice traits.

* change the argument name of 'lateral_only' to 'monocots'.

* change the argument name of 'lateral_only' to 'monocots' (cont.).

* Add function for getting final csv from multiple plants `get_all_plants_traits`

* Add tests for all plant summary

* Modified overwrite parameter to write_per_plant

* Changed plant_name to just the h5 name and not full path

* modify functions to set nan values for base-related traits for rice.

* Modify `get_base_median_ratio` function for all nans.

* Modify `get_traits_value_plant_summary` for traits with no value [].

* Delete monocot test for now

* Fix docstring indentations

* Fix graphpipeline tests

* Add rice test

* Add warning filter for ellipse module

* Add option to change csv suffixes. Import modules in __init__.

* Lint

* Update comment

---------

Co-authored-by: Lin Wang <linwang9926@gmail.com>
Co-authored-by: Talmo Pereira <talmo@salk.edu>
---
 sleap_roots/__init__.py      |   7 +-
 sleap_roots/graphpipeline.py | 169 +++++++++++++++++++++++++++--------
 sleap_roots/series.py        |   2 +-
 tests/test_graphpipeline.py  |  42 ++++++++-
 4 files changed, 178 insertions(+), 42 deletions(-)

diff --git a/sleap_roots/__init__.py b/sleap_roots/__init__.py
index 323e15d..3f0c719 100644
--- a/sleap_roots/__init__.py
+++ b/sleap_roots/__init__.py
@@ -6,10 +6,15 @@
 import sleap_roots.convhull
 import sleap_roots.ellipse
 import sleap_roots.networklength
+import sleap_roots.points
 import sleap_roots.scanline
 import sleap_roots.series
+import sleap_roots.summary
+import sleap_roots.traitsgraph
+import sleap_roots.graphpipeline
+from sleap_roots.graphpipeline import get_all_plants_traits
 from sleap_roots.series import Series
 
 # Define package version.
-# This is read dynamically by setuptools in setup.cfg to determine the release version.
+# This is read dynamically by setuptools in pyproject.toml to determine the release version.
 __version__ = "0.0.1"
diff --git a/sleap_roots/graphpipeline.py b/sleap_roots/graphpipeline.py
index cd30e8a..a6e3200 100644
--- a/sleap_roots/graphpipeline.py
+++ b/sleap_roots/graphpipeline.py
@@ -3,6 +3,9 @@
 import numpy as np
 import pandas as pd
 import os
+from typing import List
+from fractions import Fraction
+from pathlib import Path
 from sleap_roots.traitsgraph import get_traits_graph
 from sleap_roots.angle import get_root_angle
 from sleap_roots.bases import (
@@ -47,7 +50,7 @@
     get_scanline_first_ind,
     get_scanline_last_ind,
 )
-from sleap_roots.series import Series
+from sleap_roots.series import Series, find_all_series
 from sleap_roots.summary import get_summary
 from sleap_roots.tips import get_tips, get_tip_xs, get_tip_ys
 from typing import Dict, Tuple
@@ -128,6 +131,12 @@
     message="invalid value encountered in double_scalars",
     category=RuntimeWarning,
 )
+warnings.filterwarnings(
+    "ignore",
+    message="invalid value encountered in scalar divide",
+    category=RuntimeWarning,
+    module="ellipse",
+)
 
 
 def get_traits_value_frame(
@@ -293,27 +302,36 @@ def get_traits_value_plant(
     n_line: int = 50,
     network_fraction: float = 2 / 3,
     write_csv: bool = False,
-    csv_name: str = "plant_original_traits.csv",
-) -> Tuple[Dict, pd.DataFrame]:
-    """Get SLEAP traits per plant based on graph.
+    csv_suffix: str = ".traits.csv",
+) -> Tuple[Dict, pd.DataFrame, str]:
+    """Get detailed SLEAP traits for every frame of a plant, based on the graph.
 
     Args:
-        h5: h5 file, plant image series.
-        monocots: Boolean value, where false is dicot (default), true is rice.
-        primary_name: primary model name.
-        lateral_name: lateral model name.
-        stem_width_tolerance: difference in projection norm between right and left side.
-        n_line: number of scan lines, np.nan for no interaction.
-        network_fraction: length found in the lower fration value of the network.
-        write_csv: Boolean value, where true is write csv file.
-        csv_name: saved csv file name.
-
-    Return:
-        Tuple of a dictionary and a DataFrame with all traits per plant.
+        h5: The h5 file representing the plant image series.
+        monocots: A boolean value indicating whether the plant is a monocot (True)
+            or a dicot (False) (default).
+        primary_name: Name of the primary root predictions. The predictions file is
+            expected to be named `"{h5_path}.{primary_name}.predictions.slp"`.
+        lateral_name: Name of the lateral root predictions. The predictions file is
+            expected to be named `"{h5_path}.{lateral_name}.predictions.slp"`.
+        stem_width_tolerance: The difference in the projection norm between
+            the right and left side of the stem.
+        n_line: The number of scan lines. Use np.nan for no interaction.
+        network_fraction: The length found in the lower fraction value of the network.
+        write_csv: A boolean value. If True, it writes per plant detailed
+            CSVs with traits for every instance on every frame.
+        csv_suffix: If write_csv=True, the CSV file will be saved with the
+            h5 path + csv_suffix.
+
+    Returns:
+        A tuple containing a dictionary and a DataFrame with all traits per plant,
+            and the plant name. The Dataframe has root traits per instance and frame
+            where each row corresponds to a frame in the H5 file. The plant_name is
+            given by the h5 file.
     """
     plant = Series.load(h5, primary_name=primary_name, lateral_name=lateral_name)
     plant_name = plant.series_name
-    # get nymber of frames per plant
+    # get number of frames per plant
     n_frame = len(plant)
 
     data_plant = []
@@ -384,9 +402,9 @@ def get_traits_value_plant(
     )
 
     if write_csv:
-        csv_name = "plant_original_traits_" + plant_name + ".csv"
+        csv_name = Path(h5).with_suffix(f"{csv_suffix}")
         data_plant_df.to_csv(csv_name, index=False)
-    return data_plant, data_plant_df
+    return data_plant, data_plant_df, plant_name
 
 
 def get_traits_value_plant_summary(
@@ -398,29 +416,37 @@ def get_traits_value_plant_summary(
     n_line: int = 50,
     network_fraction: float = 2 / 3,
     write_csv: bool = False,
-    csv_name: str = "plant_original_traits.csv",
+    csv_suffix: str = ".traits.csv",
     write_summary_csv: bool = False,
-    summary_csv_name: str = "plant_summary_traits.csv",
+    summary_csv_suffix: str = ".summary_traits.csv",
 ) -> pd.DataFrame:
-    """Get summarized SLEAP traits per plant based on graph.
+    """Get summary statistics of SLEAP traits per plant based on graph.
 
     Args:
-        h5: h5 file, plant image series.
-        monocots: Boolean value, where false is dicot (default), true is rice.
-        primary_name: primary model name.
-        lateral_name: lateral model name.
-        stem_width_tolerance: difference in projection norm between right and left side.
-        n_line: number of scan lines, np.nan for no interaction.
-        network_fraction: length found in the lower fration value of the network.
-        write_csv: Boolean value, where true is write csv file.
-        csv_name: saved csv file name.
+        h5: The h5 file representing the plant image series.
+        monocots: A boolean value indicating whether the plant is a monocot (True)
+            or a dicot (False) (default).
+        primary_name: Name of the primary root predictions. The predictions file is
+            expected to be named `"{h5_path}.{primary_name}.predictions.slp"`.
+        lateral_name: Name of the lateral root predictions. The predictions file is
+            expected to be named `"{h5_path}.{lateral_name}.predictions.slp"`.
+        stem_width_tolerance: The difference in the projection norm between
+            the right and left side of the stem.
+        n_line: The number of scan lines. Use np.nan for no interaction.
+        network_fraction: The length found in the lower fraction value of the network.
+        write_csv: A boolean value. If True, it writes per plant detailed
+            CSVs with traits for every instance on every frame.
+        csv_suffix: If write_csv=True, the CSV file will be saved with the name
+            h5 path + csv_suffix.
         write_summary_csv: Boolean value, where true is write summarized csv file.
-        summary_csv_name: saved summarized csv file name.
+        summary_csv_suffix: If write_summary_csv=True, the CSV file with the summary
+            statistics per plant will be saved with the name
+            h5 path + summary_csv_suffix.
 
     Return:
-        A DataFrame with all summarized traits per plant.
+        A DataFrame with summary statistics of all traits per plant.
     """
-    data_plant, data_plant_df = get_traits_value_plant(
+    data_plant, data_plant_df, plant_name = get_traits_value_plant(
         h5,
         monocots,
         primary_name,
@@ -429,7 +455,7 @@ def get_traits_value_plant_summary(
         n_line,
         network_fraction,
         write_csv,
-        csv_name,
+        csv_suffix,
     )
 
     # get summarized non-scalar traits per frame
@@ -602,7 +628,7 @@ def get_traits_value_plant_summary(
         data_plant_frame_summary[
             data_plant_frame_summary_key[j] + "_prc95"
         ] = trait_prc95
-    data_plant_frame_summary["plant_name"] = [os.path.splitext(h5)[0]]
+    data_plant_frame_summary["plant_name"] = [plant_name]
     data_plant_frame_summary_df = pd.DataFrame(data_plant_frame_summary)
 
     # reorganize the column position
@@ -611,5 +637,76 @@ def get_traits_value_plant_summary(
     data_plant_frame_summary_df = data_plant_frame_summary_df[column_names]
 
     if write_summary_csv:
+        summary_csv_name = Path(h5).with_suffix(f"{summary_csv_suffix}")
         data_plant_frame_summary_df.to_csv(summary_csv_name, index=False)
     return data_plant_frame_summary_df
+
+
+def get_all_plants_traits(
+    data_folders: List[str],
+    primary_name: str,
+    lateral_name: str,
+    stem_width_tolerance: float = 0.02,
+    n_line: int = 50,
+    network_fraction: Fraction = Fraction(2, 3),
+    write_per_plant_details: bool = False,
+    per_plant_details_csv_suffix: str = ".traits.csv",
+    write_per_plant_summary: bool = False,
+    per_plant_summary_csv_suffix: str = ".summary_traits.csv",
+    monocots: bool = False,
+    all_plants_csv_name: str = "all_plants_traits.csv",
+) -> pd.DataFrame:
+    """Get a DataFrame with summary traits from all plants in the given data folders.
+
+    Args:
+        h5: The h5 file representing the plant image series.
+        monocots: A boolean value indicating whether the plant is a monocot (True)
+            or a dicot (False) (default).
+        primary_name: Name of the primary root predictions. The predictions file is
+            expected to be named `"{h5_path}.{primary_name}.predictions.slp"`.
+        lateral_name: Name of the lateral root predictions. The predictions file is
+            expected to be named `"{h5_path}.{lateral_name}.predictions.slp"`.
+        stem_width_tolerance: The difference in the projection norm between
+            the right and left side of the stem.
+        n_line: The number of scan lines. Use np.nan for no interaction.
+        network_fraction: The length found in the lower fraction value of the network.
+        write_per_plant_details: A boolean value. If True, it writes per plant detailed
+            CSVs with traits for every instance.
+        per_plant_details_csv_suffix: If write_csv=True, the CSV file will be saved
+            with the name h5 path + csv_suffix.
+        write_per_plant_summary: A boolean value. If True, it writes per plant summary
+            CSVs.
+        per_plant_summary_csv_suffix: If write_summary_csv=True, the CSV file with the
+            summary statistics per plant will be saved with the name
+            h5 path + summary_csv_suffix.
+        all_plants_csv_name: The name of the output CSV file containing all plants'
+            summary traits.
+
+    Returns:
+        A pandas DataFrame with summary root traits for all plants in the data folders.
+        Each row is a sample.
+    """
+    h5_series = find_all_series(data_folders)
+
+    all_traits = []
+    for h5 in h5_series:
+        plant_traits = get_traits_value_plant_summary(
+            h5,
+            monocots=monocots,
+            primary_name=primary_name,
+            lateral_name=lateral_name,
+            stem_width_tolerance=stem_width_tolerance,
+            n_line=n_line,
+            network_fraction=network_fraction,
+            write_csv=write_per_plant_details,
+            csv_suffix=per_plant_details_csv_suffix,
+            write_summary_csv=write_per_plant_summary,
+            summary_csv_suffix=per_plant_summary_csv_suffix,
+        )
+        plant_traits["path"] = h5
+        all_traits.append(plant_traits)
+
+    all_traits_df = pd.concat(all_traits, ignore_index=True)
+
+    all_traits_df.to_csv(all_plants_csv_name, index=False)
+    return all_traits_df
diff --git a/sleap_roots/series.py b/sleap_roots/series.py
index 1e74e04..b40db14 100644
--- a/sleap_roots/series.py
+++ b/sleap_roots/series.py
@@ -57,7 +57,7 @@ def load(
     @property
     def series_name(self) -> str:
         """Name of the series derived from the HDF5 filename."""
-        return Path(self.h5_path).stem
+        return Path(self.h5_path).name.split(".")[0]
 
     @property
     def video(self) -> sio.Video:
diff --git a/tests/test_graphpipeline.py b/tests/test_graphpipeline.py
index 960aa1a..befc797 100644
--- a/tests/test_graphpipeline.py
+++ b/tests/test_graphpipeline.py
@@ -2,9 +2,11 @@
     get_traits_value_frame,
     get_traits_value_plant,
     get_traits_value_plant_summary,
+    get_all_plants_traits,
 )
 import pytest
 import numpy as np
+import pandas as pd
 
 
 @pytest.fixture
@@ -70,7 +72,7 @@ def test_get_traits_value_frame(primary_pts, lateral_pts):
 def test_get_traits_value_plant(canola_h5):
     monocots = False
 
-    data_plant, data_plant_df = get_traits_value_plant(
+    data_plant, data_plant_df, plant_name = get_traits_value_plant(
         canola_h5,
         monocots,
         primary_name="primary_multi_day",
@@ -79,10 +81,10 @@ def test_get_traits_value_plant(canola_h5):
         n_line=50,
         network_fraction=2 / 3,
         write_csv=False,
-        csv_name="plant_original_traits.csv",
     )
     assert len(data_plant) == 72
     assert data_plant_df.shape[1] == 45
+    assert plant_name == "919QDUH"
 
 
 def test_get_traits_value_plant_summary(canola_h5):
@@ -96,10 +98,42 @@ def test_get_traits_value_plant_summary(canola_h5):
         n_line=50,
         network_fraction=2 / 3,
         write_csv=False,
-        csv_name="plant_original_traits.csv",
         write_summary_csv=False,
-        summary_csv_name="plant_summary_traits.csv",
     )
     assert data_plant_summary.shape[0] == 1
     assert data_plant_summary.shape[1] == 1036
     np.testing.assert_almost_equal(data_plant_summary.iloc[0, 5], 16.643764612148875)
+
+
+def test_get_all_plants_traits_dicot(canola_folder):
+    data_folders = [canola_folder]
+    primary_name = "primary_multi_day"
+    lateral_name = "lateral_3_nodes"
+    write_per_plant_details = True
+    write_per_plant_summary = True
+    all_traits_df = get_all_plants_traits(
+        data_folders=data_folders,
+        primary_name=primary_name,
+        lateral_name=lateral_name,
+        write_per_plant_details=write_per_plant_details,
+        write_per_plant_summary=write_per_plant_summary,
+    )
+    assert all_traits_df.shape == (1, 1037)
+    np.testing.assert_almost_equal(all_traits_df.iloc[0, 5], 16.643764612148875)
+
+
+def tests_get_all_plants_traits_monocot(rice_folder):
+    data_folders = [rice_folder]
+    primary_name = "longest_3do_6nodes"
+    lateral_name = "main_3do_6nodes"
+    write_per_plant_details = True
+    write_per_plant_summary = True
+    all_traits_df = get_all_plants_traits(
+        data_folders=data_folders,
+        primary_name=primary_name,
+        lateral_name=lateral_name,
+        write_per_plant_details=write_per_plant_details,
+        write_per_plant_summary=write_per_plant_summary,
+    )
+    assert all_traits_df.shape == (1, 1037)
+    np.testing.assert_almost_equal(all_traits_df.iloc[0, 5], 3.716619501198254)