Skip to content

Commit

Permalink
refactor(preprocessing): remove unnecessary module preprocessing by moving its utility functionality to where needed
Browse files Browse the repository at this point in the history
  • Loading branch information
ruancomelli committed Jun 15, 2022
1 parent ee380da commit 4a19b3e
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 70 deletions.
1 change: 0 additions & 1 deletion boiling_learning/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@
from boiling_learning.preprocessing.experiment_video import ExperimentVideo
from boiling_learning.preprocessing.experimental_data import *
from boiling_learning.preprocessing.image_datasets import *
from boiling_learning.preprocessing.preprocessing import *
17 changes: 0 additions & 17 deletions boiling_learning/preprocessing/case_data.py

This file was deleted.

50 changes: 48 additions & 2 deletions boiling_learning/preprocessing/experiment_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from boiling_learning.datasets.sliceable import SliceableDataset
from boiling_learning.io import json
from boiling_learning.preprocessing.preprocessing import sync_dataframes
from boiling_learning.preprocessing.video import PimsVideo, VideoFrame, convert_video
from boiling_learning.utils.dataclasses import dataclass, field
from boiling_learning.utils.descriptions import describe
Expand Down Expand Up @@ -275,7 +274,7 @@ def make_dataframe(
def sync_time_series(self, source_df: pd.DataFrame, inplace: bool = True) -> pd.DataFrame:
df = self.make_dataframe(recalculate=False, enforce_time=True, inplace=inplace)

df = sync_dataframes(
df = _sync_dataframes(
source_df=source_df,
dest_df=df,
dest_time_column=self.column_names.elapsed_time,
Expand Down Expand Up @@ -355,3 +354,50 @@ def _encode_video(obj: ExperimentVideo) -> json.JSONDataType:
@describe.instance(ExperimentVideo)
def _describe_video(obj: ExperimentVideo) -> Path:
    # NOTE(review): `describe.instance` appears to register this function as
    # the `describe` implementation for `ExperimentVideo` (singledispatch-style)
    # — confirm against boiling_learning.utils.descriptions.
    # An ExperimentVideo is described solely by its source file path.
    return obj.path


def _sync_dataframes(
source_df: pd.DataFrame,
dest_df: pd.DataFrame,
source_time_column: Optional[str] = None,
dest_time_column: Optional[str] = None,
) -> pd.DataFrame:
allowed_index = (pd.DatetimeIndex, pd.TimedeltaIndex, pd.Float64Index)

if source_time_column is not None:
source_df = source_df.set_index(source_time_column, drop=False)
if not isinstance(source_df.index, allowed_index):
raise ValueError(
f'the source DataFrame index must be one of {allowed_index}.'
' Ensure this or pass a valid column name as input.'
f' Got {type(source_df.index)}'
)

if dest_time_column is not None:
dest_df = dest_df.set_index(dest_time_column, drop=False)
if not isinstance(dest_df.index, allowed_index):
raise ValueError(
f'the dest DataFrame index must be one of {allowed_index}.'
' Ensure this or pass a valid column name as input.'
f' Got {type(dest_df.index)}'
)

if isinstance(source_df.index, pd.TimedeltaIndex):
source_df.index = source_df.index.total_seconds()

if isinstance(dest_df.index, pd.TimedeltaIndex):
dest_df.index = dest_df.index.total_seconds()

if type(source_df.index) is not type(dest_df.index):
raise ValueError(
f'the source and dest DataFrames indices must have the same type.'
f' Got {type(source_df.index)} and {type(dest_df.index)}'
)

concat = pd.concat([source_df, dest_df]).sort_index()
if isinstance(source_df.index, pd.Float64Index):
concat = concat.interpolate(method='index', limit_direction='both')
else:
concat = concat.interpolate(method='time', limit_direction='both')
concat = concat.loc[dest_df.index]
return concat
50 changes: 0 additions & 50 deletions boiling_learning/preprocessing/preprocessing.py

This file was deleted.

0 comments on commit 4a19b3e

Please sign in to comment.