From dfd4a04585e33e82ce9705c2e0adee39458a58cb Mon Sep 17 00:00:00 2001 From: Julia Kukulies Date: Fri, 20 Sep 2024 16:38:29 -0600 Subject: [PATCH 1/6] allows calculation of statistics on raw data --- tobac/feature_detection.py | 25 +++++++++++++++++-- tobac/tests/test_utils_bulk_statistics.py | 30 +++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index fc491ff0..0fd71973 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -912,6 +912,7 @@ def feature_detection_multithreshold_timestep( wavelength_filtering: tuple[float] = None, strict_thresholding: bool = False, statistic: Union[dict[str, Union[Callable, tuple[Callable, dict]]], None] = None, + statistics_unsmoothed: bool = False, ) -> pd.DataFrame: """Find features in each timestep. @@ -984,6 +985,9 @@ def feature_detection_multithreshold_timestep( Default is None. Optional parameter to calculate bulk statistics within feature detection. Dictionary with callable function(s) to apply over the region of each detected feature and the name of the statistics to appear in the feature ou tput dataframe. The functions should be the values and the names of the metric the keys (e.g. {'mean': np.mean}) + statistics_unsmoothed: bool, optional + Default is False. If True, calculate the statistics on the raw data instead of the smoothed input data. 
+ Returns ------- features_threshold : pandas DataFrame @@ -1005,6 +1009,12 @@ def feature_detection_multithreshold_timestep( # get actual numpy array and make a copy so as not to change the data in the iris cube track_data = data_i.core_data().copy() + # keep a copy of the unsmoothed data (that can be used for calculating stats) + if statistics_unsmoothed: + if not statistic: + raise ValueError('Please provide the input parameter statistic to determine what statistics to calculate.') + raw_data = data_i.core_data().copy() + track_data = gaussian_filter( track_data, sigma=sigma_threshold ) # smooth data slightly to create rounded, continuous field @@ -1117,7 +1127,16 @@ def feature_detection_multithreshold_timestep( labels.ravel()[regions_old[key]] = key # apply function to get statistics based on labeled regions and functions provided by the user # the feature dataframe is updated by appending a column for each metric - features_thresholds = get_statistics( + if statistics_unsmoothed: + features_thresholds = get_statistics( + features_thresholds, + labels, + raw_data, + statistic=statistic, + index=np.unique(labels[labels > 0]), + id_column="idx", ) + else: + features_thresholds = get_statistics( features_thresholds, labels, track_data, @@ -1125,7 +1144,7 @@ def feature_detection_multithreshold_timestep( index=np.unique(labels[labels > 0]), id_column="idx", ) - + logging.debug( "Finished feature detection for threshold " + str(i_threshold) @@ -1158,6 +1177,7 @@ def feature_detection_multithreshold( dz: Union[float, None] = None, strict_thresholding: bool = False, statistic: Union[dict[str, Union[Callable, tuple[Callable, dict]]], None] = None, + statistics_unsmoothed: bool = False ) -> pd.DataFrame: """Perform feature detection based on contiguous regions. 
@@ -1370,6 +1390,7 @@ def feature_detection_multithreshold( wavelength_filtering=wavelength_filtering, strict_thresholding=strict_thresholding, statistic=statistic, + statistics_unsmoothed=statistics_unsmoothed, ) # check if list of features is not empty, then merge features from different threshold # values into one DataFrame and append to list for individual timesteps: diff --git a/tobac/tests/test_utils_bulk_statistics.py b/tobac/tests/test_utils_bulk_statistics.py index 1db036b0..a463ef5c 100644 --- a/tobac/tests/test_utils_bulk_statistics.py +++ b/tobac/tests/test_utils_bulk_statistics.py @@ -8,6 +8,36 @@ import tobac.testing as tb_test +@pytest.mark.parametrize( + "statistics_unsmoothed", [(False), (True)]) +def test_bulk_statistics_fd(statistics_unsmoothed): + """ + Assure that bulk statistics in feature detection work, both on smoothed and raw data + """ + ### Test 2D data with time dimension + test_data = tb_test.make_simple_sample_data_2D().core_data() + common_dset_opts = { + "in_arr": test_data, + "data_type": "iris", + } + test_data_iris = tb_test.make_dataset_from_arr( + time_dim_num=0, y_dim_num=1, x_dim_num=2, **common_dset_opts + ) + stats = {"feature_max": np.max} + + # detect features + threshold = 7 + fd_output = tobac.feature_detection.feature_detection_multithreshold( + test_data_iris, + dxy=1000, + threshold=[threshold], + n_min_threshold=100, + target="maximum", + statistic= stats, + statistics_unsmoothed= statistics_unsmoothed) + + assert 'feature_max' in fd_output.columns + @pytest.mark.parametrize( "id_column, index", [("feature", [1]), ("feature_id", [1]), ("cell", [1])] ) From 84c347ff40bc6ae49fe6e1f4e4483d693a55bbcd Mon Sep 17 00:00:00 2001 From: Julia Kukulies Date: Fri, 20 Sep 2024 16:42:01 -0600 Subject: [PATCH 2/6] black formatting --- tobac/tests/test_utils_bulk_statistics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tobac/tests/test_utils_bulk_statistics.py 
b/tobac/tests/test_utils_bulk_statistics.py index a463ef5c..c62ee821 100644 --- a/tobac/tests/test_utils_bulk_statistics.py +++ b/tobac/tests/test_utils_bulk_statistics.py @@ -8,8 +8,7 @@ import tobac.testing as tb_test -@pytest.mark.parametrize( - "statistics_unsmoothed", [(False), (True)]) +@pytest.mark.parametrize("statistics_unsmoothed", [(False), (True)]) def test_bulk_statistics_fd(statistics_unsmoothed): """ Assure that bulk statistics in feature detection work, both on smoothed and raw data @@ -24,7 +23,7 @@ def test_bulk_statistics_fd(statistics_unsmoothed): time_dim_num=0, y_dim_num=1, x_dim_num=2, **common_dset_opts ) stats = {"feature_max": np.max} - + # detect features threshold = 7 fd_output = tobac.feature_detection.feature_detection_multithreshold( @@ -33,11 +32,13 @@ def test_bulk_statistics_fd(statistics_unsmoothed): threshold=[threshold], n_min_threshold=100, target="maximum", - statistic= stats, - statistics_unsmoothed= statistics_unsmoothed) + statistic=stats, + statistics_unsmoothed=statistics_unsmoothed, + ) + + assert "feature_max" in fd_output.columns + - assert 'feature_max' in fd_output.columns - @pytest.mark.parametrize( "id_column, index", [("feature", [1]), ("feature_id", [1]), ("cell", [1])] ) From 22314f7f2ceca9414df7f1f9a7970fbe23d78946 Mon Sep 17 00:00:00 2001 From: Julia Kukulies Date: Fri, 20 Sep 2024 16:46:31 -0600 Subject: [PATCH 3/6] black formatting --- tobac/feature_detection.py | 57 ++++++++++++++++++++------------------ tobac/segmentation.py | 22 +++++++-------- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index 0fd71973..e3c67b65 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -627,9 +627,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, y_val_alt, x_val_alt] - labels_2[ - label_locs_v, label_locs_h1, label_locs_h2 
- ] = labels_2_alt + labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( + labels_2_alt + ) skip_list = np.append(skip_list, label_ind) break @@ -673,9 +673,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, y_val_alt, label_x] - labels_2[ - label_locs_v, label_locs_h1, label_locs_h2 - ] = labels_2_alt + labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( + labels_2_alt + ) new_label_ind = labels_2_alt skip_list = np.append(skip_list, label_ind) @@ -717,9 +717,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, label_y, x_val_alt] - labels_2[ - label_locs_v, label_locs_h1, label_locs_h2 - ] = labels_2_alt + labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( + labels_2_alt + ) new_label_ind = labels_2_alt skip_list = np.append(skip_list, label_ind) @@ -912,7 +912,7 @@ def feature_detection_multithreshold_timestep( wavelength_filtering: tuple[float] = None, strict_thresholding: bool = False, statistic: Union[dict[str, Union[Callable, tuple[Callable, dict]]], None] = None, - statistics_unsmoothed: bool = False, + statistics_unsmoothed: bool = False, ) -> pd.DataFrame: """Find features in each timestep. @@ -986,7 +986,7 @@ def feature_detection_multithreshold_timestep( Dictionary with callable function(s) to apply over the region of each detected feature and the name of the statistics to appear in the feature ou tput dataframe. The functions should be the values and the names of the metric the keys (e.g. {'mean': np.mean}) statistics_unsmoothed: bool, optional - Default is False. If True, calculate the statistics on the raw data instead of the smoothed input data. + Default is False. If True, calculate the statistics on the raw data instead of the smoothed input data. 
Returns ------- @@ -1012,7 +1012,9 @@ def feature_detection_multithreshold_timestep( # keep a copy of the unsmoothed data (that can be used for calculating stats) if statistics_unsmoothed: if not statistic: - raise ValueError('Please provide the input parameter statistic to determine what statistics to calculate.') + raise ValueError( + "Please provide the input parameter statistic to determine what statistics to calculate." + ) raw_data = data_i.core_data().copy() track_data = gaussian_filter( @@ -1129,22 +1131,23 @@ def feature_detection_multithreshold_timestep( # the feature dataframe is updated by appending a column for each metric if statistics_unsmoothed: features_thresholds = get_statistics( - features_thresholds, - labels, - raw_data, - statistic=statistic, - index=np.unique(labels[labels > 0]), - id_column="idx", ) + features_thresholds, + labels, + raw_data, + statistic=statistic, + index=np.unique(labels[labels > 0]), + id_column="idx", + ) else: features_thresholds = get_statistics( - features_thresholds, - labels, - track_data, - statistic=statistic, - index=np.unique(labels[labels > 0]), - id_column="idx", - ) - + features_thresholds, + labels, + track_data, + statistic=statistic, + index=np.unique(labels[labels > 0]), + id_column="idx", + ) + logging.debug( "Finished feature detection for threshold " + str(i_threshold) @@ -1177,7 +1180,7 @@ def feature_detection_multithreshold( dz: Union[float, None] = None, strict_thresholding: bool = False, statistic: Union[dict[str, Union[Callable, tuple[Callable, dict]]], None] = None, - statistics_unsmoothed: bool = False + statistics_unsmoothed: bool = False, ) -> pd.DataFrame: """Perform feature detection based on contiguous regions. 
diff --git a/tobac/segmentation.py b/tobac/segmentation.py index fe2eda2e..4697a25d 100644 --- a/tobac/segmentation.py +++ b/tobac/segmentation.py @@ -824,15 +824,15 @@ def segmentation_timestep( ) # edit value in buddy_features dataframe - buddy_features.hdim_1.values[ - buddy_looper - ] = pbc_utils.transfm_pbc_point( - float(buddy_feat.hdim_1), hdim1_min, hdim1_max + buddy_features.hdim_1.values[buddy_looper] = ( + pbc_utils.transfm_pbc_point( + float(buddy_feat.hdim_1), hdim1_min, hdim1_max + ) ) - buddy_features.hdim_2.values[ - buddy_looper - ] = pbc_utils.transfm_pbc_point( - float(buddy_feat.hdim_2), hdim2_min, hdim2_max + buddy_features.hdim_2.values[buddy_looper] = ( + pbc_utils.transfm_pbc_point( + float(buddy_feat.hdim_2), hdim2_min, hdim2_max + ) ) buddy_looper = buddy_looper + 1 @@ -1010,9 +1010,9 @@ def segmentation_timestep( segmentation_mask_3[z_val_o, y_val_o, x_val_o] != segmentation_mask_4.data[z_seg, y_seg, x_seg] ): - segmentation_mask_3[ - z_val_o, y_val_o, x_val_o - ] = segmentation_mask_4.data[z_seg, y_seg, x_seg] + segmentation_mask_3[z_val_o, y_val_o, x_val_o] = ( + segmentation_mask_4.data[z_seg, y_seg, x_seg] + ) if not is_3D_seg: segmentation_mask_3 = segmentation_mask_3[0] From 3bd7afa1eb6d33967385e093a71b633457376984 Mon Sep 17 00:00:00 2001 From: Julia Kukulies Date: Fri, 20 Sep 2024 16:48:52 -0600 Subject: [PATCH 4/6] more formatting --- tobac/utils/decorators.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tobac/utils/decorators.py b/tobac/utils/decorators.py index 90e600b5..8d304e71 100644 --- a/tobac/utils/decorators.py +++ b/tobac/utils/decorators.py @@ -75,9 +75,7 @@ def _conv_kwargs_irispandas_to_xarray(conv_kwargs: dict): key: ( convert_cube_to_dataarray(arg) if isinstance(arg, iris.cube.Cube) - else arg.to_xarray() - if isinstance(arg, pd.DataFrame) - else arg + else arg.to_xarray() if isinstance(arg, pd.DataFrame) else arg ) for key, arg in zip(conv_kwargs.keys(), conv_kwargs.values()) } @@ 
-123,9 +121,7 @@ def _conv_kwargs_xarray_to_irispandas(conv_kwargs: dict): key: ( xr.DataArray.to_iris(arg) if isinstance(arg, xr.DataArray) - else arg.to_dataframe() - if isinstance(arg, xr.Dataset) - else arg + else arg.to_dataframe() if isinstance(arg, xr.Dataset) else arg ) for key, arg in zip(conv_kwargs.keys(), conv_kwargs.values()) } @@ -340,9 +336,7 @@ def wrapper(*args, **kwargs): ( convert_cube_to_dataarray(arg) if type(arg) == iris.cube.Cube - else arg.to_xarray() - if type(arg) == pd.DataFrame - else arg + else arg.to_xarray() if type(arg) == pd.DataFrame else arg ) for arg in args ] From 0040330c7780a5f0f1ccb01e69d2457233de4658 Mon Sep 17 00:00:00 2001 From: Julia Kukulies Date: Thu, 26 Sep 2024 16:28:02 -0600 Subject: [PATCH 5/6] directly pass data_i to statistics function instead of making a copy --- .pre-commit-config.yaml | 5 ----- tobac/feature_detection.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 9e12a10f..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -repos: -- repo: https://github.com/psf/black - rev: 21.12b0 - hooks: - - id: black diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index e3c67b65..2555c45a 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1015,7 +1015,7 @@ def feature_detection_multithreshold_timestep( raise ValueError( "Please provide the input parameter statistic to determine what statistics to calculate." 
) - raw_data = data_i.core_data().copy() + track_data = gaussian_filter( track_data, sigma=sigma_threshold @@ -1133,7 +1133,7 @@ def feature_detection_multithreshold_timestep( features_thresholds = get_statistics( features_thresholds, labels, - raw_data, + data_i.core_data(), statistic=statistic, index=np.unique(labels[labels > 0]), id_column="idx", From 132ea240528c39499a548a0cb85d9eb71a5c3440 Mon Sep 17 00:00:00 2001 From: Julia Kukulies Date: Thu, 26 Sep 2024 16:29:43 -0600 Subject: [PATCH 6/6] added accidentally removed pre-commit yaml file --- .pre-commit-config.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..9e12a10f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: +- repo: https://github.com/psf/black + rev: 21.12b0 + hooks: + - id: black