From 4b434503273cbf02aa96c46912d3dfc2a4e330a3 Mon Sep 17 00:00:00 2001 From: "K.Brunner" Date: Fri, 18 Feb 2022 13:15:05 -0700 Subject: [PATCH 1/6] Adds isolation in 2d to segmentation file. --- tobac/segmentation.py | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/tobac/segmentation.py b/tobac/segmentation.py index c09b1c21..ae621cf0 100644 --- a/tobac/segmentation.py +++ b/tobac/segmentation.py @@ -7,7 +7,7 @@ def segmentation_2D(features,field,dxy,threshold=3e-3,target='maximum',level=Non return segmentation(features,field,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance) -def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto'): +def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = ISO_dilate): """ Function performing watershedding for an individual timestep of the data @@ -26,6 +26,9 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu flag determining the algorithm to use (currently watershedding implemented) max_distance: float maximum distance from a marker allowed to be classified as belonging to that cell + ISO_dilate: int + value by which to dilate the feature size for the isolation parameter. + Default is 8 Output: segmentation_out: iris.cube.Cube @@ -120,6 +123,34 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu else: raise ValueError('unknown method, must be watershed') + #create isolation, currently only available for 2d tracking + if field_in.ndim==2: + nobj = features_in['feature'].values + nobj_len = len(nobj) + + iso = np.empty(nobj_len, dtype='bool') + iso[:] = False + num_obj_around = np.zeros(nobj_len, dtype = 'int') + segmask2 = dilation(segmentation_mask,disk(ISO_dilate)) #formerly square + for iso_id in np.arange(nobj_len): + if iso_id == 0: + continue + obj_ind = np.where(segmask2 == nobj[iso_id]) + objects = np.unique(segmentation_mask[obj_ind]) + if len(objects) >= 3: #one object will always be 0, the element of the feature, any more than those two are considered neighbors + iso[iso_id] = True + num_obj_around[iso_id] = len(objects)-1 #this subtracts the 0 element included in the count + + features_out['isolated'] = iso + features_out['num_objects'] = num_obj_around + + + + + + + + # remove everything from the individual masks that is more than max_distance_pixel away from the markers if max_distance is not None: D=distance_transform_edt((markers==0).astype(int)) @@ -139,7 +170,7 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu return segmentation_out,features_out -def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto'): +def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = 8): """ Function using watershedding or random walker to determine cloud volumes associated with tracked updrafts @@ -162,6 +193,11 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m max_distance: float Maximum distance from a marker allowed to be classified as belonging to that cell + ISO_dilate: int + value by which to dilate the feature size for the isolation parameter. 
+ Default is 8 + + Output: segmentation_out: iris.cube.Cube Cloud mask, 0 outside and integer numbers according to track inside the cloud @@ -169,6 +205,9 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m import pandas as pd from iris.cube import CubeList + features['isolated'] = None + features['num_objects'] = None + logging.info('Start watershedding 3D') # check input for right dimensions: From 637c5bb122daa450027760306b0c5009ebf45a4e Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Tue, 1 Mar 2022 18:00:07 -0600 Subject: [PATCH 2/6] Update segmentation.py Updated for spacing errors --- tobac/segmentation.py | 52 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tobac/segmentation.py b/tobac/segmentation.py index ae621cf0..7719bc09 100644 --- a/tobac/segmentation.py +++ b/tobac/segmentation.py @@ -7,7 +7,7 @@ def segmentation_2D(features,field,dxy,threshold=3e-3,target='maximum',level=Non return segmentation(features,field,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance) -def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = ISO_dilate): +def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = 8): """ Function performing watershedding for an individual timestep of the data @@ -25,16 +25,15 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu method: string flag determining the algorithm to use (currently watershedding implemented) max_distance: float - maximum distance from a marker allowed to be classified as belonging to that cell + maximum distance from a marker allowed to be classified as belonging to that cell ISO_dilate: int - value by which to dilate the feature size for the isolation parameter. - Default is 8 + value by which to dilate the feature size for the isolation parameter. 
Default is 8 Output: segmentation_out: iris.cube.Cube - cloud mask, 0 outside and integer numbers according to track inside the clouds + cloud mask, 0 outside and integer numbers according to track inside the clouds features_out: pandas.DataFrame - feature dataframe including the number of cells (2D or 3D) in the segmented area/volume of the feature at the timestep + feature dataframe including the number of cells (2D or 3D) in the segmented area/volume of the feature at the timestep """ # The location of watershed within skimage submodules changes with v0.19, but I've kept both for backward compatibility for now try: @@ -43,6 +42,7 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu from skimage.morphology import watershed # from skimage.segmentation import random_walker from scipy.ndimage import distance_transform_edt + from skimage.morphology import dilation, disk from copy import deepcopy import numpy as np @@ -124,25 +124,24 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu raise ValueError('unknown method, must be watershed') #create isolation, currently only available for 2d tracking - if field_in.ndim==2: - nobj = features_in['feature'].values - nobj_len = len(nobj) - - iso = np.empty(nobj_len, dtype='bool') - iso[:] = False - num_obj_around = np.zeros(nobj_len, dtype = 'int') - segmask2 = dilation(segmentation_mask,disk(ISO_dilate)) #formerly square - for iso_id in np.arange(nobj_len): - if iso_id == 0: - continue - obj_ind = np.where(segmask2 == nobj[iso_id]) - objects = np.unique(segmentation_mask[obj_ind]) - if len(objects) >= 3: #one object will always be 0, the element of the feature, any more than those two are considered neighbors - iso[iso_id] = True - num_obj_around[iso_id] = len(objects)-1 #this subtracts the 0 element included in the count + if field_in.ndim==2: + nobj = features_in['feature'].values + nobj_len = len(nobj) + iso = np.empty(nobj_len, dtype='bool') + iso[:] = False + num_obj_around = np.zeros(nobj_len, dtype = 'int') + segmask2 = dilation(segmentation_mask,disk(ISO_dilate)) #formerly square + for iso_id in np.arange(nobj_len): + if iso_id == 0: + continue + obj_ind = np.where(segmask2 == nobj[iso_id]) + objects = np.unique(segmentation_mask[obj_ind]) + if len(objects) >= 3: #one object will always be 0, the element of the feature, any more than those two are considered neighbors + iso[iso_id] = True + num_obj_around[iso_id] = len(objects)-1 #this subtracts the 0 element included in the count - features_out['isolated'] = iso - features_out['num_objects'] = num_obj_around + features_out['isolated'] = iso + features_out['num_objects'] = num_obj_around @@ -206,7 +205,8 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m from iris.cube import CubeList features['isolated'] = None - features['num_objects'] = None + features['num_objects'] = None + print(['iso_dilate:'+str(ISO_dilate)]) logging.info('Start watershedding 3D') @@ -225,7 +225,7 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m for i,field_i in enumerate(field_time): time_i=field_i.coord('time').units.num2date(field_i.coord('time').points[0]) features_i=features.loc[features['time']==time_i] - segmentation_out_i,features_out_i=segmentation_timestep(field_i,features_i,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance,vertical_coord=vertical_coord) + 
segmentation_out_i,features_out_i=segmentation_timestep(field_i,features_i,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance,vertical_coord=vertical_coord, ISO_dilate = ISO_dilate) segmentation_out_list.append(segmentation_out_i) features_out_list.append(features_out_i) logging.debug('Finished segmentation for '+time_i.strftime('%Y-%m-%d_%H:%M:%S')) From d4cbfa296b76bb04bd8831cbf302579ce52e0592 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:28:25 -0500 Subject: [PATCH 3/6] Create merge_split.py Merge/split as postprocessing step. --- tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' 
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'hdim_2':('feature_hdim2_coordinate',
+                  'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x).'
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'idx':('feature_id_this_frame',
+               'Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments'),
+        'num':('feature_grid_cell_count',
+               'Number of grid points that are within the threshold of this feature'),
+        'threshold_value':('feature_threshold_max',
+                           'Maximum threshold value at which this feature was detected'),
+        'feature':('feature_id',
+                   "Unique number of the feature; starts from 1 and increments by 1 to the number of features"),
+        'time':('feature_time','time of the feature, consistent with feature_time_index'),
+        'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'),
+        'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'lat':('feature_latitude','latitude of the feature'),
+        'lon':('feature_longitude','longitude of the feature'),
+        'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'),
+        'areas':('feature_area',),
+        'isolated':('feature_isolation_flag',),
+        'num_objects':('number_of_feature_neighbors',),
+        'cell':('feature_parent_cell_id',),
+        'time_cell':('feature_parent_cell_elapsed_time',),
+        'segmentation_mask':('2d segmentation mask',)
+    }
+    new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys()
+                             if k in TrackedFeatures.variables.keys()}
+
+    TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id'])
+    # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of
+    # the 'index' variable and call the dimension 'feature'
+    ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names),
+                   Mask])
+
+    # Add the projection data back in
+    ds['ProjectionCoordinateSystem']=Projection
+
+    # Convert the cell ID variable from float to integer
+    if 'int' not in str(TrackedFeatures.cell.dtype):
+        # The raw output from the tracking is actually an object array
+        # array([nan, 2, 3], dtype=object)
+        # (and is cast to a float array when saved as NetCDF, I think).
+        # Cast to float first so the NaN (untracked) entries can be masked, then fill an integer array.
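# Editorial sketch, not part of the patch: the NaN-to-zero integer cast used
# just below, shown on a toy DataArray. NaN has no integer representation, so
# the object array is routed through float64 and a finite-value mask, with 0
# reserved for untracked features.
import numpy as np
import xarray as xr

cell = xr.DataArray(np.array([np.nan, 2, 3], dtype=object), dims='index')
int_cell = xr.zeros_like(cell, dtype='int64')   # 0 marks untracked features
as_float = cell.astype('float64')               # NaN survives the float cast
finite = np.isfinite(as_float)
int_cell[finite] = as_float[finite].astype('int64')
# int_cell is now [0, 2, 3]: zero flags the untracked entry, real ids are kept.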
+ int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. + + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + 
TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. 
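# Editorial sketch, not part of the patch: the per-feature parent-track lookup
# performed by the np.where loop further below, written as a dictionary
# inversion. cell_to_track and feature_parent_track are hypothetical names;
# track_id and feature_parent_cell_id come from the code above, and -1 marks
# features whose cell is NaN (untracked), matching the loop's convention.
cell_to_track = {c: t for t, members in track_id.items() for c in members}
feature_parent_track = np.array([
    -1 if np.isnan(c) else cell_to_track[c]
    for c in feature_parent_cell_id
])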
+ feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From 547bd2a7eb2c74061360197e814ecbe277386819 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:36:34 -0500 Subject: [PATCH 4/6] Revert "Create merge_split.py" This reverts commit d4cbfa296b76bb04bd8831cbf302579ce52e0592. 
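Before the revert below, it is worth pinning down the linking logic that merge_split implements, since the same file comes back verbatim in the next patch. The following is a condensed editorial sketch, not the module itself: it assumes track_groups behaves like the groupby result in the patch (each group exposing projection_x_coordinate, projection_y_coordinate and frame), builds a distance-gated graph between where one cell's track ends and another's begins, and keeps only minimum-spanning-tree edges whose temporal gap stays within frame_len. The patch filters self-links after building its lists; the sketch skips them up front.

    import networkx as nx
    import numpy as np

    def candidate_links(track_groups, cell_ids, distance=25000.0, frame_len=5):
        g = nx.Graph()
        for a in cell_ids:
            # last recorded position of cell a
            end = np.array([track_groups[a].projection_x_coordinate[-1].values,
                            track_groups[a].projection_y_coordinate[-1].values])
            for b in cell_ids:
                if a == b:
                    continue
                # first recorded position of cell b
                start = np.array([track_groups[b].projection_x_coordinate[0].values,
                                  track_groups[b].projection_y_coordinate[0].values])
                d = np.linalg.norm(end - start)
                if d <= distance:
                    g.add_edge(a, b, weight=d)
        links = []
        for a, b in nx.minimum_spanning_edges(g, data=False):
            gap = abs(np.nanmax(track_groups[a].frame.values)
                      - np.nanmin(track_groups[b].frame.values))
            if gap <= frame_len:  # temporal gate, as in the patch
                links.append((a, b))
        return links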
--- tobac/merge_split.py | 345 ------------------------------------------- 1 file changed, 345 deletions(-) delete mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py deleted file mode 100644 index 0cbbc7e5..00000000 --- a/tobac/merge_split.py +++ /dev/null @@ -1,345 +0,0 @@ -#Tobac merge and split v0.1 - -from geopy.distance import geodesic -from networkx import * -import numpy as np -from pandas.core.common import flatten -import xarray as xr - -def compress_all(nc_grids, min_dims=2): - for var in nc_grids: - if len(nc_grids[var].dims) >= min_dims: - # print("Compressing ", var) - nc_grids[var].encoding["zlib"] = True - nc_grids[var].encoding["complevel"] = 4 - nc_grids[var].encoding["contiguous"] = False - return nc_grids - - -def standardize_track_dataset(TrackedFeatures, Mask, Projection): - """ Combine a feature mask with the feature data table into a common dataset. - Also renames th - - returned by tobac.themes.tobac_v1.segmentation - with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. - - Also rename the variables to be more desciptive and comply with cf-tree. - - Convert the default cell parent ID to an integer table. - - Add a cell dimension to reflect - - Projection is an xarray DataArray - - TODO: Add metadata attributes to - - """ - feature_standard_names = { - # new variable name, and long description for the NetCDF attribute - 'frame':('feature_time_index', - 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), - 'hdim_1':('feature_hdim1_coordinate', - 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'hdim_2':('feature_hdim2_coordinate', - 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'idx':('feature_id_this_frame',), - 'num':('feature_grid_cell_count', - 'Number of grid points that are within the threshold of this feature'), - 'threshold_value':('feature_threshold_max', - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), - 'feature':('feature_id', - "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), - 'time':('feature_time','time of the feature, consistent with feature_time_index'), - 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), - 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), - 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), - 'lat':('feature_latitude','latitude of the feature'), - 'lon':('feature_longitude','longitude of the feature'), - 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), - 'areas':('feature_area',), - 'isolated':('feature_isolation_flag',), - 'num_objects':('number_of_feature_neighbors',), - 'cell':('feature_parent_cell_id',), - 
'time_cell':('feature_parent_cell_elapsed_time',), - 'segmentation_mask':('2d segmentation mask',) - } - new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys()} - - TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) - # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of - # the 'index' variable and call the dimension 'feature' - ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), - Mask]) - - # Add the projection data back in - ds['ProjectionCoordinateSystem']=Projection - - # Convert the cell ID variable from float to integer - if 'int' not in str(TrackedFeatures.cell.dtype): - # The raw output from the tracking is actually an object array - # array([nan, 2, 3], dtype=object) - # (and is cast to a float array when saved as NetCDF, I think). - # Cast to float. - int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') - - cell_id_data = TrackedFeatures.cell.astype('float64') - valid_cell = np.isfinite(cell_id_data) - valid_cell_ids = cell_id_data[valid_cell] - if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') - int_cell[valid_cell] = valid_cell_ids.astype('int64') - #ds['feature_parent_cell_id'] = int_cell - return ds - - - -def merge_split(TRACK,distance = 25000,frame_len = 5): - ''' - function to postprocess tobac track data for merge/split cells - Input: - TRACK: xarray dataset of tobac Track information - - distance: float, optional distance threshold prior to adding a pair of points - into the minimum spanning tree. Default is 25000 meters. - - frame_len: float, optional threshold for the spanning length within which two points - can be separated. Default is five (5) frames. 
- - - Output: - d: xarray dataset of - feature position along 1st horizontal dimension - hdim2_index: float - feature position along 2nd horizontal dimension - - Example: - d = merge_split(Track) - ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) - # both_ds = xarray.combine_by_coords((ds,d), compat='override') - both_ds = xr.merge([ds, d],compat ='override') - both_ds = compress_all(both_ds) - both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - ''' - track_groups = TRACK.groupby('cell') - cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} - id_data = np.fromiter(cell_ids.keys(), dtype=int) - count_data = np.fromiter(cell_ids.values(), dtype=int) - all_frames = np.sort(np.unique(TRACK.frame)) - a_points = list() - b_points = list() - a_names = list() - b_names = list() - dist = list() - - - for i in id_data: - #print(i) - a_pointx = track_groups[i].projection_x_coordinate[-1].values - a_pointy = track_groups[i].projection_y_coordinate[-1].values - for j in id_data: - b_pointx = track_groups[j].projection_x_coordinate[0].values - b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) - if d <= distance: - a_points.append([a_pointx,a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - - - - - -# for i in id_data: -# a_pointx = track_groups[i].grid_longitude[-1].values -# a_pointy = track_groups[i].grid_latitude[-1].values -# for j in id_data: -# b_pointx = track_groups[j].grid_longitude[0].values -# b_pointy = track_groups[j].grid_latitude[0].values -# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km -# if d <= distance: -# a_points.append([a_pointx,a_pointy]) -# b_points.append([b_pointx, b_pointy]) -# dist.append(d) -# a_names.append(i) -# b_names.append(j) - - id = [] - for i in range(len(dist)-1, -1, -1): - if a_names[i] == b_names[i]: - id.append(i) - a_points.pop(i) - b_points.pop(i) - dist.pop(i) - a_names.pop(i) - b_names.pop(i) - else: - continue - - g = Graph() - for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i],weight=dist[i]) - - tree = minimum_spanning_edges(g) - tree_list = list(minimum_spanning_edges(g)) - - new_tree = [] - for i,j in enumerate(tree_list): - frame_a = np.nanmax(track_groups[j[0]].frame.values) - frame_b = np.nanmin(track_groups[j[1]].frame.values) - if np.abs(frame_a - frame_b) <= frame_len: - new_tree.append(tree_list[i][0:2]) - new_tree_arr = np.array(new_tree) - - TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) - cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) - track_id = dict() #same size as number of total merged tracks - - arr = np.array([0]) - for p in cell_id: - j = np.where(arr == int(p)) - if len(j[0]) > 0: - continue - else: - k = np.where(new_tree_arr == p) - if len(k[0]) == 0: - track_id[p] = [p] - arr = np.append(arr,p) - else: - temp1 = list(np.unique(new_tree_arr[k[0]])) - temp = list(np.unique(new_tree_arr[k[0]])) - - for l in range(15): - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - temp = list(flatten(temp)) - temp = list(np.unique(temp)) - - if len(temp1) == len(temp): - break - temp1 = np.array(temp) - - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - - temp = list(flatten(temp)) - temp = 
list(np.unique(temp)) - arr = np.append(arr,np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - - - storm_id = [0] #default because we don't track larger storm systems *yet* - print('found storm id') - - - track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* - print('found track parent storm ids') - - track_ids = np.array(list(track_id.keys())) - print('found track ids') - - - cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) - print('found cell ids') - - cell_parent_track_id = [] - - for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: - cell_parent_track_id.append(int(id)) - - else: - cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) - - - cell_parent_track_id = list(flatten(cell_parent_track_id)) - print('found cell parent track ids') - - feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - - print('found feature parent cell ids') - - #This version includes all the feature regardless of if they are used in cells or not. - feature_id = list(TRACK.feature.values.astype(int)) - print('found feature ids') - - feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm - print('found feature parent storm ids') - - feature_parent_track_id = [] - feature_parent_track_id = np.zeros(len(feature_id)) - for i, id in enumerate(feature_id): - cellid = feature_parent_cell_id[i] - if np.isnan(cellid): - feature_parent_track_id[i] = -1 - else: - j = np.where(cell_id == cellid) - j = np.squeeze(j) - trackid = cell_parent_track_id[j] - feature_parent_track_id[i] = trackid - - print('found feature parent track ids') - - - storm_child_track_count = [len(track_id)] - print('found storm child track count') - - track_child_cell_count = [] - for i,id in enumerate(track_id): - track_child_cell_count.append(len(track_id[int(id)])) - print('found track child cell count') - - - cell_child_feature_count = [] - for i,id in enumerate(cell_id): - cell_child_feature_count.append(len(track_groups[id].feature.values)) - print('found cell child feature count') - - storm_child_cell_count = [len(cell_id)] - storm_child_feature_count = [len(feature_id)] - - storm_dim = 'nstorms' - track_dim = 'ntracks' - cell_dim = 'ncells' - feature_dim = 'nfeatures' - - d = xr.Dataset({ - 'storm_id': (storm_dim, storm_id), - 'track_id': (track_dim, track_ids), - 'track_parent_storm_id': (track_dim, track_parent_storm_id), - 'cell_id': (cell_dim, cell_id), - 'cell_parent_track_id': (cell_dim, cell_parent_track_id), - 'feature_id': (feature_dim, feature_id), - 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), - 'feature_parent_track_id': (feature_dim, feature_parent_track_id), - 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), - 'storm_child_track_count': (storm_dim, storm_child_track_count), - 'storm_child_cell_count': (storm_dim, storm_child_cell_count), - 'storm_child_feature_count': (storm_dim, storm_child_feature_count), - 'track_child_cell_count': (track_dim, track_child_cell_count), - 'cell_child_feature_count': (cell_dim, cell_child_feature_count), - }) - d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) - - assert len(track_id) == len(track_parent_storm_id) - assert len(cell_id) == len(cell_parent_track_id) - assert len(feature_id) == len(feature_parent_cell_id) - assert sum(storm_child_track_count) == len(track_id) - assert sum(storm_child_cell_count) == 
len(cell_id) - assert sum(storm_child_feature_count) == len(feature_id) - assert sum(track_child_cell_count) == len(cell_id) - assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) - - return d \ No newline at end of file From e3fa302a1d7a39f470e613dcb0c2e6a258458bfa Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 5 Jun 2022 20:33:26 -0500 Subject: [PATCH 5/6] "Create merge_split.py" This reverts commit 547bd2a7eb2c74061360197e814ecbe277386819. Attempting to correct some incorrect merging. Should include only the merge_split.py file. --- tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' 
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'hdim_2':('feature_hdim2_coordinate',
+                  'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x).'
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'idx':('feature_id_this_frame',
+               'Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments'),
+        'num':('feature_grid_cell_count',
+               'Number of grid points that are within the threshold of this feature'),
+        'threshold_value':('feature_threshold_max',
+                           'Maximum threshold value at which this feature was detected'),
+        'feature':('feature_id',
+                   "Unique number of the feature; starts from 1 and increments by 1 to the number of features"),
+        'time':('feature_time','time of the feature, consistent with feature_time_index'),
+        'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'),
+        'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'lat':('feature_latitude','latitude of the feature'),
+        'lon':('feature_longitude','longitude of the feature'),
+        'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'),
+        'areas':('feature_area',),
+        'isolated':('feature_isolation_flag',),
+        'num_objects':('number_of_feature_neighbors',),
+        'cell':('feature_parent_cell_id',),
+        'time_cell':('feature_parent_cell_elapsed_time',),
+        'segmentation_mask':('2d segmentation mask',)
+    }
+    new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys()
+                             if k in TrackedFeatures.variables.keys()}
+
+    TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id'])
+    # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of
+    # the 'index' variable and call the dimension 'feature'
+    ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names),
+                   Mask])
+
+    # Add the projection data back in
+    ds['ProjectionCoordinateSystem']=Projection
+
+    # Convert the cell ID variable from float to integer
+    if 'int' not in str(TrackedFeatures.cell.dtype):
+        # The raw output from the tracking is actually an object array
+        # array([nan, 2, 3], dtype=object)
+        # (and is cast to a float array when saved as NetCDF, I think).
+        # Cast to float first so the NaN (untracked) entries can be masked, then fill an integer array.
+ int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. + + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + 
TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. 
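# Editorial sketch, not part of the patch: the bounded fixed-point loop above
# (for l in range(15)) is collecting the transitive closure of the surviving
# MST edges, i.e. connected components. networkx computes the same thing
# without the hard-coded sweep limit; new_tree_arr holds the (cell, cell)
# pairs that passed the distance and frame gates.
comp_graph = Graph()                       # networkx is star-imported above
comp_graph.add_edges_from(new_tree_arr)
components = [sorted(c) for c in connected_components(comp_graph)]
# Each component corresponds to one merged track; the patch keys track_id by
# the largest cell id in the component (np.nanmax of the member list).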
+ feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From 728cee0fec9ddaa2ba6f93a18c6dac3157d1a500 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 5 Jun 2022 20:51:23 -0500 Subject: [PATCH 6/6] Reverting a segmentation file. Replaced former segmentation file with isolated feature changes. 
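The message above points back to the isolation work from the first two patches, so a compact restatement of that test may help. This is an editorial sketch under the patches' own assumptions (a 2-D integer label image, skimage's grey-scale dilation, and the default ISO_dilate of 8), not code from the series. Each feature's labelled region is grown by a disk-shaped footprint; any other label found inside the grown footprint counts as a neighbour. Two readings of the patch are worth noting: its 'isolated' flag is set True when len(objects) >= 3, i.e. when at least one neighbour exists, so the flag effectively means 'has neighbours'; and where two grown regions overlap, grey dilation keeps the larger label, exactly as in the patch.

    import numpy as np
    from skimage.morphology import dilation, disk

    def neighbour_count(segmentation_mask, feature_label, iso_dilate=8):
        # Grow every labelled region by a disk of radius iso_dilate, then sample
        # the original mask inside the grown footprint of this feature. The
        # unique values include background (0), the label itself, and any
        # neighbouring labels the grown footprint now touches.
        grown = dilation(segmentation_mask, disk(iso_dilate))
        labels = np.unique(segmentation_mask[grown == feature_label])
        return len(labels) - 1  # as in the patch, only the background 0 is removed

A feature then has neighbours when neighbour_count(...) >= 2, which matches the patch's len(objects) >= 3 test before background is subtracted.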
--- build/lib/tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 build/lib/tobac/merge_split.py diff --git a/build/lib/tobac/merge_split.py b/build/lib/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/build/lib/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'hdim_2':('feature_hdim2_coordinate', + 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'idx':('feature_id_this_frame',), + 'num':('feature_grid_cell_count', + 'Number of grid points that are within the threshold of this feature'), + 'threshold_value':('feature_threshold_max', + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), + 'feature':('feature_id', + "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), + 'time':('feature_time','time of the feature, consistent with feature_time_index'), + 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), + 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), + 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), + 'lat':('feature_latitude','latitude of the feature'), + 'lon':('feature_longitude','longitude of the feature'), + 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), + 'areas':('feature_area',), + 'isolated':('feature_isolation_flag',), + 'num_objects':('number_of_feature_neighbors',), + 
'cell':('feature_parent_cell_id',), + 'time_cell':('feature_parent_cell_elapsed_time',), + 'segmentation_mask':('2d segmentation mask',) + } + new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys()} + + TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), + Mask]) + + # Add the projection data back in + ds['ProjectionCoordinateSystem']=Projection + + # Convert the cell ID variable from float to integer + if 'int' not in str(TrackedFeatures.cell.dtype): + # The raw output from the tracking is actually an object array + # array([nan, 2, 3], dtype=object) + # (and is cast to a float array when saved as NetCDF, I think). + # Cast to float. + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. 
+ + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = 
list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. + feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == 
len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file
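Putting the series together, the intended post-processing flow is the one sketched in the merge_split docstring's Example section. Track, refl_mask, data and savedir below are placeholders taken from that Example, not objects defined anywhere in the patches:

    import os
    import xarray as xr
    from tobac.merge_split import merge_split, standardize_track_dataset, compress_all

    d = merge_split(Track, distance=25000, frame_len=5)
    ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem'])
    both_ds = xr.merge([ds, d], compat='override')
    both_ds = compress_all(both_ds)       # set zlib encoding on 2-D and larger variables
    both_ds.to_netcdf(os.path.join(savedir, 'Track_features_merges.nc'))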