From 4b434503273cbf02aa96c46912d3dfc2a4e330a3 Mon Sep 17 00:00:00 2001 From: "K.Brunner" Date: Fri, 18 Feb 2022 13:15:05 -0700 Subject: [PATCH 1/6] Adds isolation in 2d to segmentation file. --- tobac/segmentation.py | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/tobac/segmentation.py b/tobac/segmentation.py index c09b1c21..ae621cf0 100644 --- a/tobac/segmentation.py +++ b/tobac/segmentation.py @@ -7,7 +7,7 @@ def segmentation_2D(features,field,dxy,threshold=3e-3,target='maximum',level=Non return segmentation(features,field,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance) -def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto'): +def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = ISO_dilate): """ Function performing watershedding for an individual timestep of the data @@ -26,6 +26,9 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu flag determining the algorithm to use (currently watershedding implemented) max_distance: float maximum distance from a marker allowed to be classified as belonging to that cell + ISO_dilate: int + value by which to dilate the feature size for the isolation parameter. + Default is 8 Output: segmentation_out: iris.cube.Cube @@ -120,6 +123,34 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu else: raise ValueError('unknown method, must be watershed') + #create isolation, currently only available for 2d tracking + if field_in.ndim==2: + nobj = features_in['feature'].values + nobj_len = len(nobj) + + iso = np.empty(nobj_len, dtype='bool') + iso[:] = False + num_obj_around = np.zeros(nobj_len, dtype = 'int') + segmask2 = dilation(segmentation_mask,disk(ISO_dilate)) #formerly square + for iso_id in np.arange(nobj_len): + if iso_id == 0: + continue + obj_ind = np.where(segmask2 == nobj[iso_id]) + objects = np.unique(segmentation_mask[obj_ind]) + if len(objects) >= 3: #one object will always be 0, the element of the feature, any more than those two are considered neighbors + iso[iso_id] = True + num_obj_around[iso_id] = len(objects)-1 #this subtracts the 0 element included in the count + + features_out['isolated'] = iso + features_out['num_objects'] = num_obj_around + + + + + + + + # remove everything from the individual masks that is more than max_distance_pixel away from the markers if max_distance is not None: D=distance_transform_edt((markers==0).astype(int)) @@ -139,7 +170,7 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu return segmentation_out,features_out -def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto'): +def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = 8): """ Function using watershedding or random walker to determine cloud volumes associated with tracked updrafts @@ -162,6 +193,11 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m max_distance: float Maximum distance from a marker allowed to be classified as belonging to that cell + ISO_dilate: int + value by which to dilate the feature size for the isolation parameter. 
+ Default is 8 + + Output: segmentation_out: iris.cube.Cube Cloud mask, 0 outside and integer numbers according to track inside the cloud @@ -169,6 +205,9 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m import pandas as pd from iris.cube import CubeList + features['isolated'] = None + features['num_objects'] = None + logging.info('Start watershedding 3D') # check input for right dimensions: From 637c5bb122daa450027760306b0c5009ebf45a4e Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Tue, 1 Mar 2022 18:00:07 -0600 Subject: [PATCH 2/6] Update segmentation.py Updated for spacing errors --- tobac/segmentation.py | 52 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tobac/segmentation.py b/tobac/segmentation.py index ae621cf0..7719bc09 100644 --- a/tobac/segmentation.py +++ b/tobac/segmentation.py @@ -7,7 +7,7 @@ def segmentation_2D(features,field,dxy,threshold=3e-3,target='maximum',level=Non return segmentation(features,field,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance) -def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = ISO_dilate): +def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximum',level=None,method='watershed',max_distance=None,vertical_coord='auto',ISO_dilate = 8): """ Function performing watershedding for an individual timestep of the data @@ -25,16 +25,15 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu method: string flag determining the algorithm to use (currently watershedding implemented) max_distance: float - maximum distance from a marker allowed to be classified as belonging to that cell + maximum distance from a marker allowed to be classified as belonging to that cell ISO_dilate: int - value by which to dilate the feature size for the isolation parameter. - Default is 8 + value by which to dilate the feature size for the isolation parameter. 
Default is 8 Output: segmentation_out: iris.cube.Cube - cloud mask, 0 outside and integer numbers according to track inside the clouds + cloud mask, 0 outside and integer numbers according to track inside the clouds features_out: pandas.DataFrame - feature dataframe including the number of cells (2D or 3D) in the segmented area/volume of the feature at the timestep + feature dataframe including the number of cells (2D or 3D) in the segmented area/volume of the feature at the timestep """ # The location of watershed within skimage submodules changes with v0.19, but I've kept both for backward compatibility for now try: @@ -43,6 +42,7 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu from skimage.morphology import watershed # from skimage.segmentation import random_walker from scipy.ndimage import distance_transform_edt + from skimage.morphology import dilation, disk from copy import deepcopy import numpy as np @@ -124,25 +124,24 @@ def segmentation_timestep(field_in,features_in,dxy,threshold=3e-3,target='maximu raise ValueError('unknown method, must be watershed') #create isolation, currently only available for 2d tracking - if field_in.ndim==2: - nobj = features_in['feature'].values - nobj_len = len(nobj) - - iso = np.empty(nobj_len, dtype='bool') - iso[:] = False - num_obj_around = np.zeros(nobj_len, dtype = 'int') - segmask2 = dilation(segmentation_mask,disk(ISO_dilate)) #formerly square - for iso_id in np.arange(nobj_len): - if iso_id == 0: - continue - obj_ind = np.where(segmask2 == nobj[iso_id]) - objects = np.unique(segmentation_mask[obj_ind]) - if len(objects) >= 3: #one object will always be 0, the element of the feature, any more than those two are considered neighbors - iso[iso_id] = True - num_obj_around[iso_id] = len(objects)-1 #this subtracts the 0 element included in the count + if field_in.ndim==2: + nobj = features_in['feature'].values + nobj_len = len(nobj) + iso = np.empty(nobj_len, dtype='bool') + iso[:] = False + num_obj_around = np.zeros(nobj_len, dtype = 'int') + segmask2 = dilation(segmentation_mask,disk(ISO_dilate)) #formerly square + for iso_id in np.arange(nobj_len): + if iso_id == 0: + continue + obj_ind = np.where(segmask2 == nobj[iso_id]) + objects = np.unique(segmentation_mask[obj_ind]) + if len(objects) >= 3: #one object will always be 0, the element of the feature, any more than those two are considered neighbors + iso[iso_id] = True + num_obj_around[iso_id] = len(objects)-1 #this subtracts the 0 element included in the count - features_out['isolated'] = iso - features_out['num_objects'] = num_obj_around + features_out['isolated'] = iso + features_out['num_objects'] = num_obj_around @@ -206,7 +205,8 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m from iris.cube import CubeList features['isolated'] = None - features['num_objects'] = None + features['num_objects'] = None + print(['iso_dilate:'+str(ISO_dilate)]) logging.info('Start watershedding 3D') @@ -225,7 +225,7 @@ def segmentation(features,field,dxy,threshold=3e-3,target='maximum',level=None,m for i,field_i in enumerate(field_time): time_i=field_i.coord('time').units.num2date(field_i.coord('time').points[0]) features_i=features.loc[features['time']==time_i] - segmentation_out_i,features_out_i=segmentation_timestep(field_i,features_i,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance,vertical_coord=vertical_coord) + 
segmentation_out_i,features_out_i=segmentation_timestep(field_i,features_i,dxy,threshold=threshold,target=target,level=level,method=method,max_distance=max_distance,vertical_coord=vertical_coord, ISO_dilate = ISO_dilate) segmentation_out_list.append(segmentation_out_i) features_out_list.append(features_out_i) logging.debug('Finished segmentation for '+time_i.strftime('%Y-%m-%d_%H:%M:%S')) From d4cbfa296b76bb04bd8831cbf302579ce52e0592 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:28:25 -0500 Subject: [PATCH 3/6] Create merge_split.py Merge/split as postprocessing step. --- tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' 
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'hdim_2':('feature_hdim2_coordinate',
+                  'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x).'
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'idx':('feature_id_this_frame',
+               'Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments'),
+        'num':('feature_grid_cell_count',
+               'Number of grid points that are within the threshold of this feature'),
+        'threshold_value':('feature_threshold_max',
+                           'Maximum threshold value at which this feature was detected'),
+        'feature':('feature_id',
+                   "Unique number of the feature; starts from 1 and increments by 1 to the number of features"),
+        'time':('feature_time','time of the feature, consistent with feature_time_index'),
+        'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'),
+        'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'lat':('feature_latitude','latitude of the feature'),
+        'lon':('feature_longitude','longitude of the feature'),
+        'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'),
+        'areas':('feature_area',),
+        'isolated':('feature_isolation_flag',),
+        'num_objects':('number_of_feature_neighbors',),
+        'cell':('feature_parent_cell_id',),
+        'time_cell':('feature_parent_cell_elapsed_time',),
+        'segmentation_mask':('2d segmentation mask',)
+    }
+    new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys()
+                             if k in TrackedFeatures.variables.keys()}
+
+    TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id'])
+    # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of
+    # the 'index' variable and call the dimension 'feature'
+    ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names),
+                   Mask])
+
+    # Add the projection data back in
+    ds['ProjectionCoordinateSystem']=Projection
+
+    # Convert the cell ID variable from float to integer
+    if 'int' not in str(TrackedFeatures.cell.dtype):
+        # The raw output from the tracking is actually an object array
+        # array([nan, 2, 3], dtype=object)
+        # (and is cast to a float array when saved as NetCDF, I think).
+        # Cast to float first so the NaN (untracked) entries can be masked, then fill an integer array.
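# Editorial sketch, not part of the patch: the NaN-to-zero integer cast used
# just below, shown on a toy DataArray. NaN has no integer representation, so
# the object array is routed through float64 and a finite-value mask, with 0
# reserved for untracked features.
import numpy as np
import xarray as xr

cell = xr.DataArray(np.array([np.nan, 2, 3], dtype=object), dims='index')
int_cell = xr.zeros_like(cell, dtype='int64')   # 0 marks untracked features
as_float = cell.astype('float64')               # NaN survives the float cast
finite = np.isfinite(as_float)
int_cell[finite] = as_float[finite].astype('int64')
# int_cell is now [0, 2, 3]: zero flags the untracked entry, real ids are kept.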
+ int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. + + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + 
TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. 
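# Editorial sketch, not part of the patch: the per-feature parent-track lookup
# performed by the np.where loop further below, written as a dictionary
# inversion. cell_to_track and feature_parent_track are hypothetical names;
# track_id and feature_parent_cell_id come from the code above, and -1 marks
# features whose cell is NaN (untracked), matching the loop's convention.
cell_to_track = {c: t for t, members in track_id.items() for c in members}
feature_parent_track = np.array([
    -1 if np.isnan(c) else cell_to_track[c]
    for c in feature_parent_cell_id
])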
+ feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From 547bd2a7eb2c74061360197e814ecbe277386819 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:36:34 -0500 Subject: [PATCH 4/6] Revert "Create merge_split.py" This reverts commit d4cbfa296b76bb04bd8831cbf302579ce52e0592. 
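Before the revert below, it is worth pinning down the linking logic that merge_split implements, since the same file comes back verbatim in the next patch. The following is a condensed editorial sketch, not the module itself: it assumes track_groups behaves like the groupby result in the patch (each group exposing projection_x_coordinate, projection_y_coordinate and frame), builds a distance-gated graph between where one cell's track ends and another's begins, and keeps only minimum-spanning-tree edges whose temporal gap stays within frame_len. The patch filters self-links after building its lists; the sketch skips them up front.

    import networkx as nx
    import numpy as np

    def candidate_links(track_groups, cell_ids, distance=25000.0, frame_len=5):
        g = nx.Graph()
        for a in cell_ids:
            # last recorded position of cell a
            end = np.array([track_groups[a].projection_x_coordinate[-1].values,
                            track_groups[a].projection_y_coordinate[-1].values])
            for b in cell_ids:
                if a == b:
                    continue
                # first recorded position of cell b
                start = np.array([track_groups[b].projection_x_coordinate[0].values,
                                  track_groups[b].projection_y_coordinate[0].values])
                d = np.linalg.norm(end - start)
                if d <= distance:
                    g.add_edge(a, b, weight=d)
        links = []
        for a, b in nx.minimum_spanning_edges(g, data=False):
            gap = abs(np.nanmax(track_groups[a].frame.values)
                      - np.nanmin(track_groups[b].frame.values))
            if gap <= frame_len:  # temporal gate, as in the patch
                links.append((a, b))
        return links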
--- tobac/merge_split.py | 345 ------------------------------------------- 1 file changed, 345 deletions(-) delete mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py deleted file mode 100644 index 0cbbc7e5..00000000 --- a/tobac/merge_split.py +++ /dev/null @@ -1,345 +0,0 @@ -#Tobac merge and split v0.1 - -from geopy.distance import geodesic -from networkx import * -import numpy as np -from pandas.core.common import flatten -import xarray as xr - -def compress_all(nc_grids, min_dims=2): - for var in nc_grids: - if len(nc_grids[var].dims) >= min_dims: - # print("Compressing ", var) - nc_grids[var].encoding["zlib"] = True - nc_grids[var].encoding["complevel"] = 4 - nc_grids[var].encoding["contiguous"] = False - return nc_grids - - -def standardize_track_dataset(TrackedFeatures, Mask, Projection): - """ Combine a feature mask with the feature data table into a common dataset. - Also renames th - - returned by tobac.themes.tobac_v1.segmentation - with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. - - Also rename the variables to be more desciptive and comply with cf-tree. - - Convert the default cell parent ID to an integer table. - - Add a cell dimension to reflect - - Projection is an xarray DataArray - - TODO: Add metadata attributes to - - """ - feature_standard_names = { - # new variable name, and long description for the NetCDF attribute - 'frame':('feature_time_index', - 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), - 'hdim_1':('feature_hdim1_coordinate', - 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'hdim_2':('feature_hdim2_coordinate', - 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'idx':('feature_id_this_frame',), - 'num':('feature_grid_cell_count', - 'Number of grid points that are within the threshold of this feature'), - 'threshold_value':('feature_threshold_max', - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), - 'feature':('feature_id', - "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), - 'time':('feature_time','time of the feature, consistent with feature_time_index'), - 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), - 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), - 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), - 'lat':('feature_latitude','latitude of the feature'), - 'lon':('feature_longitude','longitude of the feature'), - 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), - 'areas':('feature_area',), - 'isolated':('feature_isolation_flag',), - 'num_objects':('number_of_feature_neighbors',), - 'cell':('feature_parent_cell_id',), - 
'time_cell':('feature_parent_cell_elapsed_time',), - 'segmentation_mask':('2d segmentation mask',) - } - new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys()} - - TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) - # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of - # the 'index' variable and call the dimension 'feature' - ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), - Mask]) - - # Add the projection data back in - ds['ProjectionCoordinateSystem']=Projection - - # Convert the cell ID variable from float to integer - if 'int' not in str(TrackedFeatures.cell.dtype): - # The raw output from the tracking is actually an object array - # array([nan, 2, 3], dtype=object) - # (and is cast to a float array when saved as NetCDF, I think). - # Cast to float. - int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') - - cell_id_data = TrackedFeatures.cell.astype('float64') - valid_cell = np.isfinite(cell_id_data) - valid_cell_ids = cell_id_data[valid_cell] - if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') - int_cell[valid_cell] = valid_cell_ids.astype('int64') - #ds['feature_parent_cell_id'] = int_cell - return ds - - - -def merge_split(TRACK,distance = 25000,frame_len = 5): - ''' - function to postprocess tobac track data for merge/split cells - Input: - TRACK: xarray dataset of tobac Track information - - distance: float, optional distance threshold prior to adding a pair of points - into the minimum spanning tree. Default is 25000 meters. - - frame_len: float, optional threshold for the spanning length within which two points - can be separated. Default is five (5) frames. 
- - - Output: - d: xarray dataset of - feature position along 1st horizontal dimension - hdim2_index: float - feature position along 2nd horizontal dimension - - Example: - d = merge_split(Track) - ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) - # both_ds = xarray.combine_by_coords((ds,d), compat='override') - both_ds = xr.merge([ds, d],compat ='override') - both_ds = compress_all(both_ds) - both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - ''' - track_groups = TRACK.groupby('cell') - cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} - id_data = np.fromiter(cell_ids.keys(), dtype=int) - count_data = np.fromiter(cell_ids.values(), dtype=int) - all_frames = np.sort(np.unique(TRACK.frame)) - a_points = list() - b_points = list() - a_names = list() - b_names = list() - dist = list() - - - for i in id_data: - #print(i) - a_pointx = track_groups[i].projection_x_coordinate[-1].values - a_pointy = track_groups[i].projection_y_coordinate[-1].values - for j in id_data: - b_pointx = track_groups[j].projection_x_coordinate[0].values - b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) - if d <= distance: - a_points.append([a_pointx,a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - - - - - -# for i in id_data: -# a_pointx = track_groups[i].grid_longitude[-1].values -# a_pointy = track_groups[i].grid_latitude[-1].values -# for j in id_data: -# b_pointx = track_groups[j].grid_longitude[0].values -# b_pointy = track_groups[j].grid_latitude[0].values -# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km -# if d <= distance: -# a_points.append([a_pointx,a_pointy]) -# b_points.append([b_pointx, b_pointy]) -# dist.append(d) -# a_names.append(i) -# b_names.append(j) - - id = [] - for i in range(len(dist)-1, -1, -1): - if a_names[i] == b_names[i]: - id.append(i) - a_points.pop(i) - b_points.pop(i) - dist.pop(i) - a_names.pop(i) - b_names.pop(i) - else: - continue - - g = Graph() - for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i],weight=dist[i]) - - tree = minimum_spanning_edges(g) - tree_list = list(minimum_spanning_edges(g)) - - new_tree = [] - for i,j in enumerate(tree_list): - frame_a = np.nanmax(track_groups[j[0]].frame.values) - frame_b = np.nanmin(track_groups[j[1]].frame.values) - if np.abs(frame_a - frame_b) <= frame_len: - new_tree.append(tree_list[i][0:2]) - new_tree_arr = np.array(new_tree) - - TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) - cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) - track_id = dict() #same size as number of total merged tracks - - arr = np.array([0]) - for p in cell_id: - j = np.where(arr == int(p)) - if len(j[0]) > 0: - continue - else: - k = np.where(new_tree_arr == p) - if len(k[0]) == 0: - track_id[p] = [p] - arr = np.append(arr,p) - else: - temp1 = list(np.unique(new_tree_arr[k[0]])) - temp = list(np.unique(new_tree_arr[k[0]])) - - for l in range(15): - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - temp = list(flatten(temp)) - temp = list(np.unique(temp)) - - if len(temp1) == len(temp): - break - temp1 = np.array(temp) - - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - - temp = list(flatten(temp)) - temp = 
list(np.unique(temp)) - arr = np.append(arr,np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - - - storm_id = [0] #default because we don't track larger storm systems *yet* - print('found storm id') - - - track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* - print('found track parent storm ids') - - track_ids = np.array(list(track_id.keys())) - print('found track ids') - - - cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) - print('found cell ids') - - cell_parent_track_id = [] - - for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: - cell_parent_track_id.append(int(id)) - - else: - cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) - - - cell_parent_track_id = list(flatten(cell_parent_track_id)) - print('found cell parent track ids') - - feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - - print('found feature parent cell ids') - - #This version includes all the feature regardless of if they are used in cells or not. - feature_id = list(TRACK.feature.values.astype(int)) - print('found feature ids') - - feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm - print('found feature parent storm ids') - - feature_parent_track_id = [] - feature_parent_track_id = np.zeros(len(feature_id)) - for i, id in enumerate(feature_id): - cellid = feature_parent_cell_id[i] - if np.isnan(cellid): - feature_parent_track_id[i] = -1 - else: - j = np.where(cell_id == cellid) - j = np.squeeze(j) - trackid = cell_parent_track_id[j] - feature_parent_track_id[i] = trackid - - print('found feature parent track ids') - - - storm_child_track_count = [len(track_id)] - print('found storm child track count') - - track_child_cell_count = [] - for i,id in enumerate(track_id): - track_child_cell_count.append(len(track_id[int(id)])) - print('found track child cell count') - - - cell_child_feature_count = [] - for i,id in enumerate(cell_id): - cell_child_feature_count.append(len(track_groups[id].feature.values)) - print('found cell child feature count') - - storm_child_cell_count = [len(cell_id)] - storm_child_feature_count = [len(feature_id)] - - storm_dim = 'nstorms' - track_dim = 'ntracks' - cell_dim = 'ncells' - feature_dim = 'nfeatures' - - d = xr.Dataset({ - 'storm_id': (storm_dim, storm_id), - 'track_id': (track_dim, track_ids), - 'track_parent_storm_id': (track_dim, track_parent_storm_id), - 'cell_id': (cell_dim, cell_id), - 'cell_parent_track_id': (cell_dim, cell_parent_track_id), - 'feature_id': (feature_dim, feature_id), - 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), - 'feature_parent_track_id': (feature_dim, feature_parent_track_id), - 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), - 'storm_child_track_count': (storm_dim, storm_child_track_count), - 'storm_child_cell_count': (storm_dim, storm_child_cell_count), - 'storm_child_feature_count': (storm_dim, storm_child_feature_count), - 'track_child_cell_count': (track_dim, track_child_cell_count), - 'cell_child_feature_count': (cell_dim, cell_child_feature_count), - }) - d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) - - assert len(track_id) == len(track_parent_storm_id) - assert len(cell_id) == len(cell_parent_track_id) - assert len(feature_id) == len(feature_parent_cell_id) - assert sum(storm_child_track_count) == len(track_id) - assert sum(storm_child_cell_count) == 
len(cell_id) - assert sum(storm_child_feature_count) == len(feature_id) - assert sum(track_child_cell_count) == len(cell_id) - assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) - - return d \ No newline at end of file From e3fa302a1d7a39f470e613dcb0c2e6a258458bfa Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 5 Jun 2022 20:33:26 -0500 Subject: [PATCH 5/6] "Create merge_split.py" This reverts commit 547bd2a7eb2c74061360197e814ecbe277386819. Attempting to correct some incorrect merging. Should include only the merge_split.py file. --- tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' 
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'hdim_2':('feature_hdim2_coordinate',
+                  'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x).'
+                  'The numbering is consistent with positional indexing of the coordinate, but can be'
+                  'fractional, to account for a centroid not aligned to the grid.'),
+        'idx':('feature_id_this_frame',
+               'Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments'),
+        'num':('feature_grid_cell_count',
+               'Number of grid points that are within the threshold of this feature'),
+        'threshold_value':('feature_threshold_max',
+                           'Maximum threshold value at which this feature was detected'),
+        'feature':('feature_id',
+                   "Unique number of the feature; starts from 1 and increments by 1 to the number of features"),
+        'time':('feature_time','time of the feature, consistent with feature_time_index'),
+        'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'),
+        'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'),
+        'lat':('feature_latitude','latitude of the feature'),
+        'lon':('feature_longitude','longitude of the feature'),
+        'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'),
+        'areas':('feature_area',),
+        'isolated':('feature_isolation_flag',),
+        'num_objects':('number_of_feature_neighbors',),
+        'cell':('feature_parent_cell_id',),
+        'time_cell':('feature_parent_cell_elapsed_time',),
+        'segmentation_mask':('2d segmentation mask',)
+    }
+    new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys()
+                             if k in TrackedFeatures.variables.keys()}
+
+    TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id'])
+    # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of
+    # the 'index' variable and call the dimension 'feature'
+    ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names),
+                   Mask])
+
+    # Add the projection data back in
+    ds['ProjectionCoordinateSystem']=Projection
+
+    # Convert the cell ID variable from float to integer
+    if 'int' not in str(TrackedFeatures.cell.dtype):
+        # The raw output from the tracking is actually an object array
+        # array([nan, 2, 3], dtype=object)
+        # (and is cast to a float array when saved as NetCDF, I think).
+        # Cast to float first so the NaN (untracked) entries can be masked, then fill an integer array.
+ int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. + + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + 
TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. 
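# Editorial sketch, not part of the patch: the bounded fixed-point loop above
# (for l in range(15)) is collecting the transitive closure of the surviving
# MST edges, i.e. connected components. networkx computes the same thing
# without the hard-coded sweep limit; new_tree_arr holds the (cell, cell)
# pairs that passed the distance and frame gates.
comp_graph = Graph()                       # networkx is star-imported above
comp_graph.add_edges_from(new_tree_arr)
components = [sorted(c) for c in connected_components(comp_graph)]
# Each component corresponds to one merged track; the patch keys track_id by
# the largest cell id in the component (np.nanmax of the member list).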
+ feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From 728cee0fec9ddaa2ba6f93a18c6dac3157d1a500 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 5 Jun 2022 20:51:23 -0500 Subject: [PATCH 6/6] Reverting a segmentation file. Replaced former segmentation file with isolated feature changes. 
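The message above points back to the isolation work from the first two patches, so a compact restatement of that test may help. This is an editorial sketch under the patches' own assumptions (a 2-D integer label image, skimage's grey-scale dilation, and the default ISO_dilate of 8), not code from the series. Each feature's labelled region is grown by a disk-shaped footprint; any other label found inside the grown footprint counts as a neighbour. Two readings of the patch are worth noting: its 'isolated' flag is set True when len(objects) >= 3, i.e. when at least one neighbour exists, so the flag effectively means 'has neighbours'; and where two grown regions overlap, grey dilation keeps the larger label, exactly as in the patch.

    import numpy as np
    from skimage.morphology import dilation, disk

    def neighbour_count(segmentation_mask, feature_label, iso_dilate=8):
        # Grow every labelled region by a disk of radius iso_dilate, then sample
        # the original mask inside the grown footprint of this feature. The
        # unique values include background (0), the label itself, and any
        # neighbouring labels the grown footprint now touches.
        grown = dilation(segmentation_mask, disk(iso_dilate))
        labels = np.unique(segmentation_mask[grown == feature_label])
        return len(labels) - 1  # as in the patch, only the background 0 is removed

A feature then has neighbours when neighbour_count(...) >= 2, which matches the patch's len(objects) >= 3 test before background is subtracted.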
--- build/lib/tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 build/lib/tobac/merge_split.py diff --git a/build/lib/tobac/merge_split.py b/build/lib/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/build/lib/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'hdim_2':('feature_hdim2_coordinate', + 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'idx':('feature_id_this_frame',), + 'num':('feature_grid_cell_count', + 'Number of grid points that are within the threshold of this feature'), + 'threshold_value':('feature_threshold_max', + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), + 'feature':('feature_id', + "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), + 'time':('feature_time','time of the feature, consistent with feature_time_index'), + 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), + 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), + 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), + 'lat':('feature_latitude','latitude of the feature'), + 'lon':('feature_longitude','longitude of the feature'), + 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), + 'areas':('feature_area',), + 'isolated':('feature_isolation_flag',), + 'num_objects':('number_of_feature_neighbors',), + 
'cell':('feature_parent_cell_id',), + 'time_cell':('feature_parent_cell_elapsed_time',), + 'segmentation_mask':('2d segmentation mask',) + } + new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys()} + + TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), + Mask]) + + # Add the projection data back in + ds['ProjectionCoordinateSystem']=Projection + + # Convert the cell ID variable from float to integer + if 'int' not in str(TrackedFeatures.cell.dtype): + # The raw output from the tracking is actually an object array + # array([nan, 2, 3], dtype=object) + # (and is cast to a float array when saved as NetCDF, I think). + # Cast to float. + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. 
+ + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = 
list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. + feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == 
len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file
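Putting the series together, the intended post-processing flow is the one sketched in the merge_split docstring's Example section. Track, refl_mask, data and savedir below are placeholders taken from that Example, not objects defined anywhere in the patches:

    import os
    import xarray as xr
    from tobac.merge_split import merge_split, standardize_track_dataset, compress_all

    d = merge_split(Track, distance=25000, frame_len=5)
    ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem'])
    both_ds = xr.merge([ds, d], compat='override')
    both_ds = compress_all(both_ds)       # set zlib encoding on 2-D and larger variables
    both_ds.to_netcdf(os.path.join(savedir, 'Track_features_merges.nc'))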