@@ -320,9 +320,21 @@ def columns_metadata(df):
320320 return columns
321321
322322
def process_aid_columns(arg):
    """Normalize the AID-columns argument into a list.

    Parameters:
        arg: Value supplied for the AID columns; may be a list, a tuple,
            a single string, or anything else.

    Returns:
        The same list object when `arg` is already a list, a new list built
        from a tuple, a one-element list wrapping a string, and an empty
        list for every other kind of value.
    """
    # A lone string is treated as a single column name, not as an iterable.
    if isinstance(arg, str):
        return [arg]

    if isinstance(arg, tuple):
        return list(arg)

    # Lists pass through untouched; unsupported values collapse to "no columns".
    return arg if isinstance(arg, list) else []
332+
333+
323334def main (
324335 input_path : str ,
325336 output_path : str ,
337+ aid_columns : list [str ] = [],
326338 ml_target : str = None ,
327339 ml_features_only : bool = False ,
328340 syndiffix_args : str = '' ,
@@ -333,6 +345,7 @@ def main(
333345 Parameters:
334346 input_path: Path of input CSV file.
335347 output_path: Path of output CSV file.
348+ aid_columns: Entity identifier columns. If not specified, assumes one row per entity.
336349 ml_target: If specified, focuses on this column for better ML prediction.
337350 ml_features_only: If set, limits columns to only ML features of ml_target.
338351 syndiffix_args: Extra arguments to pass to syndiffix.
@@ -347,14 +360,19 @@ def main(
347360
348361 extra_args = []
349362
363+ aid_columns = process_aid_columns (aid_columns )
364+ if len (aid_columns ) > 0 :
365+ print (f'AID Columns: { aid_columns } ' )
366+ extra_args += ['--aidcolumns' , * aid_columns ]
367+
350368 if ml_target :
351369 print ('ML Target: ' + ml_target )
352370
353371 print ('Selecting ML features...' )
354372 features = select_features_ml (df , ml_target )['kFeatures' ]
355373 print ('ML Features: ' + (', ' .join (features )))
356374
357- extra_args = [
375+ extra_args + = [
358376 '--clustering-maincolumn' , ml_target ,
359377 '--clustering-mainfeatures' , * features
360378 ]
0 commit comments