Support AID columns in Python wrapper

edongashi · edongashi · commit a604b2c8f38b · 2023-07-31T17:32:17.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,8 +2,8 @@
 
 ### Version 1.1.0
 
-- Added `syndiffix.py` python wrapper for ML feature selection.
-- Lowered default thresholds for range and singularity nodes to 15 and 5.
+- Added Python wrapper for auto-detecting column types and main features for ML.
+- Lowered default thresholds for range and singularity nodes and raised default tree depth limit.
 - Improved clustering algorithm for main column.
 - Added `--output` (`-o`) CLI argument to directly save the CSV file to disk.
 - Added `--clustering-mainfeatures <features>` CLI argument to specify main column's ML features.
diff --git a/syndiffix.py b/syndiffix.py
@@ -320,9 +320,21 @@ def columns_metadata(df):
     return columns
 
 
+def process_aid_columns(arg):
+    if isinstance(arg, list):
+        return arg
+    elif isinstance(arg, tuple):
+        return list(arg)
+    elif isinstance(arg, str):
+        return [arg]
+    else:
+        return []
+
+
 def main(
         input_path: str,
         output_path: str,
+        aid_columns: list[str] = [],
         ml_target: str = None,
         ml_features_only: bool = False,
         syndiffix_args: str = '',
@@ -333,6 +345,7 @@ def main(
     Parameters:
         input_path: Path of input CSV file.
         output_path: Path of output CSV file.
+        aid_columns: Entity identifier columns. If not specified, assumes one row per entity.
         ml_target: If specified, focuses on this column for better ML prediction.
         ml_features_only: If set, limits columns to only ML features of ml_target.
         syndiffix_args: Extra arguments to pass to syndiffix.
@@ -347,14 +360,19 @@ def main(
 
     extra_args = []
 
+    aid_columns = process_aid_columns(aid_columns)
+    if len(aid_columns) > 0:
+        print(f'AID Columns: {aid_columns}')
+        extra_args += ['--aidcolumns', *aid_columns]
+
     if ml_target:
         print('ML Target: ' + ml_target)
 
         print('Selecting ML features...')
         features = select_features_ml(df, ml_target)['kFeatures']
         print('ML Features: ' + (', '.join(features)))
 
-        extra_args = [
+        extra_args += [
             '--clustering-maincolumn', ml_target,
             '--clustering-mainfeatures', *features
         ]