outbrain-inc · bmramor · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024 · SkBlaz
diff --git a/outrank/__main__.py b/outrank/__main__.py
@@ -159,7 +159,7 @@ def main():
         '--feature_set_focus',
         type=str,
         default=None,
-        help='Collection of which feature transformations to consider',
+        help='Collection of which feature transformations to consider. To consider only --reference_model_JSON features, set _all_from_reference_JSON',
     )
 
     parser.add_argument(
@@ -238,7 +238,13 @@ def main():
         default=1.0,
         help='If < 1.0, MI algorithm will further subsample data in stratified manner (equal distributions per value if possible).',
     )
-
+
+    parser.add_argument(
+        '--histogram_max_bins',
+        type=int,
+        default=100,
+        help='Number of histogram bins in value_repetitions.json',
+    )
 
     args = parser.parse_args()
 

diff --git a/outrank/task_ranking.py b/outrank/task_ranking.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas as pd
+from statistics import quantiles
 
 from outrank.algorithms.importance_estimator import rank_features_3MR
 from outrank.core_ranking import estimate_importances_minibatches
@@ -280,9 +281,16 @@ def outrank_task_conduct_ranking(args: Any) -> None:
     with open(f'{args.output_folder}/value_repetitions.json', 'w') as out_counts:
         out_dict = {}
         for k, v in GLOBAL_ITEM_COUNTS.items():
-            actual_hist = np.array(list(v.default_counter.values()))
+            frequencies = np.array(list(v.default_counter.values()))
             more_than = lambda n, ary: len(np.where(ary > n)[0])
-            out_dict[k] = {x: more_than(x, actual_hist)  for x in [0] + [1 * 10 ** x for x in range(6)]}
+            out_dict[k] = {str(x): str(more_than(x, frequencies))  for x in [0] + [1 * 10 ** x for x in range(6)]}
+            # Convert to plain Python numbers before stringifying: str() of a list of NumPy
+            # scalars depends on the NumPy version (2.x renders 'np.int64(1)' instead of '1').
+            frequency_values = frequencies.tolist()
+            if len(frequency_values) < args.histogram_max_bins:
+                out_dict[k]['quantiles'] = str(sorted(frequency_values))
+            else:
+                out_dict[k]['quantiles'] = str(quantiles(frequency_values, n=args.histogram_max_bins))
         out_counts.write(json.dumps(out_dict))
 
     with open(f'{args.output_folder}/combination_estimation_counts.json', 'w') as out_counts: