Merge pull request #51 from outbrain/more-tests

Benchmark also tests for actual scales of scores
outbrain-inc · Oct 16, 2023 · 4bd8ca0
2 parents 5f4bf26 + 0327838
commit 4bd8ca0
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 2 deletions.
diff --git a/benchmarks/generator_naive.py b/benchmarks/generator_naive.py
@@ -13,6 +13,7 @@ def generate_random_matrix(num_features, size=2000000):
     target = sample[:, 30]
     # Some noise
 
+    sample[:, 31] = target * 19
     target[target < 20] = 0
     return sample, target
 
@@ -62,11 +63,41 @@ def generate_random_matrix(num_features, size=2000000):
         rankings = pd.read_csv(
             os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t',
         )
-        if rankings.iloc[1]['Feature'] != 'f30-(81; 100)':
+
+        rankings_pairwise = pd.read_csv(
+            os.path.join(args.verify_outputs, 'pairwise_ranks.tsv'), sep='\t',
+        )
+
+        # Partial match test
+        if rankings.iloc[2]['Feature'] != 'f31-(90; 100)' and rankings.iloc[2]['Score MI-numba-randomized'] > 0.9:
+            raise Exception(
+                f'Could not retrieve the appropriate second-ranked feature needle in the haystack {rankings.iloc[2].Feature}, exiting',
+            )
+        else:
+            logger.info(
+                f'Identified the appropriate second-ranked feature in the haystack ({rankings.iloc[1].Feature})',
+            )
+
+        # Test of direct retrievals
+        if rankings.iloc[1]['Feature'] != 'f30-(81; 100)' and rankings.iloc[2]['Score MI-numba-randomized'] > 0.99:
             raise Exception(
                 f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting',
             )
         else:
             logger.info(
                 f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})',
             )
+
+
+        # Tests related to pairwise rankings
+        sorted_by_scores = rankings_pairwise.sort_values(by=['Score', 'FeatureA'])
+
+        if len(sorted_by_scores) < 10000:
+            raise Exception('Number of pairwise comparisons insufficient!')
+        else:
+            logger.info('Found enough pairwise comparisons ..')
+
+        if sorted_by_scores.iloc[-1]['FeatureA'] == 'f45-(90; 100)' and sorted_by_scores.iloc[-1]['FeatureB'] == 'f45-(90; 100)' and sorted_by_scores.iloc[-1]['Score'] > 1.0:
+            logger.info('Similarity check passed for f45 ..')
+        else:
+            raise Exception('Most similar features not identified ..')
diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh
@@ -14,11 +14,14 @@ then
     python generator_naive.py --output_df_name dataset_naive --num_features 100 --size 10000;
 
     # Substantial subsampling must retrieve the needle.
-    outrank --data_path dataset_naive --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-randomized --target_ranking_only True --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 20000;
+    outrank --data_path dataset_naive --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-randomized --target_ranking_only False --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 20000;
 
     python generator_naive.py --verify_outputs ranking_outputs;
 
     rm -r ranking_outputs dataset_naive;
+
+    python generator_naive.py --output_df_name dataset_naive --num_features 100 --size 10000;
+
     exit
 fi
 ###################################################################