sohviluukkonen · sohviluukkonen · Oct 6, 2023 · Sep 25, 2023 · Oct 5, 2023 · Oct 5, 2023
diff --git a/src/gbmtsplits/split.py b/src/gbmtsplits/split.py
@@ -302,7 +302,7 @@ def is_convertible(value):
                 values = self.df[task].dropna().unique()
                 values_is_numerical = [is_convertible(value) for value in values]
 
-                # Check in non-convertible strings
+                # Check whether the values contain any non-convertible strings
                 if not (all(values_is_numerical)): # Some non numerical values
 
                     if any(values_is_numerical): # But some values are numerical
@@ -323,19 +323,13 @@ def is_convertible(value):
                     logger.info(f'Classification task {task} stratified into {len(self.df[task].dropna().unique())} tasks.')
                 # Regression: bin data and use as tasks
                 else:
-                    bin_size = len(values) // self.stratify_reg_nbins
                     sorted_values = np.sort(values)
-                    bins = [sorted_values[i:i + bin_size] for i in range(0, len(sorted_values), bin_size)]
+                    bins = np.array_split(sorted_values, self.stratify_reg_nbins)
                     for i, bin in enumerate(bins):
                         key = f'{task}_{bin[0]:.2f}_{bin[-1]:.2f}'
                         self.df[key] = self.df[task].apply(lambda x: x if x in bin else np.nan)
                         self.tasks_for_balancing.append(key)
                     logger.info(f'Regression task {task} stratified into {self.stratify_reg_nbins} tasks.')
-
-
-
-
-                    # self.tasks_for_balancing.append(task)
         else:
             self.tasks_for_balancing = self.original_tasks
 
@@ -377,7 +371,7 @@ def _merge_clusters_with_balancing_mapping(
             tasks_vs_clusters_array : np.array,
             sizes : list[float] = [0.9, 0.1, 0.1], 
             equal_weight_perc_compounds_as_tasks : bool = False,
-            relative_gap : float = 0,
+            absolute_gap : float = 1e-3,
             time_limit_seconds : int = 60*60,
             max_N_threads : int = 1,
             preassigned_clusters : dict[int, int] | None = None) -> list[list[int]]:
@@ -403,11 +397,10 @@ def _merge_clusters_with_balancing_mapping(
             equal_weight_perc_compounds_as_tasks : bool
                 - if True, matching the % records will have the same weight as matching the % self.df of individual tasks.
                 - if False, matching the % records will have a weight X times larger than the X tasks.
-            relative_gap : float
-                - the relative gap between the absolute optimal objective and the current one at which the solver
+            absolute_gap : float
+                - the absolute gap between the optimal objective and the current one at which the solver
                 stops and returns a solution. Can be very useful for cases where the exact solution requires
                 far too long to be found to be of any practical use.
-                - set to 0 to obtain the absolute optimal solution (if reached within the time_limit_seconds)
             time_limit_seconds : int
                 - the time limit in seconds for the solver (by default set to 1 hour)
                 - after this time, whatever solution is available is returned
@@ -465,6 +458,12 @@ def _merge_clusters_with_balancing_mapping(
             # Create WML
             sk_harmonic = (1 / fractional_sizes) / np.sum(1 / fractional_sizes)
 
+            # Round all values to 3 decimal places to reduce computational time
+            A = np.round(A, 3)
+            fractional_sizes = np.round(fractional_sizes, 3)
+            obj_weights = np.round(obj_weights, 3)
+            sk_harmonic = np.round(sk_harmonic, 3)           
+
             # Create the pulp model
             prob = LpProblem("Data_balancing", LpMinimize)
 
@@ -508,12 +507,9 @@ def _merge_clusters_with_balancing_mapping(
                     prob += LpAffineExpression([(x[c+m*N],A[t,c]) for c in cs]) + X[t] >= fractional_sizes[m]
 
             # Solve the model
-            prob.solve(PULP_CBC_CMD(gapRel = relative_gap, timeLimit = time_limit_seconds, threads = max_N_threads, msg=False))
-            #solver.tmpDir = "/zfsself.df/self.df/erik/erik-rp1/pQSAR/scaffoldsplit_trial/tmp"
-            #prob.solve(solver)
+            prob.solve(PULP_CBC_CMD(gapAbs = absolute_gap, timeLimit = time_limit_seconds, threads = max_N_threads, msg=False))
 
             # Extract the solution
-
             list_binary_solution = [value(x[i]) for i in range(N * S)]
             list_initial_cluster_indices = [(list(range(N)) * S)[i] for i,l in enumerate(list_binary_solution) if l == 1]
             list_final_ML_subsets = [(list((1 + np.repeat(range(S), N)).astype('int64')))[i] for i,l in enumerate(list_binary_solution) if l == 1]