Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Reduce risk of getting stuck on Windows #5

Merged
merged 4 commits into from
Oct 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions src/gbmtsplits/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def is_convertible(value):
values = self.df[task].dropna().unique()
values_is_numerical = [is_convertible(value) for value in values]

# Check in non-convertible strings
# Check if contains non-convertible strings
if not (all(values_is_numerical)): # Some non numerical values

if any(values_is_numerical): # But some values are numerical
Expand All @@ -323,19 +323,13 @@ def is_convertible(value):
logger.info(f'Classification task {task} stratified into {len(self.df[task].dropna().unique())} tasks.')
# Regression: bin data and use as tasks
else:
bin_size = len(values) // self.stratify_reg_nbins
sorted_values = np.sort(values)
bins = [sorted_values[i:i + bin_size] for i in range(0, len(sorted_values), bin_size)]
bins = np.array_split(sorted_values, self.stratify_reg_nbins)
for i, bin in enumerate(bins):
key = f'{task}_{bin[0]:.2f}_{bin[-1]:.2f}'
self.df[key] = self.df[task].apply(lambda x: x if x in bin else np.nan)
self.tasks_for_balancing.append(key)
logger.info(f'Regression task {task} stratified into {self.stratify_reg_nbins} tasks.')




# self.tasks_for_balancing.append(task)
else:
self.tasks_for_balancing = self.original_tasks

Expand Down Expand Up @@ -377,7 +371,7 @@ def _merge_clusters_with_balancing_mapping(
tasks_vs_clusters_array : np.array,
sizes : list[float] = [0.9, 0.1, 0.1],
equal_weight_perc_compounds_as_tasks : bool = False,
relative_gap : float = 0,
absolute_gap : float = 1e-3,
time_limit_seconds : int = 60*60,
max_N_threads : int = 1,
preassigned_clusters : dict[int, int] | None = None) -> list[list[int]]:
Expand All @@ -403,11 +397,10 @@ def _merge_clusters_with_balancing_mapping(
equal_weight_perc_compounds_as_tasks : bool
- if True, matching the % records will have the same weight as matching the % data of individual tasks.
- if False, matching the % records will have a weight X times larger than the X tasks.
relative_gap : float
- the relative gap between the absolute optimal objective and the current one at which the solver
absolute_gap : float
- the absolute gap between the absolute optimal objective and the current one at which the solver
stops and returns a solution. Can be very useful for cases where the exact solution requires
far too long to be found to be of any practical use.
- set to 0 to obtain the absolute optimal solution (if reached within the time_limit_seconds)
time_limit_seconds : int
- the time limit in seconds for the solver (by default set to 1 hour)
- after this time, whatever solution is available is returned
Expand Down Expand Up @@ -465,6 +458,12 @@ def _merge_clusters_with_balancing_mapping(
# Create WML
sk_harmonic = (1 / fractional_sizes) / np.sum(1 / fractional_sizes)

# Round all values to 3 decimals to reduce computational time
A = np.round(A, 3)
fractional_sizes = np.round(fractional_sizes, 3)
obj_weights = np.round(obj_weights, 3)
sk_harmonic = np.round(sk_harmonic, 3)

# Create the pulp model
prob = LpProblem("Data_balancing", LpMinimize)

Expand Down Expand Up @@ -508,12 +507,9 @@ def _merge_clusters_with_balancing_mapping(
prob += LpAffineExpression([(x[c+m*N],A[t,c]) for c in cs]) + X[t] >= fractional_sizes[m]

# Solve the model
prob.solve(PULP_CBC_CMD(gapRel = relative_gap, timeLimit = time_limit_seconds, threads = max_N_threads, msg=False))
#solver.tmpDir = "/zfsself.df/self.df/erik/erik-rp1/pQSAR/scaffoldsplit_trial/tmp"
#prob.solve(solver)
prob.solve(PULP_CBC_CMD(gapAbs = absolute_gap, timeLimit = time_limit_seconds, threads = max_N_threads, msg=False))

# Extract the solution

list_binary_solution = [value(x[i]) for i in range(N * S)]
list_initial_cluster_indices = [(list(range(N)) * S)[i] for i,l in enumerate(list_binary_solution) if l == 1]
list_final_ML_subsets = [(list((1 + np.repeat(range(S), N)).astype('int64')))[i] for i,l in enumerate(list_binary_solution) if l == 1]
Expand Down