Replies: 1 comment
-
Sorry for getting back to you late! The fitness function is called once for each individual solution (or strain, as you put it). This means the data sampling will differ from one solution to another. If you want to sample once for all solutions (strains) in the same generation (iteration), then you can do the following:
import numpy as np
import pygad
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.random import sample_without_replacement
# Candidate values for each tuned RandomForestClassifier hyperparameter.
# The position of an entry is the index of the gene it constrains.
gene_space = [
    np.linspace(50, 200, num=25, dtype=int),    # gene 0: n_estimators
    np.linspace(2, 10, num=5, dtype=int),       # gene 1: min_samples_split
    np.linspace(1, 10, num=5, dtype=int),       # gene 2: min_samples_leaf
    np.linspace(0, 1, num=10, dtype=float),     # gene 3: min_impurity_decrease
]
# Initial split and row sample, used by the first generation.
# `data`, `y_name` and `sample_size` are expected to be defined before this
# point; they are not created anywhere in this snippet.

# Feature columns are everything except the target column `y_name`.
X, y = data.drop(columns=[y_name]), data[y_name]

# Half of the rows go to the training part, half to the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Draw `sample_size` distinct row positions from each part (train first,
# then test, so the random draws happen in that order).
train_idx, test_idx = (
    sample_without_replacement(n_population=len(part), n_samples=sample_size)
    for part in (X_train, X_test)
)
def on_generation(ga_instance):
    """Re-split the data and draw a new row sample for the next generation.

    Meant to be registered as the ``on_generation`` callback of ``pygad.GA``
    so that all solutions of one generation are scored on the same sample,
    while different generations see different samples.

    Args:
        ga_instance: The GA object supplied by the caller; not used here.

    Side effects:
        Rebinds the module-level ``X_train``, ``X_test``, ``y_train``,
        ``y_test``, ``train_idx`` and ``test_idx``.
    """
    global X_train, X_test, y_train, y_test, train_idx, test_idx

    # X and y do not change between generations, so the module-level objects
    # are reused instead of being rebuilt from `data` on every call.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    train_idx = sample_without_replacement(
        n_population=len(X_train), n_samples=sample_size
    )
    test_idx = sample_without_replacement(
        n_population=len(X_test), n_samples=sample_size
    )
def fitness_function_factory(hyperparameters, data, y_name, sample_size):
    """Build the fitness function that scores one hyperparameter combination.

    The four parameters are kept for interface compatibility but are not read
    here. The returned function looks up the module-level ``X_train``,
    ``X_test``, ``y_train``, ``y_test``, ``train_idx`` and ``test_idx`` each
    time it is called, so it always uses whatever sample is current.

    Args:
        hyperparameters: Unused.
        data: Unused.
        y_name: Unused.
        sample_size: Unused.

    Returns:
        A ``fitness_function(solution, solution_idx)`` callable that fits a
        ``RandomForestClassifier`` on the sampled training rows and returns
        its ``score`` on the sampled test rows.
    """
    def fitness_function(solution, solution_idx):
        # NOTE(review): newer PyGAD releases appear to call fitness functions
        # as (ga_instance, solution, solution_idx) - confirm against the
        # installed version and add the leading parameter if required.

        # Genes arrive as floats unless an integer gene type is configured,
        # but the three count-like hyperparameters must be true integers
        # (scikit-learn treats a float min_samples_* as a fraction).
        model = RandomForestClassifier(
            n_estimators=int(solution[0]),
            min_samples_split=int(solution[1]),
            min_samples_leaf=int(solution[2]),
            min_impurity_decrease=float(solution[3]),
        )

        # train_idx / test_idx hold positional row numbers, so select rows
        # with .iloc; plain [] on a DataFrame would look up column labels.
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        return model.score(X_test.iloc[test_idx], y_test.iloc[test_idx])

    return fitness_function
# Genetic-algorithm search over the hyperparameter ranges in `gene_space`.
cross_validation = pygad.GA(
    gene_space=gene_space,
    # One gene per entry in gene_space; PyGAD needs this to build the
    # initial population when no initial_population is supplied.
    num_genes=len(gene_space),
    # Pass the function the factory returns, not the factory itself.
    fitness_func=fitness_function_factory(gene_space, data, y_name, sample_size),
    # Resample the data once per generation so every solution in a
    # generation is evaluated on the same rows.
    on_generation=on_generation,
    num_generations=100,
    num_parents_mating=4,
    sol_per_pop=8,
    parent_selection_type="sss",
    keep_parents=2,
    crossover_type="single_point",
    mutation_type="random",
    mutation_percent_genes=10,
)
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
I'm using PyGAD for cross-validation hyper-parameter tuning. I sample train/test data in the fitness function, and I'm unclear whether every strain i in iteration x will have access to the same sampled data, or whether the sampling will differ across strains within the same iteration.
Beta Was this translation helpful? Give feedback.
All reactions