diff --git a/mslearn/pipeline.py b/mslearn/pipeline.py
index 7c3b809b..b01bdb17 100644
--- a/mslearn/pipeline.py
+++ b/mslearn/pipeline.py
@@ -203,8 +203,8 @@ def benchmark(self, df, target, test_spec=0.2):
                 If the test spec is a float, it specifies the fraction of the
                 dataframe to be randomly selected for testing (must be a
                 number between 0-1). test_spec=0 means a CV-only validation.
-                If test_spec is a list/ndarray, it is the indexes of the
-                dataframe to use for - this option is useful if you
+                If test_spec is a list/ndarray, it is the iloc indexes of the
+                dataframe to use for testing. This option is useful if you
                 are comparing multiple techniques and want to use the same
                 test or validation fraction across benchmarks.
 
@@ -224,11 +224,11 @@ def benchmark(self, df, target, test_spec=0.2):
         # Split data for steps where combined transform could otherwise over-fit
         # or leak data from validation set into training set.
         if isinstance(test_spec, Iterable):
-            msk = test_spec
+            traindf = df.iloc[~np.asarray(test_spec)]
+            testdf = df.iloc[np.asarray(test_spec)]
         else:
-            msk = np.random.rand(len(df)) < test_spec
-        traindf = df.iloc[~np.asarray(msk)]
-        testdf = df.iloc[msk]
+            testdf, traindf = np.split(df.sample(frac=1),
+                                       [int(test_spec * len(df))])
         self.logger.info("Dataframe split into training and testing fractions"
                          " having {} and {} samples.".format(traindf.shape[0],
                                                              testdf.shape[0]))
@@ -354,15 +354,17 @@ def MatPipeFast(**kwargs):
 #     print("Validation error is {}".format(mean_squared_error(df[target], df[target + " predicted"])))
 
     mp = MatPipe(**debug_config)
-    df = mp.benchmark(df, target, test_spec=validation_ix)
-    print(df)
-    print("Validation error is {}".format(
-        mean_squared_error(df[target], df[target + " predicted"])))
-    print(mp.digest())
-    mp.save("somepipe.p")
-
-    mp = MatPipe.load("somepipe.p")
-    print(mp.predict(df2, target))
+    # df = mp.benchmark(df, target, test_spec=validation_ix)
+
+    df = mp.benchmark(df, target, test_spec=0.25)
+    # print(df)
+    # print("Validation error is {}".format(
+    #     mean_squared_error(df[target], df[target + " predicted"])))
+    # print(mp.digest())
+    # mp.save("somepipe.p")
+    #
+    # mp = MatPipe.load("somepipe.p")
+    # print(mp.predict(df2, target))
 
 #
 # mp = MatPipe()
diff --git a/mslearn/tests/test_pipeline.py b/mslearn/tests/test_pipeline.py
index 64aab3f2..fa4e9696 100644
--- a/mslearn/tests/test_pipeline.py
+++ b/mslearn/tests/test_pipeline.py
@@ -63,10 +63,9 @@ def test_user_features(self):
 
     def test_benchmarking(self):
         pipe = MatPipe(**debug_config)
-        df = self.df[500:700]
-        df_test = pipe.benchmark(df, self.target, test_spec=0.2)
-        self.assertTrue(df_test.shape[0] > 35)
-        self.assertTrue(df_test.shape[0] < 45)
+        df = self.df.iloc[500:700]
+        df_test = pipe.benchmark(df, self.target, test_spec=0.25)
+        self.assertEqual(df_test.shape[0], 50)
         true = df_test[self.target]
         test = df_test[self.target + " predicted"]
         self.assertTrue(r2_score(true, test) > 0.5)
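
Note on the new split logic: below is a minimal standalone sketch of the two test_spec modes, for reference only. The split_for_benchmark helper and the toy dataframe are hypothetical, not part of the mslearn API. The iterable branch in this sketch builds the training rows with np.setdiff1d rather than `~`, since `~` applied to an array of integer iloc indexes is a bitwise NOT rather than a boolean-mask inversion; the float branch reproduces the shuffle-and-cut behavior of np.split(df.sample(frac=1), [int(test_spec * len(df))]).

# Hypothetical sketch -- split_for_benchmark is not a mslearn function.
from collections.abc import Iterable

import numpy as np
import pandas as pd


def split_for_benchmark(df, test_spec=0.2):
    """Return (traindf, testdf) for a float fraction or an iterable
    of iloc indexes, mirroring the two branches patched above."""
    if isinstance(test_spec, Iterable):
        test_idx = np.asarray(test_spec)
        # Explicit complement: ~test_idx on integers is bitwise NOT,
        # so it cannot be used to select the remaining rows.
        train_idx = np.setdiff1d(np.arange(len(df)), test_idx)
        return df.iloc[train_idx], df.iloc[test_idx]
    else:
        # Shuffle once, then cut the first test_spec fraction off as
        # the test set; the remainder becomes the training set.
        shuffled = df.sample(frac=1)
        n_test = int(test_spec * len(df))
        return shuffled.iloc[n_test:], shuffled.iloc[:n_test]


df = pd.DataFrame({"feature": range(200), "target": range(200)})

traindf, testdf = split_for_benchmark(df, test_spec=0.25)
assert (traindf.shape[0], testdf.shape[0]) == (150, 50)

traindf, testdf = split_for_benchmark(df, test_spec=[0, 5, 10])
assert (traindf.shape[0], testdf.shape[0]) == (197, 3)

Cutting a single shuffled frame yields an exact test-set size, which is why test_benchmarking can now assert df_test.shape[0] == 50 on 200 rows instead of accepting a 35-45 window.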