Skip to content

Commit

Permalink
Merge pull request #129 from ardunn/master
Browse files Browse the repository at this point in the history
update benchmark on matpipe and update test
  • Loading branch information
ardunn authored Nov 17, 2018
2 parents b2efaaf + b7269ee commit a71bf3d
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 19 deletions.
32 changes: 17 additions & 15 deletions mslearn/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ def benchmark(self, df, target, test_spec=0.2):
If the test spec is a float, it specifies the fraction of the
dataframe to be randomly selected for testing (must be a
number between 0-1). test_spec=0 means a CV-only validation.
- If test_spec is a list/ndarray, it is the indexes of the
- dataframe to use for - this option is useful if you
+ If test_spec is a list/ndarray, it is the iloc indexes of the
+ dataframe to use for testing. This option is useful if you
are comparing multiple techniques and want to use the same
test or validation fraction across benchmarks.
Expand All @@ -224,11 +224,11 @@ def benchmark(self, df, target, test_spec=0.2):
# Split data for steps where combined transform could otherwise over-fit
# or leak data from validation set into training set.
if isinstance(test_spec, Iterable):
- msk = test_spec
+ traindf = df.iloc[~np.asarray(test_spec)]
+ testdf = df.iloc[np.asarray(test_spec)]
else:
- msk = np.random.rand(len(df)) < test_spec
- traindf = df.iloc[~np.asarray(msk)]
- testdf = df.iloc[msk]
+ testdf, traindf = np.split(df.sample(frac=1),
+ [int(test_spec * len(df))])
self.logger.info("Dataframe split into training and testing fractions"
" having {} and {} samples.".format(traindf.shape[0],
testdf.shape[0]))
Expand Down Expand Up @@ -354,15 +354,17 @@ def MatPipeFast(**kwargs):
# print("Validation error is {}".format(mean_squared_error(df[target], df[target + " predicted"])))

mp = MatPipe(**debug_config)
- df = mp.benchmark(df, target, test_spec=validation_ix)
- print(df)
- print("Validation error is {}".format(
- mean_squared_error(df[target], df[target + " predicted"])))
- print(mp.digest())
- mp.save("somepipe.p")
-
- mp = MatPipe.load("somepipe.p")
- print(mp.predict(df2, target))
+ # df = mp.benchmark(df, target, test_spec=validation_ix)
+
+ df = mp.benchmark(df, target, test_spec=0.25)
+ # print(df)
+ # print("Validation error is {}".format(
+ # mean_squared_error(df[target], df[target + " predicted"])))
+ # print(mp.digest())
+ # mp.save("somepipe.p")
+ #
+ # mp = MatPipe.load("somepipe.p")
+ # print(mp.predict(df2, target))

#
# mp = MatPipe()
Expand Down
7 changes: 3 additions & 4 deletions mslearn/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,9 @@ def test_user_features(self):

def test_benchmarking(self):
pipe = MatPipe(**debug_config)
- df = self.df[500:700]
- df_test = pipe.benchmark(df, self.target, test_spec=0.2)
- self.assertTrue(df_test.shape[0] > 35)
- self.assertTrue(df_test.shape[0] < 45)
+ df = self.df.iloc[500:700]
+ df_test = pipe.benchmark(df, self.target, test_spec=0.25)
+ self.assertEqual(df_test.shape[0], 50)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue(r2_score(true, test) > 0.5)
Expand Down

0 comments on commit a71bf3d

Please sign in to comment.