diff --git a/mslearn/pipeline.py b/mslearn/pipeline.py
index 7c3b809b..b01bdb17 100644
--- a/mslearn/pipeline.py
+++ b/mslearn/pipeline.py
@@ -203,8 +203,8 @@ def benchmark(self, df, target, test_spec=0.2):
                 If the test spec is a float, it specifies the fraction of the
                 dataframe to be randomly selected for testing (must be a
                 number between 0-1). test_spec=0 means a CV-only validation.
-                If test_spec is a list/ndarray, it is the indexes of the
-                dataframe to use for - this option is useful if you
+                If test_spec is a list/ndarray, it is the iloc indexes of the
+                dataframe to use for testing. This option is useful if you
                 are comparing multiple techniques and want to use the same
                 test or validation fraction across benchmarks.
 
@@ -224,11 +224,11 @@ def benchmark(self, df, target, test_spec=0.2):
         # Split data for steps where combined transform could otherwise over-fit
         # or leak data from validation set into training set.
         if isinstance(test_spec, Iterable):
-            msk = test_spec
+            traindf = df.iloc[~np.asarray(test_spec)]
+            testdf = df.iloc[np.asarray(test_spec)]
         else:
-            msk = np.random.rand(len(df)) < test_spec
-        traindf = df.iloc[~np.asarray(msk)]
-        testdf = df.iloc[msk]
+            testdf, traindf = np.split(df.sample(frac=1),
+                                       [int(test_spec * len(df))])
         self.logger.info("Dataframe split into training and testing fractions"
                          " having {} and {} samples.".format(traindf.shape[0],
                                                              testdf.shape[0]))
@@ -354,15 +354,17 @@ def MatPipeFast(**kwargs):
 #     print("Validation error is {}".format(mean_squared_error(df[target], df[target + " predicted"])))
 
     mp = MatPipe(**debug_config)
-    df = mp.benchmark(df, target, test_spec=validation_ix)
-    print(df)
-    print("Validation error is {}".format(
-        mean_squared_error(df[target], df[target + " predicted"])))
-    print(mp.digest())
-    mp.save("somepipe.p")
-
-    mp = MatPipe.load("somepipe.p")
-    print(mp.predict(df2, target))
+    # df = mp.benchmark(df, target, test_spec=validation_ix)
+
+    df = mp.benchmark(df, target, test_spec=0.25)
+    # print(df)
+    # print("Validation error is {}".format(
+    #     mean_squared_error(df[target], df[target + " predicted"])))
+    # print(mp.digest())
+    # mp.save("somepipe.p")
+    #
+    # mp = MatPipe.load("somepipe.p")
+    # print(mp.predict(df2, target))
 
 #
 # mp = MatPipe()
diff --git a/mslearn/tests/test_pipeline.py b/mslearn/tests/test_pipeline.py
index 64aab3f2..fa4e9696 100644
--- a/mslearn/tests/test_pipeline.py
+++ b/mslearn/tests/test_pipeline.py
@@ -63,10 +63,9 @@ def test_user_features(self):
 
     def test_benchmarking(self):
         pipe = MatPipe(**debug_config)
-        df = self.df[500:700]
-        df_test = pipe.benchmark(df, self.target, test_spec=0.2)
-        self.assertTrue(df_test.shape[0] > 35)
-        self.assertTrue(df_test.shape[0] < 45)
+        df = self.df.iloc[500:700]
+        df_test = pipe.benchmark(df, self.target, test_spec=0.25)
+        self.assertEqual(df_test.shape[0], 50)
         true = df_test[self.target]
         test = df_test[self.target + " predicted"]
         self.assertTrue(r2_score(true, test) > 0.5)
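
Note on the new split logic: below is a minimal standalone sketch of the two test_spec modes, for reference only. The split_for_benchmark helper and the toy dataframe are hypothetical, not part of the mslearn API. The iterable branch in this sketch builds the training rows with np.setdiff1d rather than `~`, since `~` applied to an array of integer iloc indexes is a bitwise NOT rather than a boolean-mask inversion; the float branch reproduces the shuffle-and-cut behavior of np.split(df.sample(frac=1), [int(test_spec * len(df))]).

# Hypothetical sketch -- split_for_benchmark is not a mslearn function.
from collections.abc import Iterable

import numpy as np
import pandas as pd


def split_for_benchmark(df, test_spec=0.2):
    """Return (traindf, testdf) for a float fraction or an iterable
    of iloc indexes, mirroring the two branches patched above."""
    if isinstance(test_spec, Iterable):
        test_idx = np.asarray(test_spec)
        # Explicit complement: ~test_idx on integers is bitwise NOT,
        # so it cannot be used to select the remaining rows.
        train_idx = np.setdiff1d(np.arange(len(df)), test_idx)
        return df.iloc[train_idx], df.iloc[test_idx]
    else:
        # Shuffle once, then cut the first test_spec fraction off as
        # the test set; the remainder becomes the training set.
        shuffled = df.sample(frac=1)
        n_test = int(test_spec * len(df))
        return shuffled.iloc[n_test:], shuffled.iloc[:n_test]


df = pd.DataFrame({"feature": range(200), "target": range(200)})

traindf, testdf = split_for_benchmark(df, test_spec=0.25)
assert (traindf.shape[0], testdf.shape[0]) == (150, 50)

traindf, testdf = split_for_benchmark(df, test_spec=[0, 5, 10])
assert (traindf.shape[0], testdf.shape[0]) == (197, 3)

Cutting a single shuffled frame yields an exact test-set size, which is why test_benchmarking can now assert df_test.shape[0] == 50 on 200 rows instead of accepting a 35-45 window.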