Appending guarantees deterministic sort order. Rewrote and cleaned up Design Matrix tutorial.

Former-commit-id: 0fa3615
ejolly committed Apr 28, 2018
1 parent 1663463 commit 774a3a6
Showing 28 changed files with 571 additions and 281 deletions.
78 changes: 44 additions & 34 deletions docs/auto_examples/01_DataOperations/plot_design_matrix.ipynb

Large diffs are not rendered by default.

201 changes: 146 additions & 55 deletions docs/auto_examples/01_DataOperations/plot_design_matrix.py

Large diffs are not rendered by default.

@@ -1 +1 @@
-674c1c7e55385d8ebc2e109812f5c3b8
+d25dc9c6791965dca7df05760d6549bf
284 changes: 181 additions & 103 deletions docs/auto_examples/01_DataOperations/plot_design_matrix.rst

Large diffs are not rendered by default.

Binary file not shown.
Binary file modified docs/auto_examples/01_DataOperations/plot_mask_codeobj.pickle
Binary file not shown.
Binary file not shown.
Binary file modified docs/auto_examples/02_Analysis/plot_decomposition_codeobj.pickle
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified docs/auto_examples/auto_examples_jupyter.zip
Binary file not shown.
Binary file modified docs/auto_examples/auto_examples_python.zip
Binary file not shown.
4 changes: 2 additions & 2 deletions docs/auto_examples/index.rst
@@ -235,13 +235,13 @@ Neuroimaging Analysis Examples
     .. container:: sphx-glr-download

-        :download:`Download all examples in Python source code: auto_examples_python.zip <//Users/lukechang/Github/nltools/docs/auto_examples/auto_examples_python.zip>`
+        :download:`Download all examples in Python source code: auto_examples_python.zip <//Users/Esh/Documents/Python/Cosan/nltools/docs/auto_examples/auto_examples_python.zip>`

     .. container:: sphx-glr-download

-        :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip <//Users/lukechang/Github/nltools/docs/auto_examples/auto_examples_jupyter.zip>`
+        :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip <//Users/Esh/Documents/Python/Cosan/nltools/docs/auto_examples/auto_examples_jupyter.zip>`

     .. only:: html
201 changes: 146 additions & 55 deletions examples/01_DataOperations/plot_design_matrix.py

Large diffs are not rendered by default.

57 changes: 39 additions & 18 deletions nltools/data/design_matrix.py
@@ -105,6 +105,14 @@ def _inherit_attributes(self,
             setattr(dm_out, item, getattr(self,item))
         return dm_out

+    def _sort_cols(self):
+        """
+        Helper function that ensures the columns of a Design_Matrix are sorted deterministically: a) columns not separated during append operations, b) columns separated during append operations, c) polynomials. Called primarily during vertical concatenation and cleaning.
+        """
+        data_cols = [elem for elem in self.columns if not elem.split('_')[0].isdigit() and elem not in self.polys]
+        separated_cols = [elem for elem in self.columns if elem.split('_')[0].isdigit() and elem not in self.polys]
+        return self[data_cols + separated_cols + self.polys]
+
     def details(self):
         """Print class meta data.
@@ -144,7 +152,7 @@ def append(self, dm, axis=0, keep_separate = True, unique_cols = [], fill_na=0,
         if not all([isinstance(elem,self.__class__) for elem in to_append]):
             raise TypeError("Each object to be appended must be a Design_Matrix!")
         if not all([elem.sampling_freq == self.sampling_freq for elem in to_append]):
-            raise ValueError("All Design Matrices must have the same sampling rate!")
+            raise ValueError("All Design Matrices must have the same sampling frequency!")

         if axis == 1:
             if any([not set(self.columns).isdisjoint(elem.columns) for elem in to_append]):
@@ -190,6 +198,7 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
         modify_to_append = []
         all_polys = []
         cols_to_separate = []
+        all_separated = []

         if len(unique_cols):
             if not keep_separate:
@@ -216,7 +225,10 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
                         count = c.split('_')[0]
                         unique_count.append(int(count))
                     else:
-                        to_rename[c] = '0_' + c
+                        new_name = '0_' + c
+                        all_separated.append(new_name)
+                        to_rename[c] = new_name
+                        all_separated.append(new_name)
                 cols_to_separate.append(searchstr)

             if to_rename:
@@ -256,10 +268,12 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
                             count = int(c.split('_')[0])
                             name = '_'.join(c.split('_')[1:])
                             count += max_unique_count + 1
-                            to_rename[c] = str(count) + '_' + name
+                            new_name = str(count) + '_' + name
+                            to_rename[c] = new_name
                         else:
-                            to_rename[c] = str(max_unique_count + 1) + '_' + c
-
+                            new_name = str(max_unique_count + 1) + '_' + c
+                            to_rename[c] = new_name
+                            all_separated.append(new_name)
                     modify_to_append.append(dm.rename(columns=to_rename))
                     max_unique_count += 1
                 else:
@@ -282,9 +296,12 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
                             count = int(c.split('_')[0])
                             name = '_'.join(c.split('_')[1:])
                             count += max_unique_count + 1
-                            to_rename[c] = str(count) + '_' + name
+                            new_name = str(count) + '_' + name
+                            to_rename[c] = new_name
                         else:
-                            to_rename[c] = str(max_unique_count + 1) + '_' + c
+                            new_name = str(max_unique_count + 1) + '_' + c
+                            to_rename[c] = new_name
+                            all_separated.append(new_name)
                     modify_to_append.append(dm.rename(columns=to_rename))
                     max_unique_count += 1
                 else:
@@ -339,7 +356,6 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
                     current_poly_max += 1
                     all_polys += list(to_rename.values())
-

             # Handle renaming additional unique cols to keep separate
             if cols_to_separate:
                 if verbose:
@@ -353,9 +369,12 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
                         count = int(c.split('_')[0])
                         name = '_'.join(c.split('_')[1:])
                         count += max_unique_count + 1
-                        to_rename[c] = str(count) + '_' + name
+                        new_name = str(count) + '_' + name
+                        to_rename[c] = new_name
                     else:
-                        to_rename[c] = str(max_unique_count + 1) + '_' + c
+                        new_name = str(max_unique_count + 1) + '_' + c
+                        to_rename[c] = new_name
+                        all_separated.append(new_name)

                 # Combine renamed polynomials and renamed unique_cols
                 modify_to_append.append(temp_dm.rename(columns=to_rename))
@@ -382,10 +401,12 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
                         count = int(c.split('_')[0])
                         name = '_'.join(c.split('_')[1:])
                         count += max_unique_count + 1
-                        to_rename[c] = str(count) + '_' + name
+                        new_name = str(count) + '_' + name
+                        to_rename[c] = new_name
                     else:
-                        to_rename[c] = str(max_unique_count + 1) + '_' + c
-
+                        new_name = str(max_unique_count + 1) + '_' + c
+                        to_rename[c] = new_name
+                        all_separated.append(new_name)
                     modify_to_append.append(dm.rename(to_rename))
                     max_unique_count += 1
                 else:
@@ -403,10 +424,8 @@ def _vertcat(self, df, keep_separate, unique_cols, fill_na, verbose):
         out.convolved = self.convolved
         out.multi = True
         out.polys = all_polys
-        data_cols = [elem for elem in out.columns if elem not in out.polys]
-        out = out[data_cols + out.polys]

-        return out
+        return out._sort_cols()

     def vif(self,exclude_polys=True):
         """Compute variance inflation factor amongst columns of design matrix,
@@ -472,7 +491,7 @@ def convolve(self, conv_func='hrf', columns=None):
             assert len(conv_func.shape) <= 2, "2d conv_func must be formatted as samples X kernels!"
         elif isinstance(conv_func, six.string_types):
             assert conv_func == 'hrf', "Did you mean 'hrf'? 'hrf' can generate a kernel for you, otherwise custom kernels should be passed in as 1d or 2d arrays."
-            conv_func = glover_hrf(1. / self.sampling_freq, oversampling=1)
+            conv_func = glover_hrf(1. / self.sampling_freq, oversampling=1.)

         else:
             raise TypeError("conv_func must be a 1d or 2d numpy array organized as samples x kernels, or the string 'hrf' for the canonical glover hrf")
@@ -611,7 +630,7 @@ def add_dct_basis(self,duration=180,drop=0):
         if any([elem.count('_') == 2 and 'cosine' in elem for elem in self.polys]):
             raise AmbiguityError("It appears that this Design Matrix contains cosine bases that were kept separate from a previous append operation. This makes it ambiguous when adding polynomial terms. Try calling .add_dct_basis() on each separate Design Matrix before appending them instead.")

-        basis_mat = make_cosine_basis(self.shape[0],self.sampling_freq,duration,drop=drop)
+        basis_mat = make_cosine_basis(self.shape[0],1./self.sampling_freq,duration,drop=drop)

         basis_frame = Design_Matrix(basis_mat,
                                     sampling_freq=self.sampling_freq,columns = [str(elem) for elem in range(basis_mat.shape[1])])
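
The fix matters because make_cosine_basis (see nltools/stats.py below) effectively expects the sampling period, not the frequency. A usage sketch under that assumption:

    import numpy as np
    from nltools.data import Design_Matrix

    # 100 samples at TR = 2 s gives a 200 s run; 180 s is the high-pass cutoff
    dm = Design_Matrix(np.ones((100, 1)), sampling_freq=.5, columns=['stim'])
    dm = dm.add_dct_basis(duration=180)  # drift regressors land in dm.polys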
@@ -688,6 +707,8 @@ def clean(self,fill_na=0,exclude_polys=False,thresh=.95,verbose=True):
                         remove.append(j)
         if remove:
             out = out.drop(remove, axis=1)
+            out.polys = [elem for elem in out.polys if elem not in remove]
+            out = out._sort_cols()
         else:
             print("Dropping columns not needed...skipping")
         np.seterr(**old_settings)
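
Pruning dropped columns from .polys keeps later _sort_cols and vif(exclude_polys=True) calls consistent. A sketch of the thresholding behavior (assumes the default .95 correlation threshold in the signature above):

    import numpy as np
    from nltools.data import Design_Matrix

    dm = Design_Matrix(np.random.randn(20, 2), sampling_freq=.5, columns=['a', 'b'])
    dm['b'] = dm['a']               # perfectly correlated duplicate column
    cleaned = dm.clean(thresh=.95)  # drops 'b', keeps 'a', then re-sorts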
2 changes: 1 addition & 1 deletion nltools/file_reader.py
@@ -75,7 +75,7 @@ def onsets_to_dm(F, sampling_freq, run_length, header='infer', sort=False, keep_
     df['Onset'] = df['Onset'].apply(lambda x: int(np.floor(x/TR)))

     # Build dummy codes
-    X = Design_Matrix(np.zeros([run_length,len(df['Stim'].unique())]),columns=df['Stim'].unique(),sampling_rate=TR)
+    X = Design_Matrix(np.zeros([run_length,len(df['Stim'].unique())]),columns=df['Stim'].unique(),sampling_freq=sampling_freq)
     for i, row in df.iterrows():
         if df.shape[1] == 3:
             dur = np.ceil(row['Duration']/TR)
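
The file reader now forwards the frequency it was called with instead of the removed sampling_rate=TR keyword. A hypothetical call (the file name and column layout are invented; a 2 s TR corresponds to sampling_freq = .5):

    from nltools.file_reader import onsets_to_dm

    # assumes 'onsets.csv' has 'Stim' and 'Onset' (and optionally 'Duration') columns
    dm = onsets_to_dm('onsets.csv', sampling_freq=.5, run_length=100)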
9 changes: 5 additions & 4 deletions nltools/stats.py
@@ -563,14 +563,15 @@ def make_cosine_basis(nsamples, sampling_freq, filter_length, unit_scale=True, d
     # Drop intercept ala SPM
     C = C[:,1:]

+    if C.size == 0:
+        raise ValueError('Basis function creation failed! nsamples is too small for requested filter_length.')
+
     if unit_scale:
         C *= 1. / C[0,0]

     C = C[:, drop:]
-    if C.size == 0:
-        raise ValueError('Basis function creation failed! nsamples is too small for requested filter_length.')
-    else:
-        return C
+
+    return C

 def transform_pairwise(X, y):
     '''Transforms data into pairs with balanced labels for ranking
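
Moving the emptiness check ahead of unit scaling fails fast with a clear message instead of an IndexError from C[0,0] on a zero-column basis. A sketch of both paths (argument values are illustrative; the second argument behaves as the sampling period, which is why add_dct_basis above now passes 1./sampling_freq):

    from nltools.stats import make_cosine_basis

    basis = make_cosine_basis(128, 2., 180)  # 128 TRs at 2 s with a 180 s cutoff
    print(basis.shape)                       # (128, n_basis_functions)

    try:
        make_cosine_basis(5, 2., 180)        # run far too short for the cutoff
    except ValueError as e:
        print(e)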
14 changes: 6 additions & 8 deletions nltools/tests/test_data.py
@@ -617,13 +617,13 @@ def test_designmat(tmpdir):
     assert mat.add_poly(2,include_lower=False).shape[1] == 5

     matpd = matp.add_dct_basis()
-    assert matpd.shape[1] == 9
+    assert matpd.shape[1] == 18

     assert all(matpd.vif() < 2.0)
     assert not all(matpd.vif(exclude_polys=False) < 2.0)

     matc = matpd.clean()
-    assert matc.shape[1] == 7
+    assert matc.shape[1] == 16

     # Standard convolve
     assert matpd.convolve().shape == matpd.shape
@@ -653,18 +682,17 @@ def test_designmat(tmpdir):
     # Otherwise stack them
     assert matpd.append(matpd,keep_separate=False).shape[1] == matpd.shape[1]
     # Keep a single stimulus column separate
-    assert matpd.append(matpd,unique_cols=['face_A']).shape[1] == 15
+    assert matpd.append(matpd,unique_cols=['face_A']).shape[1] == 33
     # Keep a common stimulus class separate
-    assert matpd.append(matpd,unique_cols=['face*']).shape[1] == 16
+    assert matpd.append(matpd,unique_cols=['face*']).shape[1] == 34
     # Keep a common stimulus class and a different single stim separate
-    assert matpd.append(matpd,unique_cols=['face*','house_A']).shape[1] == 17
+    assert matpd.append(matpd,unique_cols=['face*','house_A']).shape[1] == 35
     # Keep multiple stimulus classes separate
-    assert matpd.append(matpd,unique_cols=['face*','house*']).shape[1] == 18
+    assert matpd.append(matpd,unique_cols=['face*','house*']).shape[1] == 36

     # Growing a multi-run design matrix; keeping things separate
     num_runs = 4
     all_runs = Design_Matrix(sampling_freq=.5)
-    run_list = []
     for i in range(num_runs):
         run = Design_Matrix(np.array([
                                      [1,0,0,0],
@@ -683,7 +682,6 @@ def test_designmat(tmpdir):
                             columns=['stim_A','stim_B','cond_C','cond_D']
                             )
         run = run.add_poly(2)
-        run_list.append(run)
         all_runs = all_runs.append(run,unique_cols=['stim*','cond*'])

     assert all_runs.shape == (44, 28)
