From b85eb6e6816248b6ea058e34290864e780fa84bf Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Fri, 29 Nov 2024 08:51:14 -0500 Subject: [PATCH 1/8] Compare populations: already good! --- .../scil_connectivity_compare_populations.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/scil_connectivity_compare_populations.py b/scripts/scil_connectivity_compare_populations.py index 74a9bcced..5f12b8151 100755 --- a/scripts/scil_connectivity_compare_populations.py +++ b/scripts/scil_connectivity_compare_populations.py @@ -2,21 +2,31 @@ # -*- coding: utf-8 -*- """ -Performs a network-based statistical comparison for populations g1 and g2. The -output is a matrix of the same size as the input connectivity matrices, with -p-values at each edge. -All input matrices must have the same shape (NxN). For paired t-test, both -groups must have the same number of observations. +Performs a statistical comparison between connectivity matrices for populations +g1 and g2, using a t-test. + +The inputs are any connectivity matrices, which can be obtained with +scil_connectivity_compute_matrices.py, used separately on the two populations. +All input matrices must have the same shape (NxN). + +The output is a matrix of the same size as the input connectivity matrices, +with p-values at each connection (edge). For example, if you have streamline count weighted matrices for a MCI and a -control group and you want to investiguate differences in their connectomes: +control group, and you want to investigate differences in their connectomes: >>> scil_connectivity_compare_populations.py pval.npy --g1 MCI/*_sc.npy --g2 CTL/*_sc.npy +Options: + --filtering_mask will simply multiply the binary mask to all input matrices before performing the statistical comparison. Reduces the number of statistical tests, useful when using --fdr or --bonferroni. +--paired with use a paired t-test. Then both groups must have the same number +of observations (subjects). They must be listed in the right order using --g1 +and --g2. + Formerly: scil_compare_connectivity.py """ @@ -53,9 +63,11 @@ def _build_arg_parser(): help='Output matrix (.npy) containing the edges p-value.') p.add_argument('--in_g1', nargs='+', required=True, - help='List of matrices for the first population (.npy).') + help='List of matrices for each subject in the first ' + 'population (.npy).\n') p.add_argument('--in_g2', nargs='+', required=True, - help='List of matrices for the second population (.npy).') + help='List of matrices for each subject in the second ' + 'population (.npy).') p.add_argument('--tail', choices=['left', 'right', 'both'], default='both', help='Enables specification of an alternative hypothesis:\n' 'left: mean of g1 < mean of g2,\n' @@ -94,8 +106,7 @@ def main(): args = parser.parse_args() logging.getLogger().setLevel(logging.getLevelName(args.verbose)) - assert_inputs_exist(parser, args.in_g1+args.in_g2, - args.filtering_mask) + assert_inputs_exist(parser, args.in_g1+args.in_g2, args.filtering_mask) assert_outputs_exist(parser, args, args.out_pval_matrix) if args.filtering_mask: From b15babf4c198582a51519898d6b6bdd2dab09f4e Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Fri, 29 Nov 2024 09:04:59 -0500 Subject: [PATCH 2/8] Normalize: already good!
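For context: the edge-wise normalization options described in the script's
docstring reduce to element-wise matrix operations. A minimal sketch of the
--length option (illustrative only; the file names are hypothetical):

    import numpy as np

    sc = np.load('sub-01_sc.npy')       # streamline-count matrix
    length = np.load('sub-01_len.npy')  # average bundle-length matrix

    # --length: multiply each edge by the average bundle length, compensating
    # for far-away connections when using interface seeding.
    normalized = sc * length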
--- scripts/scil_connectivity_normalize.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/scil_connectivity_normalize.py b/scripts/scil_connectivity_normalize.py index 8a3fd8a06..c01c08945 100755 --- a/scripts/scil_connectivity_normalize.py +++ b/scripts/scil_connectivity_normalize.py @@ -4,7 +4,9 @@ """ Normalize a connectivity matrix coming from scil_tractogram_segment_connections_from_labels.py. -3 categories of normalization are available: + +3 categories of normalization are available, with option for each. You may +choose any number of non-mutually exclusive options: -- Edge attributes - length: Multiply each edge by the average bundle length. Compensate for far away connections when using interface seeding. @@ -31,10 +33,10 @@ - sum_to_one: Ensure the sum of all edges weight is one - log_10: Apply a base 10 logarithm to all edges weight -The volume and length matrix should come from the +The volume and length matrices should come from the scil_tractogram_segment_connections_from_labels.py script. -A review of the type of normalization is available in: +A review of the types of normalization is available in: Colon-Perez, Luis M., et al. "Dimensionless, scale-invariant, edge weight metric for the study of complex structural networks." PLOS one 10.7 (2015). @@ -83,13 +85,14 @@ def _build_arg_parser(): help='Volume matrix used for edge-wise division.') vol = edge_p.add_mutually_exclusive_group() - # toDo. Same description for the two options vol.add_argument('--parcel_volume', nargs=2, metavar=('ATLAS', 'LABELS_LIST'), - help='Atlas and labels list for edge-wise division.') + help='Atlas and labels list for edge-wise division by \n' + 'the sum of the node volume.') vol.add_argument('--parcel_surface', nargs=2, metavar=('ATLAS', 'LABELS_LIST'), - help='Atlas and labels list for edge-wise division.') + help='Atlas and labels list for edge-wise division by \n' + 'the sum of the node surface.') scaling_p = p.add_argument_group('Scaling options') scale = scaling_p.add_mutually_exclusive_group() From 782e678e474237f3fe5bf2885c7767a21c25755c Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Fri, 29 Nov 2024 09:27:45 -0500 Subject: [PATCH 3/8] reorder_rois: small cleaning of redundancy --- scripts/scil_connectivity_reorder_rois.py | 55 ++++++++++++----------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/scripts/scil_connectivity_reorder_rois.py b/scripts/scil_connectivity_reorder_rois.py index 9ed78ab5d..9d1d78339 100755 --- a/scripts/scil_connectivity_reorder_rois.py +++ b/scripts/scil_connectivity_reorder_rois.py @@ -2,20 +2,23 @@ # -*- coding: utf-8 -*- """ -Re-order one or many connectivity matrices using a text file format. -The first row are the (x) and the second row the (y), must be space separated. -The resulting matrix does not have to be square (support unequal number of -x and y). +Re-order one or many connectivity matrices' rows and columns. The connectivity +matrices can come, for instance, from scil_connectivity_compute_matrices.py + +To state the new order, use a text file with the following formatting: +The first row is the new x order and the second row the new y order, using +integer values separated by a space. The resulting matrix does not have to be +square (supports unequal number of x and y). The values refer to the coordinates (starting at 0) in the matrix, but if the --labels_list parameter is used, the values will refer to the label which will be converted to the appropriate coordinates.
This file must be the same as the one provided to the scil_tractogram_segment_connections_from_labels.py. -To subsequently use scil_visualize_connectivity.py with a lookup table, you +To subsequently use scil_viz_connectivity.py with a lookup table, you must use a label-based reordering json and use --labels_list. -You can also use the Optimal Leaf Ordering(OLO) algorithm to transform a +You can also use the Optimal Leaf Ordering (OLO) algorithm to transform a sparse matrix into an ordering that reduces the matrix bandwidth. The output file can then be re-used with --in_ordering. Only one input can be used with this option, we recommend an average streamline count or volume matrix. @@ -61,9 +64,12 @@ def _build_arg_parser(): 'structures along the diagonal.') p.add_argument('--out_suffix', - help='Suffix for the output matrix filename.') + help="Suffix for the output matrices filenames. It will " + "be appended to each input matrix's name.") p.add_argument('--out_dir', - help='Output directory for the re-ordered matrices.') + help='Output directory for the re-ordered matrices.\n' + 'If not set, each output matrix will be saved in ' + 'the same \ndirectory as the input matrix.') p.add_argument('--labels_list', help='List saved by the decomposition script,\n' '--in_ordering must contain labels rather than ' @@ -77,10 +83,15 @@ def parse_ordering(in_ordering_file, labels_list=None): """ - toDo. Docstring please. + Read the ordering file, which should contain two lines, with integer + values for each. """ + # Cannot load with numpy in case of non-squared matrix (unequal x/y) with open(in_ordering_file, 'r') as my_file: lines = my_file.readlines() + if len(lines) != 2: + raise ValueError("The ordering file should contain exactly two " + "lines of text.") ordering = [[int(val) for val in lines[0].split()], [int(val) for val in lines[1].split()]] if labels_list: @@ -103,8 +114,12 @@ def main(): assert_inputs_exist(parser, args.in_matrices, [args.labels_list, args.in_ordering]) assert_output_dirs_exist_and_empty(parser, args, [], args.out_dir) + # Verification of output matrix names will be done below.
if args.optimal_leaf_ordering is not None: + if args.out_suffix or args.out_dir: + logging.warning("Options --out_suffix and --out_dir are ignored " + "with option --optimal_leaf_ordering.") if len(args.in_matrices) > 1: parser.error('Only one input is supported with RCM.') assert_outputs_exist(parser, args, args.optimal_leaf_ordering) @@ -125,29 +140,17 @@ def main(): basename, ext = os.path.splitext(filename) basename = os.path.basename(basename) - curr_filename = os.path.join(out_dir, - '{}{}.{}'.format(basename, - args.out_suffix, - ext[1:])) + curr_filename = os.path.join( + out_dir, '{}{}.{}'.format(basename, args.out_suffix, ext[1:])) out_filenames.append(curr_filename) assert_outputs_exist(parser, args, out_filenames) - # Cannot load with numpy in case of non-squared matrix (unequal x/y) ordering = parse_ordering(args.in_ordering, args.labels_list) - for filename in args.in_matrices: - out_dir = os.path.dirname(filename) if args.out_dir is None \ - else args.out_dir - basename, ext = os.path.splitext(filename) - basename = os.path.basename(basename) - matrix = load_matrix_in_any_format(filename) + for in_name, out_name in zip(args.in_matrices, out_filenames): + matrix = load_matrix_in_any_format(in_name) reordered_matrix = apply_reordering(matrix, ordering) - curr_filename = os.path.join(out_dir, - '{}{}.{}'.format(basename, - args.out_suffix, - ext[1:])) - - save_matrix_in_any_format(curr_filename, reordered_matrix) + save_matrix_in_any_format(out_name, reordered_matrix) if __name__ == "__main__": From 8a87d04c12a1225efb5b024292f92e8bda0e34e3 Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Fri, 29 Nov 2024 11:59:56 -0500 Subject: [PATCH 4/8] Compute_pca: big refactor --- scripts/scil_connectivity_compute_matrices.py | 11 + scripts/scil_connectivity_compute_pca.py | 340 +++++++++--------- 2 files changed, 171 insertions(+), 180 deletions(-) diff --git a/scripts/scil_connectivity_compute_matrices.py b/scripts/scil_connectivity_compute_matrices.py index f28a1730b..bbd66644a 100755 --- a/scripts/scil_connectivity_compute_matrices.py +++ b/scripts/scil_connectivity_compute_matrices.py @@ -33,6 +33,8 @@ - Streamline count. - Length: mean streamline length (mm). + Note that this matrix, as well as the volume-weighted, can be used to + normalize a streamline count matrix in scil_connectivity_normalize. - Volume-weighted: Volume of the bundle. - Similarity: mean density. Uses pre-computed density maps, which can be obtained with @@ -54,6 +56,15 @@ - Mean DPS: Mean values in the data_per_streamline of each streamline in the bundles. +What next? +========== +See our other scripts to help you achieve your goals: + - Normalize a streamline-count matrix based on other matrices using + scil_connectivity_normalize. + - Compute a t-test between two groups of subjects using + scil_connectivity_compare_populations. + - See all our scripts starting with scil_connectivity_ for more ideas! + Formerly: scil_compute_connectivity.py """ diff --git a/scripts/scil_connectivity_compute_pca.py b/scripts/scil_connectivity_compute_pca.py index 8a19d7a5f..6ab12ed8e 100755 --- a/scripts/scil_connectivity_compute_pca.py +++ b/scripts/scil_connectivity_compute_pca.py @@ -2,23 +2,38 @@ # -*- coding: utf-8 -*- """ -Script to compute PCA analysis on diffusion metrics. Output returned is all -significant principal components (e.g. presenting eigenvalues > 1) in a -connectivity matrix format. This script can take into account all edges from -every subject in a population or only non-zero edges across all subjects. 
+Script to compute PCA analysis on a set of connectivity matrices. The output is +all significant principal components in a connectivity matrix format. +This script can take into account all edges from every subject in a population +or only non-zero edges across all subjects. + +Interpretation of resulting principal components can be done by evaluating the +loadings values for each metric. A value near 0 means that this metric doesn't +contribute to this specific component whereas high positive or negative values +mean a larger contribution. Components can then be labeled based on which +metric contributes the highest. For example, a principal component showing a +high loading for afd_fixel and near 0 loading for all other metrics can be +interpreted as axonal density (see Gagnon et al. 2022 for this specific example +or ref [3] for an introduction to PCA). The script can take directly as input a connectoflow output folder. Simply use -the --input_connectoflow flag. For other type of folder input, the script -expects a single folder containing all matrices for all subjects. -Example: +the --input_connectoflow flag. Else, the script expects a single folder +containing all matrices for all subjects. Those matrices can be obtained, for +instance, by scil_connectivity_compute_matrices.py. +Example: Default input [in_folder] |--- sub-01_ad.npy |--- sub-01_md.npy |--- sub-02_ad.npy |--- sub-02_md.npy |--- ... +Connectoflow input: + [in_folder] + [subj-01] + [Compute_Connectivity] + |--- ad.npy -The plots, tables and principal components matrices will be outputted in the +The plots, tables and principal components matrices will be saved in the designated folder from the argument. If you want to move back your principal components matrices in your connectoflow output, you can use a similar bash command for all principal components: for sub in `cat list_id.txt`; do cp out_folder/${sub}_PC1.npy connectoflow_output/$sub/Compute_Connectivity/ done -Interpretation of resulting principal components can be done by evaluating the -loadings values for each metrics. A value near 0 means that this metric doesn't -contribute to this specific component whereas high positive or negative values -mean a larger contribution. Components can then be labeled based on which -metric contributes the highest. For example, a principal component showing a -high loading for afd_fixel and near 0 loading for all other metrics can be -interpreted as axonal density (see Gagnon et al. 2022 for this specific example -or ref [3] for an introduction to PCA). - EXAMPLE USAGE: scil_connectivity_compute_pca.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt """ -# Import required libraries.
import argparse import logging +import os import matplotlib.pyplot as plt import numpy as np @@ -55,7 +61,8 @@ save_matrix_in_any_format, add_verbose_arg, add_overwrite_arg, - assert_output_dirs_exist_and_empty) + assert_output_dirs_exist_and_empty, + assert_inputs_dirs_exist, assert_inputs_exist) EPILOG = """ @@ -91,7 +98,7 @@ def _build_arg_parser(): 'followed by the .npy extension.') p.add_argument('--list_ids', required=True, metavar='FILE', help='Path to a .txt file containing a list of all ids.') - p.add_argument('--not_only_common', action='store_true', + p.add_argument('--all_edges', action='store_true', help='If true, will include all edges from all subjects ' 'and not only \ncommon edges (Not recommended)') p.add_argument('--input_connectoflow', action='store_true', @@ -104,41 +111,76 @@ def _build_arg_parser(): return p -def generate_pca_input(dictionary): - """ - Function to create PCA input from matrix. - :param dictionary: Dictionary with metrics as keys containing list of matrices sorted by ids. - :return: Numpy array. - """ - for metric in dictionary.keys(): - mat = np.rollaxis(np.array(dictionary[metric]), axis=1, start=3) - matrix_shape = mat.shape[1:3] - n_group = mat.shape[0] - mat = mat.reshape((np.prod(matrix_shape) * n_group, 1)) - dictionary[metric] = mat +def plot_eigenvalues(pca, principaldf, nb_metrics, out_file): + # Plot the eigenvalues. + logging.info('Plotting results...') + eigenvalues = pca.explained_variance_ + pos = list(range(1, nb_metrics + 1)) + plt.clf() + plt.cla() - return np.hstack([dictionary[i] for i in dictionary.keys()]) + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + bar_eig = ax.bar(pos, eigenvalues, align='center', + tick_label=principaldf.columns) + ax.set_xlabel('Principal Components', fontsize=10) + ax.set_ylabel('Eigenvalues', fontsize=10) + ax.set_title('Eigenvalues for each principal component.', fontsize=10) + ax.margins(0.1) + autolabel(bar_eig, ax) + plt.savefig(out_file) + return eigenvalues -def restore_zero_values(orig, new): - """ - Function to restore 0 values in a numpy array. - :param orig: Original numpy array containing 0 values to restore. - :param new: Array in which 0 values need to be restored. - :return: Numpy array with data from the new array but zeros from the original array. - """ - mask = np.copy(orig) - mask[mask != 0] = 1 +def plot_explained_variance(pca, principaldf, nb_metrics, out_file): + # Plot the explained variance. + explained_var = pca.explained_variance_ratio_ + pos = list(range(1, nb_metrics + 1)) + plt.clf() + plt.cla() + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + bar_var = ax.bar(pos, explained_var, align='center', + tick_label=principaldf.columns) + ax.set_xlabel('Principal Components', fontsize=10) + ax.set_ylabel('Explained variance', fontsize=10) + ax.set_title('Amount of explained variance for all principal components.', + fontsize=10) + ax.margins(0.1) + autolabel(bar_var, ax) + plt.savefig(out_file) - return np.multiply(new, mask) + +def plot_contribution(pca, principaldf, metrics, out_excel, out_image): + # Plot the contribution of each measure to the principal components.
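+    # Note (added for clarity): pca.components_ has shape
+    # (n_components, n_features); row i holds the loadings of each input
+    # metric on principal component i.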
+ component = pca.components_ + output_component = pd.DataFrame(component, index=principaldf.columns, + columns=metrics) + output_component.to_excel(out_excel, index=True, header=True) + plt.clf() + plt.cla() + fig, axs = plt.subplots(2) + fig.suptitle('Graph of the contribution of each measure to the first ' + 'and second principal components.', fontsize=10) + pos = list(range(1, len(metrics) + 1)) + bar_pc1 = axs[0].bar(pos, component[0], align='center', + tick_label=metrics) + bar_pc2 = axs[1].bar(pos, component[1], align='center', + tick_label=metrics) + axs[0].margins(0.2) + axs[1].margins(0.2) + autolabel(bar_pc1, axs[0]) + autolabel(bar_pc2, axs[1]) + for ax in axs.flat: + ax.set(xlabel='Diffusion measures', ylabel='Loadings') + for ax in axs.flat: + ax.label_outer() + plt.savefig(out_image) def autolabel(rects, axs): """ Attach a text label above each bar displaying its height (or bar value). - :param rects: Graphical object. - :param axs: Axe number. - :return: """ for rect in rects: height = rect.get_height() @@ -150,56 +192,19 @@ def autolabel(rects, axs): '%.3f' % float(height), ha='center', va='bottom') -def extracting_common_cnx(dictionary, ind): - """ - Function to create a binary mask representing common connections across the population - containing non-zero values. - :param dictionary: Dictionary with metrics as keys containing list of matrices sorted by ids. - :param ind: Indice of the key to use to generate the binary mask from the dictionary. - :return: Binary mask. - """ - keys = list(dictionary.keys()) - met = np.copy(dictionary[keys[ind]]) - - # Replacing all non-zero values by 1. - for i in range(0, len(met)): - met[i][met[i] != 0] = 1 - - # Adding all matrices together. - orig = np.copy(met[0]) - mask = [orig] - for i in range(1, len(met)): - orig = np.add(orig, met[i]) - mask.append(orig) - - # Converting resulting values to 0 and 1. - mask_f = mask[(len(met) - 1)] - mask_f[mask_f != len(met)] = 0 - mask_f[mask_f == len(met)] = 1 - - return mask_f - - -def apply_binary_mask(dictionary, mask): - """ - Function to apply a binary mask to all matrices contained in a dictionary. - :param dictionary: Dictionary with metrics as keys containing list of matrices sorted by ids. - :param mask: Binary mask. - :return: Dictionary with the same shape as input. - """ - for a in dictionary.keys(): - for i in range(0, len(dictionary[a])): - dictionary[a][i] = np.multiply(dictionary[a][i], mask) - - return dictionary - - def main(): parser = _build_arg_parser() args = parser.parse_args() logging.getLogger().setLevel(logging.getLevelName(args.verbose)) - assert_output_dirs_exist_and_empty(parser, args, args.out_folder, create_dir=True) + assert_inputs_dirs_exist(parser, args.in_folder) + assert_inputs_exist(parser, args.list_ids) + assert_output_dirs_exist_and_empty(parser, args, args.out_folder, + create_dir=True) + out_eigenvalues = os.path.join(args.out_folder, 'eigenvalues.pdf') + out_variance = os.path.join(args.out_folder, 'explained_variance.pdf') + out_excel = os.path.join(args.out_folder, 'loadings.xlsx') + out_contributions = os.path.join(args.out_folder, 'contribution.pdf') with open(args.list_ids) as f: subjects = f.read().split() if args.input_connectoflow: # Loading all matrix.
logging.info('Loading all matrices from a Connectoflow output...') - dictionary = {m: [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') - for a in subjects] - for m in args.metrics} + files_per_metric = [[os.path.join( + args.in_folder, a, 'Compute_Connectivity', '{}.npy'.format(m)) + for a in subjects] + for m in args.metrics] else: logging.info('Loading all matrices...') - dictionary = {m: [load_matrix_in_any_format(f'{args.in_folder}/{a}_{m}.npy') for a in subjects] - for m in args.metrics} - # Assert that all metrics have the same number of subjects. - nb_sub = [len(dictionary[m]) for m in args.metrics] - assert all(x == len(subjects) for x in nb_sub), "Error, the number of matrices for each metric doesn't match" \ "the number of subject in the id list." \ "Please validate input folder." + files_per_metric = [[os.path.join( + args.in_folder, '{}_{}.npy'.format(a, m)) + for a in subjects] + for m in args.metrics] + + assert_inputs_exist(parser, sum(files_per_metric, [])) + matrices_per_metric = [[load_matrix_in_any_format(f) for f in files] + for files in files_per_metric] # Setting individual matrix shape. - mat_shape = dictionary[args.metrics[0]][0].shape + mat_shape = matrices_per_metric[0][0].shape - if args.not_only_common: + if args.all_edges: # Creating input structure using all edges from all subjects. logging.info('Creating PCA input structure with all edges...') - df = generate_pca_input(dictionary) else: - m1 = extracting_common_cnx(dictionary, 0) - m2 = extracting_common_cnx(dictionary, 1) - - if m1.shape != mat_shape: - parser.error("Extracted binary mask doesn't match the shape of individual input matrix.") - - if np.sum(m1) != np.sum(m2): - parser.error("Different binary mask have been generated from 2 different metrics, \n " - "please validate input data.") - else: - logging.info('Data shows {} common connections across the population.'.format(np.sum(m1))) - - dictionary = apply_binary_mask(dictionary, m1) - - # Creating input structure. logging.info('Creating PCA input structure with common edges...') - df = generate_pca_input(dictionary) - # Setting 0 values to nan. - df[df == 0] = 'nan' - - # Standardized the data. + common_edges_mask = np.sum(matrices_per_metric[0]) != 0 + + # Verifying that other metrics have the same common edges + for i, matrices in enumerate(matrices_per_metric[1:]): + tmp_mask = np.sum(matrices) != 0 + if not np.array_equal(common_edges_mask, tmp_mask): + parser.error("Different binary masks (common edge) have been " + "generated from metric {} than other metrics. " + "Please validate your input data." + .format(args.metrics[i])) + + logging.info('Data shows {} common connections across the population.' + .format(np.sum(common_edges_mask))) + + # Apply mask + matrices_per_metric = [[m * common_edges_mask + for m in metric_matrices] + for metric_matrices in matrices_per_metric] + + # Creating input structure. + # For each metric, combining all subjects together, flattening all + # connectivity matrices. + flat_mats = [] + for metric_matrices in matrices_per_metric: + mat = np.rollaxis(np.array(metric_matrices), axis=1, start=3) + mat = mat.reshape((np.prod(mat.shape), 1)) + flat_mats.append(mat) + df = np.hstack(flat_mats) # Shape: (flattened data) x nb_metrics + df[df == 0] = np.nan # Setting 0 values to nan. + + # Standardizing the data. logging.info('Standardizing data...') x = StandardScaler().fit_transform(df) df_pca = x[~np.isnan(x).any(axis=1)] x = np.nan_to_num(x, nan=0.) - # Perform the PCA.
+ # Performing the PCA. logging.info('Performing PCA...') pca = PCA(n_components=len(args.metrics)) principalcomponents = pca.fit_transform(df_pca) - principaldf = pd.DataFrame(data=principalcomponents, columns=[f'PC{i}' for i in range(1, len(args.metrics) + 1)]) - - # Plot the eigenvalues. - logging.info('Plotting results...') - eigenvalues = pca.explained_variance_ - pos = list(range(1, len(args.metrics)+1)) - plt.clf() - plt.cla() - fig = plt.figure() - ax = fig.add_subplot(1, 1, 1) - bar_eig = ax.bar(pos, eigenvalues, align='center', tick_label=principaldf.columns) - ax.set_xlabel('Principal Components', fontsize=10) - ax.set_ylabel('Eigenvalues', fontsize=10) - ax.set_title('Eigenvalues for each principal components.', fontsize=10) - ax.margins(0.1) - autolabel(bar_eig, ax) - plt.savefig(f'{args.out_folder}/eigenvalues.pdf') - - # Plot the explained variance. - explained_var = pca.explained_variance_ratio_ - plt.clf() - plt.cla() - fig = plt.figure() - ax = fig.add_subplot(1, 1, 1) - bar_var = ax.bar(pos, explained_var, align='center', tick_label=principaldf.columns) - ax.set_xlabel('Principal Components', fontsize=10) - ax.set_ylabel('Explained variance', fontsize=10) - ax.set_title('Amount of explained variance for all principal components.', fontsize=10) - ax.margins(0.1) - autolabel(bar_var, ax) - plt.savefig(f'{args.out_folder}/explained_variance.pdf') - - # Plot the contribution of each measures to principal component. - component = pca.components_ - output_component = pd.DataFrame(component, index=principaldf.columns, columns=args.metrics) - output_component.to_excel(f'{args.out_folder}/loadings.xlsx', index=True, header=True) - plt.clf() - plt.cla() - fig, axs = plt.subplots(2) - fig.suptitle('Graph of the contribution of each measures to the first and second principal component.', fontsize=10) - bar_pc1 = axs[0].bar(pos, component[0], align='center', tick_label=args.metrics) - bar_pc2 = axs[1].bar(pos, component[1], align='center', tick_label=args.metrics) - axs[0].margins(0.2) - axs[1].margins(0.2) - autolabel(bar_pc1, axs[0]) - autolabel(bar_pc2, axs[1]) - for ax in axs.flat: - ax.set(xlabel='Diffusion measures', ylabel='Loadings') - for ax in axs.flat: - ax.label_outer() - plt.savefig(f'{args.out_folder}/contribution.pdf') + principaldf = pd.DataFrame( + data=principalcomponents, + columns=[f'PC{i}' for i in range(1, len(args.metrics) + 1)]) + + # Plotting + eigenvalues = plot_eigenvalues(pca, principaldf, + nb_metrics=len(args.metrics), + out_file=out_eigenvalues) + plot_explained_variance(pca, principaldf, nb_metrics=len(args.metrics), + out_file=out_variance) + plot_contribution(pca, principaldf, args.metrics, out_excel, + out_contributions) # Extract the derived newly computed measures from the PCA analysis. 
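+    # pca.transform projects the standardized data onto the principal axes;
+    # columns of the result are ordered by decreasing explained variance.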
logging.info('Saving matrices for PC with eigenvalues > 1...') out = pca.transform(x) - out = restore_zero_values(x, out) - out = out.swapaxes(0, 1).reshape(len(args.metrics), len(subjects), mat_shape[0], mat_shape[1]) + out = out * (x != 0) + out = out.swapaxes(0, 1).reshape(len(args.metrics), len(subjects), + mat_shape[0], mat_shape[1]) # Save matrix for significant components nb_pc = eigenvalues[eigenvalues >= 1] for i in range(0, len(nb_pc)): for s in range(0, len(subjects)): - save_matrix_in_any_format(f'{args.out_folder}/{subjects[s]}_PC{i+1}.npy', - out[i, s, :, :]) + filename = os.path.join(args.out_folder, + f'{subjects[s]}_PC{i+1}.npy') + save_matrix_in_any_format(filename, out[i, s, :, :]) if __name__ == "__main__": From cf57e30f495d240bfbf682332824a555ef3db9b7 Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Fri, 29 Nov 2024 12:03:32 -0500 Subject: [PATCH 5/8] typo --- scripts/scil_connectivity_compare_populations.py | 2 +- scripts/scil_connectivity_normalize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/scil_connectivity_compare_populations.py b/scripts/scil_connectivity_compare_populations.py index 5f12b8151..1a5b561ec 100755 --- a/scripts/scil_connectivity_compare_populations.py +++ b/scripts/scil_connectivity_compare_populations.py @@ -23,7 +23,7 @@ matrices before performing the statistical comparison. Reduces the number of statistical tests, useful when using --fdr or --bonferroni. ---paired with use a paired t-test. Then both groups must have the same number +--paired will use a paired t-test. Then both groups must have the same number of observations (subjects). They must be listed in the right order using --g1 and --g2. diff --git a/scripts/scil_connectivity_normalize.py b/scripts/scil_connectivity_normalize.py index c01c08945..c18ba9654 100755 --- a/scripts/scil_connectivity_normalize.py +++ b/scripts/scil_connectivity_normalize.py @@ -5,7 +5,7 @@ Normalize a connectivity matrix coming from scil_tractogram_segment_connections_from_labels.py. -3 categories of normalization are available, with option for each. You may +3 categories of normalization are available, with options for each. You may choose any number of non-mutually exclusive options: -- Edge attributes - length: Multiply each edge by the average bundle length. From f75886a989d3245273e45318b5ee7d3a0c780751 Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Thu, 12 Dec 2024 12:10:10 -0500 Subject: [PATCH 6/8] Fix common edges mask --- scripts/scil_connectivity_compute_pca.py | 32 ++++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/scripts/scil_connectivity_compute_pca.py b/scripts/scil_connectivity_compute_pca.py index 6ab12ed8e..4ae84ce71 100755 --- a/scripts/scil_connectivity_compute_pca.py +++ b/scripts/scil_connectivity_compute_pca.py @@ -104,6 +104,9 @@ def _build_arg_parser(): p.add_argument('--input_connectoflow', action='store_true', help='If true, script will assume the input folder is a ' 'Connectoflow output.') + p.add_argument('--show', action='store_true', + help="If set, show matplotlib figures. 
Else, they are " + "only saved in the output folder.") add_verbose_arg(p) add_overwrite_arg(p) @@ -116,8 +119,6 @@ def plot_eigenvalues(pca, principaldf, nb_metrics, out_file): logging.info('Plotting results...') eigenvalues = pca.explained_variance_ pos = list(range(1, nb_metrics + 1)) - plt.clf() - plt.cla() fig = plt.figure() ax = fig.add_subplot(1, 1, 1) @@ -136,8 +137,7 @@ def plot_explained_variance(pca, principaldf, nb_metrics, out_file): # Plot the explained variance. explained_var = pca.explained_variance_ratio_ pos = list(range(1, nb_metrics + 1)) - plt.clf() - plt.cla() + fig = plt.figure() ax = fig.add_subplot(1, 1, 1) bar_var = ax.bar(pos, explained_var, align='center', @@ -157,8 +157,7 @@ def plot_contribution(pca, principaldf, metrics, out_excel, out_image): output_component = pd.DataFrame(component, index=principaldf.columns, columns=metrics) output_component.to_excel(out_excel, index=True, header=True) - plt.clf() - plt.cla() + fig, axs = plt.subplots(2) fig.suptitle('Graph of the contribution of each measures to the first ' 'and second principal component.', fontsize=10) @@ -209,8 +208,8 @@ def main(): with open(args.list_ids) as f: subjects = f.read().split() + # Loading all matrices. if args.input_connectoflow: - # Loading all matrix. logging.info('Loading all matrices from a Connectoflow output...') files_per_metric = [[os.path.join( args.in_folder, a, 'Compute_Connectivity', '{}.npy'.format(m)) @@ -230,23 +229,31 @@ def main(): # Setting individual matrix shape. mat_shape = matrices_per_metric[0][0].shape + # Creating input structure if args.all_edges: - # Creating input structure using all edges from all subjects. logging.info('Creating PCA input structure with all edges...') else: logging.info('Creating PCA input structure with common edges...') + nb_subjects = len(matrices_per_metric[0]) - common_edges_mask = np.sum(matrices_per_metric[0]) != 0 + # Using the first metric + subj_masks = [m !=0 for m in matrices_per_metric[0]] # Binary per subj + common_edges_mask = np.sum(subj_masks, axis=0) == nb_subjects # Verifying that other metrics have the same common edges for i, matrices in enumerate(matrices_per_metric[1:]): - tmp_mask = np.sum(matrices) != 0 - if not np.array_equal(common_edges_mask, tmp_mask): + tmp_subj_masks = [m !=0 for m in matrices] # Binary per subj + tmp_common = np.sum(tmp_subj_masks, axis=0) == nb_subjects + if not np.array_equal(common_edges_mask, tmp_common): parser.error("Different binary masks (common edge) have been " "generated from metric {} than other metrics. " "Please validate your input data." .format(args.metrics[i])) + plt.figure() + plt.imshow(common_edges_mask) + plt.title(common_edges_mask) + logging.info('Data shows {} common connections across the population.' 
.format(np.sum(common_edges_mask))) @@ -304,6 +311,9 @@ def main(): f'{subjects[s]}_PC{i+1}.npy') save_matrix_in_any_format(filename, out[i, s, :, :]) + if args.show: + plt.show() + if __name__ == "__main__": main() From 48429f890e7e3d39b53aa8711eaa7cd6fc032675 Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Thu, 12 Dec 2024 13:21:35 -0500 Subject: [PATCH 7/8] Fix pep8 --- scripts/scil_connectivity_compute_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/scil_connectivity_compute_pca.py b/scripts/scil_connectivity_compute_pca.py index 4ae84ce71..670c82ddc 100755 --- a/scripts/scil_connectivity_compute_pca.py +++ b/scripts/scil_connectivity_compute_pca.py @@ -237,12 +237,12 @@ def main(): nb_subjects = len(matrices_per_metric[0]) # Using the first metric - subj_masks = [m !=0 for m in matrices_per_metric[0]] # Binary per subj + subj_masks = [m != 0 for m in matrices_per_metric[0]] # Mask per subj common_edges_mask = np.sum(subj_masks, axis=0) == nb_subjects # Verifying that other metrics have the same common edges for i, matrices in enumerate(matrices_per_metric[1:]): - tmp_subj_masks = [m !=0 for m in matrices] # Binary per subj + tmp_subj_masks = [m != 0 for m in matrices] # Binary per subj tmp_common = np.sum(tmp_subj_masks, axis=0) == nb_subjects if not np.array_equal(common_edges_mask, tmp_common): parser.error("Different binary masks (common edge) have been " From 7a6707fcabe934722de3f13f19706b296d29ff64 Mon Sep 17 00:00:00 2001 From: EmmaRenauld Date: Tue, 17 Dec 2024 13:40:39 -0500 Subject: [PATCH 8/8] Fix title --- scripts/scil_connectivity_compute_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/scil_connectivity_compute_pca.py b/scripts/scil_connectivity_compute_pca.py index 670c82ddc..56b09541b 100755 --- a/scripts/scil_connectivity_compute_pca.py +++ b/scripts/scil_connectivity_compute_pca.py @@ -252,7 +252,7 @@ def main(): plt.figure() plt.imshow(common_edges_mask) - plt.title(common_edges_mask) + plt.title("Common edges mask") logging.info('Data shows {} common connections across the population.' .format(np.sum(common_edges_mask)))
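
Note on the common-edges logic that patches 6 to 8 converge on: an edge is
kept only if it is non-zero for every subject. A minimal standalone sketch
(made-up 2x2 matrices, one metric):

    import numpy as np

    matrices = [np.array([[0., 3.], [3., 0.]]),
                np.array([[1., 2.], [2., 0.]])]
    subj_masks = [m != 0 for m in matrices]               # one mask per subject
    common = np.sum(subj_masks, axis=0) == len(matrices)  # non-zero for all
    # common -> [[False, True], [True, False]]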