diff --git a/.gitignore b/.gitignore
index 9c35720..9c20808 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ qc/
 *.jpg
 *.png
 .gitignore
+*.egg-info/
+dist/
diff --git a/README.md b/README.md
index 3e18418..83dfd43 100755
--- a/README.md
+++ b/README.md
@@ -69,9 +69,10 @@ Per-subject stat: Panda dataframe `df_sub`:
 - intra-subject MEAN: MEAN[CSA(sI, rX, :)] --> MEAN_intra(sI, rX): `df_sub['mean']`
 - intra-subject STD: STD[CSA(sI, rX, :)] --> STD_intra(sI, rX): `df_sub['std']`
 - intra-subject COV: STD[CSA(sI, rX, :)] / MEAN[CSA(sI, rX, :)] --> COV_intra(sI, rX): `df_sub['cov']`
-- rescale_estimated_subject MEAN: MEAN[CSA(sI, rX, :) / CSA(sI, 1, :)] --> MEAN_rescale_estimated_subject(sI, rX): `df_sub['rescale_estimated']`
+- rescale_estimated_subject MEAN: MEAN[ CSA(sI, rX, :) ] / MEAN[ CSA(sI, 1, :) ] --> MEAN_rescale_estimated_subject(sI, rX): `df_sub['rescale_estimated']`
 - intra-subject error MEAN: MEAN[CSA(sI, rX, :)] - (rX^2 * MEAN[CSA(sI, 1, :)]) --> MEAN_error_intra(sI, rX): `df_sub['error']`
 - intra-subject error in percentage MEAN: [MEAN[CSA(sI, rX, :)] - (rX^2 * MEAN[CSA(sI, 1, :)])] / MEAN[CSA(sI, rX, :)] --> MEAN_error_intra_perc(sI, rX): `df_sub['perc_error']`
+- intra-subject difference MEAN: [MEAN[CSA(sI, 1, :)] - (rX^2 * MEAN[CSA(sI, rX, :)])] --> MEAN_diff(sI, rX): `df_sub['diff']`
@@ -81,9 +82,11 @@ Across-subject stats: Panda dataframe `df_rescale`
 - intra-subject STD: MEAN[STD_intra(:, rX)] --> STD_intra(rX): `df_rescale['std_intra']`
 - rescale_estimated (across subjects) STD: STD[MEAN_rescale_estimated_subject(:, rX)] --> STD_rescale_estimated(rX): `df_rescale['std_rescale_estimated']`
 - error in percentage (across subjects) MEAN: MEAN[MEAN_error_intra(:, rX)]
 - error in percentage (across subjects) STD: STD[MEAN_error_intra(:, rX)]
+- intra-subject difference STD: STD[MEAN_diff(:, rX)]
 
 Power analysis:
-- sample size: [(z(uncertainty) + z(power))^2 * (2 * STD[MEAN(:, rX)]^2)] / [MEAN[CSA(sI, 1, :)] - MEAN[CSA(sI, rX, :)]]
+- sample size for a between-subject study: [(z(uncertainty) + z(power))^2 * (2 * STD[MEAN(:, rX)]^2)] / [MEAN[CSA(sI, 1, :)] - MEAN[CSA(sI, rX, :)]]^2
+- sample size for a within-subject study: [(z(uncertainty) + z(power))^2 * (STD[MEAN_diff(:, rX)]^2)] / [MEAN[CSA(sI, 1, :)] - MEAN[CSA(sI, rX, :)]]^2
 
 Plot results:
 - STD_intersub
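For concreteness, here is a minimal sketch of the two sample-size formulas added to the README above. All numbers are made-up example values (mean CSAs, STDs); the z-scores are the usual normal quantiles for a bilateral test at 5% uncertainty and 80% power, and the denominator is the squared mean CSA difference, matching the computation in `csa_rescale_stat.py` further down in this patch:

```python
from math import ceil
from scipy.stats import norm

# z-scores: bilateral 5% uncertainty and 80% power (normal quantiles)
z_uncertainty = norm.ppf(0.975)  # ~1.96
z_power = norm.ppf(0.80)         # ~0.84

# Example values (made up): un-scaled vs scaled mean CSA and the two STDs
mean_csa_control = 80.0  # MEAN[CSA(sI, 1, :)], in mm^2
mean_csa_scaled = 76.0   # MEAN[CSA(sI, rX, :)], in mm^2
std_between = 7.5        # STD[MEAN(:, rX)], across subjects
std_diff = 3.0           # STD[MEAN_diff(:, rX)], of the paired difference

delta = mean_csa_control - mean_csa_scaled
# between-subject study: two independent groups, hence the factor 2
n_between = ceil((z_uncertainty + z_power) ** 2 * (2 * std_between ** 2) / delta ** 2)
# within-subject (longitudinal) study: variance of the paired difference
n_within = ceil((z_uncertainty + z_power) ** 2 * std_diff ** 2 / delta ** 2)
print(n_between, n_within)  # 56 and 5 participants per group for these example values
```

Because the paired difference removes the between-subject variance, the within-subject design needs far fewer participants for the same detectable atrophy, which is the motivation for the new `diff`-based statistics above.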
diff --git a/csa_rescale_stat.py b/csa_rescale_stat.py
index 1894186..1373bee 100755
--- a/csa_rescale_stat.py
+++ b/csa_rescale_stat.py
@@ -17,13 +17,26 @@
 import pandas as pd
 import numpy as np
+import logging
 import os
+import sys
 import argparse
+import matplotlib
 import matplotlib.pyplot as plt
 from math import ceil
 from ruamel.yaml import YAML
+from scipy import stats
+from matplotlib import cm
+
+# Initialize logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)  # default: logging.DEBUG, logging.INFO
+hdlr = logging.StreamHandler(sys.stdout)
+logging.root.addHandler(hdlr)
+
+FNAME_LOG = 'log_stats.txt'
 
 
 # Parser
 #########################################################################################
@@ -78,9 +91,7 @@ def concatenate_csv_files(path_results):
             files.append(path)
     if not files:
         raise FileExistsError("Folder {} does not contain any results csv file.".format(path_results))
-    #metrics = pd.concat(
-    #    [pd.read_csv(f).assign(rescale=os.path.basename(f).split('_')[4].split('.csv')[0]) for f in files])
-    print("Concatenate csv files. This will take a few seconds...")
+    logger.info("Concatenate csv files. This will take a few seconds...")
     metrics = pd.concat(pd.read_csv(f) for f in files)
     # output csv file in PATH_RESULTS
     metrics.to_csv(os.path.join(path_results, r'csa_all.csv'))
@@ -109,10 +120,10 @@ def plot_perc_err(df, path_output):
     plt.xlabel('area rescaling in %')
     plt.ylabel('STD in %')
     plt.grid()
-    plt.tight_layout()
+    #plt.tight_layout()
     output_file = os.path.join(path_output, "fig_err.png")
     plt.savefig(output_file)
-    print("--> Created figure: {}".format(output_file))
+    logger.info(f"--> Created figure: {output_file}")
 
 
 def boxplot_csa(df, path_output):
     """
     :param df: dataframe with csv files data: df_sub
     :param path_output: directory in which plot is saved
     """
-    # TODO: round xlabel
-    # TODO: find a way to display ylabel title with superscript
     fig2 = plt.figure()
-    # round rescale area
-    df['rescale_area'] = round(df['rescale_area'], ndigits=0).astype(int)
-    df.boxplot(column=['mean'], by='rescale_area', showmeans=True, meanline=True)
+    meanpointprops = dict(marker='x', markeredgecolor='red', markerfacecolor='red')
+    df.boxplot(column=['mean'], by=df['rescale_area'].round(1), positions=sorted(set(df['rescale_area'].values)), showmeans=True, meanprops=meanpointprops)
+    min_rescale = min(df['rescale_area'].values)
+    min_rescale_r = min(df['rescale'].values)
+    max_rescale = max(df['rescale_area'].values)
+    max_y = df.groupby('rescale').get_group(1).mean()['mean']
+    plt.plot([min_rescale, max_rescale], [max_y * (min_rescale_r ** 2), max_y], ls="--", c=".3")
     plt.title('Boxplot of CSA in function of area rescaling')
     plt.suptitle("")
-    plt.ylabel('csa in mm^2')
-    plt.xlabel('area rescaling in %')
+    plt.ylabel('CSA in mm^2')
+    plt.xlabel('CSA scaling')
     output_file = os.path.join(path_output, "fig_boxplot_csa.png")
     plt.savefig(output_file)
-    print("--> Created figure: {}".format(output_file))
+    logger.info("--> Created figure: {}".format(output_file))
 
 
 def boxplot_atrophy(df, path_output):
     """
@@ -143,98 +157,86 @@ def boxplot_atrophy(df, path_output):
     :param df: dataframe with csv files data: df_sub
     :param path_output: directory in which plot is saved
     """
     fig = plt.figure()
     # convert to percentage
     df['rescale_estimated'] = df['rescale_estimated'] * 100
-    # round rescale area
-    df['rescale_area'] = round(df['rescale_area'], ndigits=0).astype(int)
-    df.boxplot(column='rescale_estimated', by='rescale_area', positions=sorted(set(df['rescale_area'].values)), showmeans=True, meanline=True, figsize=(10,8))
+    meanpointprops = dict(marker='x', markeredgecolor='red', markerfacecolor='red')
+    df.boxplot(column='rescale_estimated', by=df['rescale_area'].round(1), positions=sorted(set(df['rescale_area'].values)), showmeans=True, meanline=False, figsize=(10, 8), meanprops=meanpointprops)
     min_rescale = min(df['rescale_area'].values)
     max_rescale = max(df['rescale_area'].values)
     plt.plot([min_rescale, max_rescale], [min_rescale, max_rescale], ls="--", c=".3")
-    plt.title('boxplot of measured atrophy in function of area rescaling')
-    plt.ylabel('measured atrophy in %')
-    plt.xlabel('area rescaling in %')
+    plt.title('Boxplot of measured atrophy in function of CSA scaling')
     plt.suptitle("")
-    # TODO: scale x and y similarly
-    # TODO: add diagonal (remove horizontal lines)
+    plt.ylabel('Atrophied CSA in % of the un-rescaled CSA')
+    plt.xlabel('CSA scaling')
+    plt.axis('scaled')
     output_file = os.path.join(path_output, "fig_boxplot_atrophy.png")
     plt.savefig(output_file)
-    print("--> Created figure: {}".format(output_file))
+    logger.info("--> Created figure: {}".format(output_file))
 
 
-def plot_sample_size(z_conf, z_power, std, mean_csa, path_output):
+def plot_sample_size(z_conf, z_power, std_arr, mean_csa, path_output):
     """plot minimum number of patients required to detect an atrophy of a given value
     :param z_conf: z score for X % uncertainty. Example: z_conf=1.96
     :param z_power: z score for X % Power. Example: z_power=(0.84, 1.282)
-    :param std: STD around mean CSA of control subjects (without rescaling),
+    :param std_arr: STD around mean CSA of control subjects (without rescaling),
     CSA STD for atrophied subjects and control subjects are considered equal
-    :param mean_csa: mean value of CSA to compute the atrophy percentage. Example: 80
+    :param mean_csa: mean value of CSA to compute the atrophy percentage. Example: 80 mm^2
    :param path_output: directory in which plot is saved
     """
     fig_sample, ax = plt.subplots()
     # data for plotting
-    n = []
+    n_t1 = []
+    n_t2 = []
     for z_p in z_power:
-        atrophy = np.arange(1.5, 8.0, 0.05)  # x_axis values ranging from 1.5 to 8.0 mm^2
-        num_n = 2 * ((z_conf + z_p) ** 2) * (std ** 2)  # numerator of sample size equation
-        n.append(num_n / ((atrophy) ** 2))
+        # x_axis values ranging from 1.5 to 8.0 mm^2
+        atrophy = np.arange(1.5, 8.0, 0.05)
+        # numerator of sample size equation, T1w
+        num_n_t1 = 2 * ((z_conf + z_p) ** 2) * (std_arr[0] ** 2)
+        n_t1.append(num_n_t1 / ((0.01 * atrophy * mean_csa[0]) ** 2))
+        # numerator of sample size equation, T2w
+        num_n_t2 = 2 * ((z_conf + z_p) ** 2) * (std_arr[1] ** 2)
+        n_t2.append(num_n_t2 / ((0.01 * atrophy * mean_csa[1]) ** 2))
     # plot
-    ax.plot(atrophy, n[0], label='80% power')
-    ax.plot(atrophy, n[1], label='90% power')
+    ax.plot(atrophy / mean_csa[0] * 100, n_t1[0], 'tab:blue', label='T1w 80% power')
+    ax.plot(atrophy / mean_csa[0] * 100, n_t1[1], 'tab:blue', linestyle='--', label='T1w 90% power')
+    ax.plot(atrophy / mean_csa[1] * 100, n_t2[0], 'tab:red', label='T2w 80% power')
+    ax.plot(atrophy / mean_csa[1] * 100, n_t2[1], 'tab:red', linestyle='--', label='T2w 90% power')
     ax.set_ylabel('number of participants per group of study \n(patients or controls) with ratio 1:1')
-    ax.set_xlabel('atrophy in mm^2')
-    # create global variable for secax (second axis) functions
-    global mean_csa_sample
-    mean_csa_sample = mean_csa
-    ax.set_title('minimum number of participants to detect an atrophy with 5% uncertainty\n std = ' + str(
-        round(std, 2)) + 'mm², mean_csa = ' + str(mean_csa_sample) + 'mm²')
+    ax.set_xlabel('atrophy in %')
+    plt.suptitle('minimum number of participants to detect an atrophy with 5% uncertainty', fontsize=16, y=1.05)
+    ax.set_title('T1w (1.0 mm iso): SD = {} mm², mean CSA = {} mm² \nT2w (0.8 mm iso): SD = {} mm², mean CSA = {} mm²'.format(
        str(round(std_arr[0], 2)), str(mean_csa[0]), str(round(std_arr[1], 2)), str(mean_csa[1])))
     ax.legend()
     ax.grid()
-    def forward(atrophy):
-        return atrophy / mean_csa_sample * 100
-
-    def inverse(atrophy):
-        return atrophy / 100 * mean_csa_sample
-
-    secax = ax.secondary_xaxis('top', functions=(forward, inverse))
-    secax.set_xlabel('atrophy in %')
     output_file = os.path.join(path_output, "fig_min_subj.png")
     plt.savefig(output_file, bbox_inches='tight')
-    print("--> Created figure: {}".format(output_file))
+    logger.info("--> Created figure: {}".format(output_file))
 
 
-def sample_size(df, config_param):
-    """
-    Calculate the minimum number of patients required to detect an atrophy of a given value (i.e. power analysis),
-    ratio patients/control 1:1 and with the assumption that both samples have the same STD.
-    ref: Suresh and Chandrashekara 2012. "Sample size estimation and power analysis for clinical research studies"
-    doi: 10.4103/0974-1208.97779
-    :param df: dataframe for computing stats across subject: df_rescale
-    :param config_param: configuration parameters can be modified in config.yaml file. Example conf = 0.95
-    :return sample_size: sample size for each rescaling
+def error_function_of_csa(df, path_output):
+    """Scatter plot of CSA as a function of Mean error %
+    :param df: dataframe for computing stats per subject: df_sub
+    :param path_output: directory in which plot is saved
     """
-    sample_size = []
-    # configuration parameters can be modified in config.yaml file
-    # conf = confidence level
-    conf = config_param['stats']['sample_size']['conf']
-    # power = power level
-    power = config_param['stats']['sample_size']['power']
-    z_score_dict = {'confidence_Level': [0.60, 0.70, 0.8, 0.85, 0.90, 0.95],
-                    'z_value': [0.842, 1.04, 1.28, 1.44, 1.64, 1.96], }
-
-    df_z = pd.DataFrame(z_score_dict)
-    df_z = df_z.set_index('confidence_Level')
-    for name, group in df.groupby('rescale'):
-        std = group['std_inter'].values[0]
-        mean_patient = group['mean_inter'].values[0]
-        mean_control = df.groupby('rescale').get_group(1)['mean_inter'].values[0]
-        atrophy = mean_control - mean_patient
-        if atrophy != 0:
-            num_n = 2 * ((df_z.at[conf, 'z_value'] + df_z.at[power, 'z_value']) ** 2) * (std ** 2)
-            deno_n = (abs(atrophy)) ** 2
-            sample_size.append(ceil(num_n / deno_n))
-        else:
-            sample_size.append('inf')
-    return sample_size
+    fig, ax = plt.subplots(figsize=(7, 7))
+    df['Normalized CSA in mm²'] = df['mean'].div(df['rescale']**2)
+    # compute linear regression
+    z = np.polyfit(x=df.loc[:, 'perc_error'], y=df.loc[:, 'Normalized CSA in mm²'], deg=1)
+    p = np.poly1d(z)
+    # plot
+    viridis = cm.get_cmap('viridis', 6)
+    df.plot.scatter(x='perc_error', y='Normalized CSA in mm²', c='rescale', colormap=viridis)
+    min_err = min(df['perc_error'].values)
+    max_err = max(df['perc_error'].values)
+    plt.plot([min_err, max_err], [p(min_err), p(max_err)], ls="--", c=".3")
+    plt.title('Normalized CSA in function of error %,\n linear regression: {}'.format(p))
+    plt.xlabel('Mean error %')
+    plt.grid()
+    output_file = os.path.join(path_output, "fig_err_in_function_of_csa.png")
+    plt.savefig(output_file, bbox_inches='tight')
+    logger.info("--> Created figure: {}".format(output_file))
+
 
 def error_function_of_intra_cov(df, path_output):
     """Scatter plot of intra-subject COV in function of error %
@@ -248,64 +250,51 @@ def error_function_of_intra_cov(df, path_output):
     :param df: dataframe for computing stats per subject: df_sub
     :param path_output: directory in which plot is saved
     """
     fig, ax = plt.subplots(figsize=(7, 7))
     # compute linear regression
     z = np.polyfit(x=df.loc[:, 'perc_error'], y=df.loc[:, 'cov'], deg=1)
     p = np.poly1d(z)
     # plot
-    df.plot.scatter(x='perc_error', y='cov', c='rescale', colormap='viridis')
+    viridis = cm.get_cmap('viridis', 6)
+    df.plot.scatter(x='perc_error', y='cov', c='rescale', colormap=viridis)
     min_err = min(df['perc_error'].values)
     max_err = max(df['perc_error'].values)
-    plt.plot([min_err, max_err], [min_err*z[0]+z[1], max_err*z[0]+z[1]], ls="--", c=".3")
-    ax.set_xlabel('perc_error')
-    ax.set_ylabel('cov')
+    plt.plot([min_err, max_err], [p(min_err), p(max_err)], ls="--", c=".3")
+    plt.xlabel('Mean error %')
+    plt.ylabel('COV')
     plt.title('COV in function of % error,\n linear regression: {}'.format(p))
-    plt.title("COV in function of % error")
-    ax.legend(loc='upper right')
     plt.grid()
     output_file = os.path.join(path_output, "fig_err_in_function_of_cov.png")
     plt.savefig(output_file, bbox_inches='tight')
-    print("--> Created figure: {}".format(output_file))
+    logger.info("--> Created figure: {}".format(output_file))
 
 
 def error_function_of_intra_cov_outlier(df, path_output):
-    """Scatter plot of intra-subject COV in function of error % to identify outliers with either high error %
-    or high intra-subject COV
+    """Scatter plot of intra-subject COV in function of error % to identify the worst outliers (outside the interval
+    [Q1-10IQR, Q3+10IQR] of percentage error)
     :param df: dataframe for computing stats per subject: df_sub
     :param path_output: directory in which plot is saved
     """
     fig, ax = plt.subplots(figsize=(7, 7))
-    # identified outliers either high error % or high intra-subject COV of t1 images
-    outliers_t1_all = ['sub-brnoUhb01', 'sub-brnoUhb02', 'sub-brnoUhb03', 'sub-brnoUhb06', 'sub-brnoUhb07', 'sub-brnoUhb08',
-                       'sub-barcelona04', 'sub-barcelona06', 'sub-beijingPrisma03', 'sub-milan03', 'sub-oxfordFmrib01',
-                       'sub-tokyo750w03', 'sub-pavia04']
-    # identified t1 outliers remaining after subjects removed due to missing vertebrae levels missing CSA
-    outliers_t1 = ['sub-brnoUhb01', 'sub-brnoUhb08', 'sub-milan03', 'sub-oxfordFmrib01', 'sub-cmrrb05',
-                   'sub-tokyo750w03', 'sub-pavia04']
-    # identified outliers either high error % or high intra-subject COV of t2 images
-    outliers_t2_all = ['sub-tokyo750w02', 'sub-tokyo750w04', 'sub-tokyo750w06', 'sub-beijingVerio02',
-                       'sub-beijingVerio03', 'sub-ubc02', 'sub-vuiisIngenia05']
-    # identified t2 outliers remaining after subjects removed due to missing vertebrae levels missing CSA
-    outliers_t2 = ['sub-tokyo750w02', 'sub-tokyo750w04', 'sub-tokyo750w06', 'sub-beijingVerio02', 'sub-beijingVerio03',
-                   'sub-ubc02', 'sub-vuiisIngenia05']
+    Q1 = df['perc_error'].quantile(0.25)
+    Q3 = df['perc_error'].quantile(0.75)
+    # IQR = inter-quartile range
+    IQR = Q3 - Q1
+    # identify outliers with either high error % or high intra-subject COV
+    outliers = set(df[(df['perc_error'] <= Q1 - 10 * IQR) | (df['perc_error'] >= Q3 + 10 * IQR)]['subject'])
     # remove rescale=1 because error=0
     df = df[df['rescale'] != 1]
     ax.scatter(df['perc_error'], df['cov'], color='tab:blue', label='others')
-    # plot scatter outliers of t1w images
-    for outlier_t1 in outliers_t1:
-        df_t1 = df.groupby(['subject']).get_group(outlier_t1)
-        ax.scatter(df_t1['perc_error'], df_t1['cov'], color='tab:red', label=outlier_t1)
-        df_t1 = []
-    # plot scatter outliers of t2w images
-    for outlier_t2 in outliers_t2:
-        df_t2 = df.groupby(['subject']).get_group(outlier_t2)
-        ax.scatter(df_t2['perc_error'], df_t2['cov'], color='tab:olive', label=outlier_t2)
-        df_t2 = []
+    # scatter outliers
+    for outlier in outliers:
+        df_outlier = df.groupby(['subject']).get_group(outlier)
+        ax.scatter(df_outlier['perc_error'], df_outlier['cov'], color='tab:red', label=outlier)
     # plot
-    ax.set_xlabel('perc_error')
-    ax.set_ylabel('cov')
-    plt.title("COV in function of error %")
+    ax.set_xlabel('Mean error %')
+    ax.set_ylabel('COV')
+    plt.title("Intra-subject COV as a function of mean absolute error %")
     ax.legend(loc='upper right')
     plt.grid()
     # save image
     output_file = os.path.join(path_output, "fig_err_in_function_of_cov_outlier.png")
     plt.savefig(output_file, bbox_inches='tight')
-    print("--> Created figure: {}".format(output_file))
+    logger.info("--> Created figure: {}".format(output_file))
 
 
 def add_columns_df_sub(df):
@@ -332,6 +321,97 @@ def add_columns_df_sub(df):
     df = df.reset_index()
     return df
 
+
+def pearson(df, df_rescale):
+    """Compute Pearson's correlation coefficients and p-values between subjects' mean CSA and mean CSA error,
+    and between COV across Monte Carlo transformations (i.e. intra-subject variability) and mean CSA error.
+    :param df: dataframe for computing stats per subject: df_sub
+    :param df_rescale: dataframe for computing stats per rescale: df_rescale
+    :return df_rescale: modified dataframe with added pearson_cov, p_value_cov, pearson_csa and p_value_csa
+    """
+    pearson_cov = []
+    p_value_cov = []
+    pearson_csa = []
+    p_value_csa = []
+    for rescale, group in df.groupby('rescale'):
+        if rescale != 1:
+            pearson_cov.append(stats.pearsonr(group['cov'], group['perc_error'])[0])
+            p_value_cov.append(stats.pearsonr(group['cov'], group['perc_error'])[1])
+            pearson_csa.append(stats.pearsonr(group['mean'], group['perc_error'])[0])
+            p_value_csa.append(stats.pearsonr(group['mean'], group['perc_error'])[1])
+        else:
+            pearson_cov.append(1)
+            p_value_cov.append(0)
+            pearson_csa.append(1)
+            p_value_csa.append(0)
+    df_rescale['pearson_cov'] = pearson_cov
+    df_rescale['p_value_cov'] = p_value_cov
+    df_rescale['pearson_csa'] = pearson_csa
+    df_rescale['p_value_csa'] = p_value_csa
+    return df_rescale
+
+
+def sample_size(df, df_sub, df_rescale, itt=50):
+    """Compute the minimum sample size (number of subjects) necessary to detect an atrophy in a between-subject
+    study (based on a two-sample bilateral t-test) and in a within-subject study (repeated measures in a
+    longitudinal study: based on a two-sample bilateral paired t-test).
+    ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3148614/
+    :param df: dataframe with CSA values across transformations: df
+    :param df_sub: dataframe for computing stats per subject: df_sub
+    :param df_rescale: dataframe for computing stats per rescale: df_rescale
+    :param itt: number of Monte Carlo iterations
+    :return df_rescale: modified dataframe with added sample_size_80 (between subjects at 80% power),
+    sample_size_90 (between subjects at 90% power), sample_size_long_80 (within subjects at 80% power) and
+    sample_size_long_90 (within subjects at 90% power)
+    """
+    sample_size_80 = []
+    sample_size_90 = []
+    sample_size_long_80 = []
+    sample_size_long_90 = []
+    logger.info("Computing sample size using Monte Carlo simulation with {} iterations. This might take a while...".format(itt))
+    # Compute mean sample size using a Monte Carlo simulation to evaluate variability of measures
+    for n in range(itt):
+        for rescale_r, group_r in df.groupby('rescale'):
+            for sub, subgroup in group_r.groupby('subject'):
+                # for each subject in each scaling, pick one transformation (Monte Carlo sample) and add the value in new column "sample" of df_sub
+                df_sub.loc[(df_sub['rescale'] == rescale_r) & (df_sub['subject'] == sub), 'sample'] = \
+                    df.loc[(df['rescale'] == rescale_r) & (df['subject'] == sub)].sample(n=1)['MEAN(area)'].values[0]
+                # for each subject in each scaling, compute the difference between one scaled and one un-scaled transformation (Monte Carlo sample) and add the value in new column "diff" of df_sub
+                df_sub.loc[(df_sub['rescale'] == rescale_r) & (df_sub['subject'] == sub), 'diff'] = \
+                    df.loc[(df['rescale'] == 1) & (df['subject'] == sub)].sample(n=1)['MEAN(area)'].values[0] - \
+                    df.loc[(df['rescale'] == rescale_r) & (df['subject'] == sub)].sample(n=1)['MEAN(area)'].values[0]
+        # regroup results per scaling
+        df_rescale['mean_diff'] = df_sub.groupby('rescale').mean()['diff'].values
+        df_rescale['std_diff'] = df_sub.groupby('rescale').std()['diff'].values
+        df_rescale['std_sample'] = df_sub.groupby('rescale').std()['sample'].values
+
+        for rescale, group in df_sub.groupby('rescale'):
+            if rescale != 1:
+                # compute mean difference between groups using theoretic scaling and average un-scaled CSA
+                CSA_mean_diff = df_sub.groupby('rescale').get_group(1).mean()['mean'] * (1 - (rescale ** 2))
+                # sample size between-subject
+                # var is the sum of the variances of un-scaled and scaled CSA across subjects (simulating two unpaired study arms)
+                var = df_rescale.groupby('rescale').get_group(1)['std_sample'].values[0] ** 2 + df_rescale.groupby('rescale').get_group(rescale)['std_sample'].values[0] ** 2
+                sample_size_80.append((((1.96 + 0.84) ** 2) * var) / (CSA_mean_diff ** 2))
+                sample_size_90.append((((1.96 + 1.28) ** 2) * var) / (CSA_mean_diff ** 2))
+                # sample size within-subject
+                # var_diff is the variance of the difference between un-scaled and scaled CSA across subjects (simulating longitudinal CSA measures)
+                var_diff = df_rescale.groupby('rescale').get_group(rescale)['std_diff'].values[0] ** 2
+                sample_size_long_80.append((((1.96 + 0.84) ** 2) * var_diff) / (CSA_mean_diff ** 2))
+                sample_size_long_90.append((((1.96 + 1.28) ** 2) * var_diff) / (CSA_mean_diff ** 2))
+            else:
+                # to avoid dividing by zero
+                sample_size_80.append(np.inf)
+                sample_size_90.append(np.inf)
+                sample_size_long_80.append(np.inf)
+                sample_size_long_90.append(np.inf)
+    # Compute mean and SD of computed sample sizes
+    df_rescale['sample_size_80'] = np.mean(np.reshape(sample_size_80, (itt, -1)), axis=0)
+    df_rescale['std_sample_size_80'] = np.std(np.reshape(sample_size_80, (itt, -1)), axis=0)
+    df_rescale['sample_size_90'] = np.mean(np.reshape(sample_size_90, (itt, -1)), axis=0)
+    df_rescale['std_sample_size_90'] = np.std(np.reshape(sample_size_90, (itt, -1)), axis=0)
+    df_rescale['sample_size_long_80'] = np.mean(np.reshape(sample_size_long_80, (itt, -1)), axis=0)
+    df_rescale['std_sample_size_long_80'] = np.std(np.reshape(sample_size_long_80, (itt, -1)), axis=0)
+    df_rescale['sample_size_long_90'] = np.mean(np.reshape(sample_size_long_90, (itt, -1)), axis=0)
+    df_rescale['std_sample_size_long_90'] = np.std(np.reshape(sample_size_long_90, (itt, -1)), axis=0)
+    return df_rescale
 
 
 def main():
     """
@@ -343,6 +423,14 @@ def main():
     path_results = os.path.abspath(os.path.expanduser(arguments.i))
     vertlevels_input = arguments.l
     path_output = os.path.abspath(arguments.o)
+    if not os.path.exists(path_output):
+        os.mkdir(path_output)
+
+    # Dump log file there (remove any previous log)
+    if os.path.exists(os.path.join(path_output, FNAME_LOG)):
+        os.remove(os.path.join(path_output, FNAME_LOG))
+    fh = logging.FileHandler(os.path.join(path_output, FNAME_LOG))
+    logging.root.addHandler(fh)
 
     # aggregate all csv results files
     concatenate_csv_files(path_results)
@@ -355,23 +443,24 @@ def main():
     pd.set_option('display.max_rows', None)
 
     # identify rows with missing values
-    print("Remove rows with missing values...")
+    logger.info("Remove rows with missing values...")
     lines_to_drop = df_vert[df_vert['MEAN(area)'] == 'None'].index
     df_vert['subject'] = list(sub.split('data_processed/')[1].split('/anat')[0] for sub in df_vert['Filename'])
     # remove rows with missing values
     df_vert = df_vert.drop(df_vert.index[lines_to_drop])
     df_vert['MEAN(area)'] = pd.to_numeric(df_vert['MEAN(area)'])
-    print("  Rows removed: {}".format(lines_to_drop))
+    logger.info("  Rows removed: {}".format(lines_to_drop))
 
     # fetch parameters from config.yaml file
     config_param = yaml_parser(arguments.config)
 
     # add useful columns to dataframe
     df_vert['basename'] = list(os.path.basename(path).split('.nii.gz')[0] for path in df_vert['Filename'])
-    df_vert['rescale'] = list(float(b.split('RPI_r_r')[1].split('_')[0]) for b in df_vert['basename'])
+    df_vert['rescale'] = list(float(b.split('_r')[1].split('_')[0]) for b in df_vert['basename'])
     df_vert['slices'] = list(int(slices.split(':')[1]) - int(slices.split(':')[0]) + 1 for slices in df_vert['Slice (I->S)'])
+
     # verify if vertlevels of interest were given in input by user
     if vertlevels_input is None:
         vertlevels = list(set(df_vert['VertLevel'].values))
@@ -380,7 +469,7 @@ def main():
         if not all(elem in set(list(df_vert['VertLevel'].values)) for elem in vertlevels):
             raise ValueError("\nInput vertebral levels '{}' do not exist in csv files".format(vertlevels))
     # register vertebrae levels of interest (Default: all vertebrae levels in csv files)
-    print("Stats are averaged across vertebral levels: {}".format(vertlevels))
+    logger.info(f"Stats are averaged across vertebral levels: {vertlevels}")
 
     # Create new dataframe with only selected vertebral levels
     df = df_vert[df_vert['VertLevel'].isin(vertlevels)]
@@ -398,7 +487,7 @@ def main():
     df = df.drop('basename', 1)
 
     # Create dataframe for computing stats per subject: df_sub
-    print("\n==================== subject_dataframe ==========================\n")
+    logger.info("\n==================== subject_dataframe ==========================\n")
     df_sub = pd.DataFrame()
     # add necessary columns to df_sub dataframe
     df_sub['rescale'] = df.groupby(['rescale', 'subject']).mean().reset_index()['rescale']
@@ -412,14 +501,19 @@ def main():
     df_sub['cov'] = df_sub['std'].div(df_sub['mean'])
     df_sub = add_columns_df_sub(df_sub)
     df_sub['rescale_estimated'] = df_sub['mean'].div(df_sub['csa_without_rescale'])
-    df_sub['error'] = (df_sub['mean'] - df_sub['theoretic_csa']).abs()
-    df_sub['perc_error'] = 100 * (df_sub['mean'] - df_sub['theoretic_csa']).abs().div(df_sub['theoretic_csa'])
-    print(df_sub)
+
+    df_sub['error'] = (df_sub['mean'] - df_sub['theoretic_csa'])
+    df_sub['perc_error'] = 100 * (df_sub['mean'] - df_sub['theoretic_csa']).div(df_sub['theoretic_csa'])
+    for rescale, group in df.groupby('rescale'):
+        for sub, subgroup in group.groupby('subject'):
+            # draw one random transformation per subject and scaling (Monte Carlo sample)
+            df_sub.loc[(df_sub['rescale'] == rescale) & (df_sub['subject'] == sub), 'sample'] = subgroup.sample(n=1)['MEAN(area)'].values[0]
+
     # save dataframe in a csv file
     df_sub.to_csv(os.path.join(path_output, r'csa_sub.csv'))
 
     # Create dataframe for computing stats across subject: df_rescale
-    print("\n==================== rescaling_dataframe ==========================\n")
+    logger.info("\n==================== rescaling_dataframe ==========================\n")
     df_rescale = pd.DataFrame()
     df_rescale['rescale'] = df_sub.groupby(['rescale']).mean().reset_index()['rescale']
     df_rescale['rescale_area'] = df_sub.groupby('rescale_area').mean().reset_index()['rescale_area']
@@ -430,15 +524,18 @@ def main():
     df_rescale['std_intra'] = df_sub.groupby('rescale').mean()['std'].values
     df_rescale['cov_intra'] = df_sub.groupby('rescale').mean()['cov'].values
     df_rescale['std_inter'] = df_sub.groupby('rescale').std()['mean'].values
+    df_rescale['cov_inter'] = df_sub.groupby('rescale').std()['mean'].div(
+        df_sub.groupby('rescale').mean()['mean']).values
     df_rescale['mean_rescale_estimated'] = df_sub.groupby('rescale').mean()['rescale_estimated'].values
     df_rescale['std_rescale_estimated'] = df_sub.groupby('rescale').std()['rescale_estimated'].values
     df_rescale['mean_perc_error'] = df_sub.groupby('rescale').mean()['perc_error'].values
     df_rescale['mean_error'] = df_sub.groupby('rescale').mean()['error'].values
     df_rescale['std_perc_error'] = df_sub.groupby('rescale').std()['perc_error'].values
-    df_rescale['sample_size'] = sample_size(df_rescale, config_param)
-    print(df_rescale)
+
+    df_rescale = pearson(df_sub, df_rescale)
+    df_rescale = sample_size(df, df_sub, df_rescale)
     # save dataframe in a csv file
-    df_sub.to_csv(os.path.join(path_output, r'csa_transfo.csv'))
+    df_rescale.to_csv(os.path.join(path_output, r'csa_rescale.csv'))
 
     # plot graph if verbose is present
     if arguments.fig:
@@ -461,11 +558,14 @@ def main():
         z_score_power = config_param['fig']['sample_size']['power']
         # std = STD of subjects without rescaling CSA values
         # mean_csa = mean CSA value of subjects without rescaling
-        plot_sample_size(z_conf=z_score_confidence, z_power=z_score_power, std=df_rescale.loc[1, 'std_inter'], mean_csa=df_rescale.loc[1, 'mean_inter'], path_output=path_output)
+        plot_sample_size(z_conf=z_score_confidence, z_power=z_score_power, std_arr=[7.56, 8.29],
+                         mean_csa=[69.70, 76.12], path_output=path_output)
         # scatter plot of COV in function of error %
         error_function_of_intra_cov(df_sub, path_output=path_output)
         # scatter plot of COV in function of error % to identify outliers
         error_function_of_intra_cov_outlier(df_sub, path_output=path_output)
+        # scatter plot of CSA in function of error %
+        error_function_of_csa(df_sub, path_output=path_output)
 
 if __name__ == "__main__":
     main()
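The `sample_size()` function added above estimates both sample sizes by Monte Carlo resampling: on each iteration it draws one random transformation per subject and scaling, and the STD of the per-subject scaled-vs-unscaled difference drives the within-subject estimate. A stripped-down, self-contained sketch of that resampling loop on a toy dataframe (the subjects, values, and the 0.98 scaling are made up; column names follow the script):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Toy stand-in for the per-transformation CSA table (values are made up)
rescale = np.tile(np.repeat([1.0, 0.98], 5), 3)  # 5 transformations per scaling
df = pd.DataFrame({
    'subject': np.repeat(['sub-01', 'sub-02', 'sub-03'], 10),
    'rescale': rescale,
    'MEAN(area)': rng.normal(80, 2, 30) * rescale ** 2,  # CSA scales with r^2
})

std_diffs = []
for _ in range(50):  # Monte Carlo iterations, as in sample_size(itt=50)
    per_subject_diff = []
    for sub, g in df.groupby('subject'):
        # one random un-scaled and one random scaled measure per subject
        unscaled = g[g['rescale'] == 1.0].sample(1)['MEAN(area)'].iloc[0]
        scaled = g[g['rescale'] == 0.98].sample(1)['MEAN(area)'].iloc[0]
        per_subject_diff.append(unscaled - scaled)
    # STD of the paired difference drives the within-subject sample size
    std_diffs.append(np.std(per_subject_diff))

print(np.mean(std_diffs))  # averaged across iterations, as in the script
```

Repeating the draw many times and averaging smooths out the randomness of picking a single transformation per subject, which is why the script reports both the mean and the SD of the computed sample sizes.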
diff --git a/job_template.sh b/job_template.sh
index 922c5c0..972ba17 100644
--- a/job_template.sh
+++ b/job_template.sh
@@ -3,7 +3,7 @@
 #SBATCH --nodes=1
 #SBATCH --cpus-per-task=32  # number of OpenMP processes
 #SBATCH --mem=128G
-#SBATCH --mail-user=paul.bautin@polymtl.ca
+#SBATCH --mail-user=sandrine.bedard@polymtl.ca
 #SBATCH --mail-type=ALL
 export OMP_NUM_THREADS=1
 export MKL_NUM_THREADS=1
diff --git a/process_data.sh b/process_data.sh
index 15d9f91..2c389f5 100755
--- a/process_data.sh
+++ b/process_data.sh
@@ -49,48 +49,55 @@ segment_and_label_if_does_not_exist(){
   local file="$1"
   local contrast="$2"
   local contrast_str="$3"
-  segment_if_does_not_exist $file ${contrast} "-qc ${PATH_QC} -qc-subject ${SUBJECT}"
+  FILESEG="${file}_seg"
+  sct_deepseg -i ${file}.nii.gz -task seg_sc_contrast_agnostic #-qc $PATH_QC -qc-subject ${SUBJECT}
+  #sct_qc -i ${file}.nii.gz -s ${FILESEG}.nii.gz -p sct_deepseg_sc -qc $PATH_QC -qc-subject ${SUBJECT}
+  #segment_if_does_not_exist $file ${contrast} "-qc ${PATH_QC} -qc-subject ${SUBJECT}"
   local file_seg=${FILESEG}
   # Update global variable with segmentation file name
-  FILELABEL="${file}_labels-disc"
-  FILELABELMANUAL="${path_derivatives}/${SUBJECT}_${contrast_str}_labels-disc-manual"
+  FILELABEL="${file}_label-discs_dlabel"  # TODO: change for updated names
+  FILELABELMANUAL="${path_derivatives}/${SUBJECT}_${contrast_str}_label-discs_dlabel"
   if [ -e "${FILELABELMANUAL}.nii.gz" ]; then
     echo "manual labeled file was found: ${FILELABELMANUAL}"
+    rsync -avzh ${FILELABELMANUAL}.nii.gz ${FILELABEL}.nii.gz
     # reorienting and resampling image
-    sct_image -i ${FILELABELMANUAL}.nii.gz -setorient RPI -o "${FILELABELMANUAL}_RPI.nii.gz"
-    sct_maths -i ${FILELABELMANUAL}_RPI.nii.gz -dilate 2 -o ${FILELABELMANUAL}_RPI_dil.nii.gz
-    sct_resample -i ${FILELABELMANUAL}_RPI_dil.nii.gz -mm $interp -x nn -o ${FILELABELMANUAL}_RPI_dil_r.nii.gz
-    rsync -avzh "${FILELABELMANUAL}_RPI_dil_r.nii.gz" ${FILELABEL}.nii.gz
+    #sct_image -i ${FILELABELMANUAL}.nii.gz -setorient RPI -o "${FILELABELMANUAL}_RPI.nii.gz"
+    #sct_maths -i ${FILELABELMANUAL}_RPI.nii.gz -dilate 2 -o ${FILELABELMANUAL}_RPI_dil.nii.gz
+    #sct_resample -i ${FILELABELMANUAL}_RPI_dil.nii.gz -mm $interp -x nn -o ${FILELABELMANUAL}_RPI_dil_r.nii.gz
+    #rsync -avzh "${FILELABELMANUAL}_RPI_dil_r.nii.gz" ${FILELABEL}.nii.gz
     # Generate labeled segmentation
-    sct_label_vertebrae -i ${file}.nii.gz -s ${file_seg}.nii.gz -c ${contrast} -discfile "${FILELABELMANUAL}_RPI_dil_r.nii.gz" -qc ${PATH_QC} -qc-subject ${SUBJECT}
+    sct_image -i ${file_seg}.nii.gz -set-sform-to-qform
+    sct_image -i ${file}.nii.gz -set-sform-to-qform
+    sct_image -i "${FILELABEL}.nii.gz" -set-sform-to-qform
+    sct_label_vertebrae -i ${file}.nii.gz -s ${file_seg}.nii.gz -c ${contrast} -discfile "${FILELABEL}.nii.gz" #-qc ${PATH_QC} -qc-subject ${SUBJECT}
   else
     # Generate labeled segmentation
-    sct_label_vertebrae -i ${file}.nii.gz -s ${file_seg}.nii.gz -c ${contrast} -qc ${PATH_QC} -qc-subject ${SUBJECT}
+    sct_label_vertebrae -i ${file}.nii.gz -s ${file_seg}.nii.gz -c ${contrast} #-qc ${PATH_QC} -qc-subject ${SUBJECT}
   fi
   # Create labels in the Spinal Cord
-  sct_label_utils -i ${file_seg}_labeled.nii.gz -vert-body 0 -o ${FILELABEL}.nii.gz
+  #sct_label_utils -i ${file_seg}_labeled.nii.gz -vert-body 0 -o ${FILELABEL}.nii.gz
   FILE_SEG_LABEL=${file_seg}_labeled
 }
 
 # Check if manual segmentation already exists. If it does, copy it locally. If
 # it does not, perform seg.
-segment_if_does_not_exist(){
-  local file="$1"
-  local contrast="$2"
-  local qc=$3
-  # Update global variable with segmentation file name
-  FILESEG="${file}_seg"
-  FILESEGMANUAL="${path_derivatives}/${FILESEG}-manual"
-  if [ -e "${FILESEGMANUAL}.nii.gz" ]; then
-    echo "Found! Using manual segmentation."
-    sct_resample -i ${FILESEGMANUAL}.nii.gz -mm $interp -x nn -o ${FILESEGMANUAL}_r.nii.gz
-    rsync -avzh ${FILESEGMANUAL}_r.nii.gz ${FILESEG}.nii.gz
-    sct_qc -i ${file}.nii.gz -s ${FILESEG}.nii.gz -p sct_deepseg_sc $qc
-  else
-    # Segment spinal cord
-    sct_deepseg_sc -i ${file}.nii.gz -c ${contrast} $qc
-  fi
-}
+# segment_if_does_not_exist(){
+#   local file="$1"
+#   local contrast="$2"
+#   local qc=$3
+#   # Update global variable with segmentation file name
+#   FILESEG="${file}_seg"
+#   FILESEGMANUAL="${path_derivatives}/${FILESEG}-manual"
+#   if [ -e "${FILESEGMANUAL}.nii.gz" ]; then
+#     echo "Found! Using manual segmentation."
+#     sct_resample -i ${FILESEGMANUAL}.nii.gz -mm $interp -x nn -o ${FILESEGMANUAL}_r.nii.gz
+#     rsync -avzh ${FILESEGMANUAL}_r.nii.gz ${FILESEG}.nii.gz
+#     sct_qc -i ${file}.nii.gz -s ${FILESEG}.nii.gz -p sct_deepseg_sc $qc
+#   else
+#     # Segment spinal cord
+#     sct_deepseg_sc -i ${file}.nii.gz -c ${contrast} $qc
+#   fi
+# }
 
 
 # SCRIPT STARTS HERE
@@ -98,9 +105,7 @@
 # Display useful info for the log, such as SCT version, RAM and CPU cores available
 sct_check_dependencies -short
 # copy derivatives directory containing manual corrections to PATH_DATA_PROCESSED
-mkdir -p "${PATH_DATA_PROCESSED}/derivatives/labels/${SUBJECT}/anat/"
-cp -r "${PATH_DATA}/derivatives/labels/${SUBJECT}/anat" "${PATH_DATA_PROCESSED}/derivatives/labels/${SUBJECT}"
-path_derivatives="${PATH_DATA_PROCESSED}/derivatives/labels/${SUBJECT}/anat"
+path_derivatives="${PATH_DATA}/derivatives/labels/${SUBJECT}/anat"
 # Go to results folder, where most of the outputs will be located
 cd $PATH_DATA_PROCESSED
 # Copy source images
@@ -124,13 +129,13 @@ fi
 file=${SUBJECT}_${contrast_str}
 # Reorient image to RPI
 sct_image -i ${file}.nii.gz -setorient RPI -o ${file}_RPI.nii.gz
-file=${file}_RPI
 # Resample image isotropically
-sct_resample -i ${file}.nii.gz -mm $interp -o ${file}_r.nii.gz
-file=${file}_r
-end=`date +%s`
-runtime=$((end-start))
-echo "+++++++++++ TIME: Duration of reorienting and resampling: $(($runtime / 3600))hrs $((($runtime / 60) % 60))min $(($runtime % 60))sec"
+sct_resample -i ${file}_RPI.nii.gz -mm $interp -o ${file}_RPI_r.nii.gz
+mv "${file}_RPI_r.nii.gz" "${SUBJECT}_space-other_${contrast_str}.nii.gz"
+file=${SUBJECT}_space-other_${contrast_str}
+# end=`date +%s`
+# runtime=$((end-start))
+# echo "+++++++++++ TIME: Duration of reorienting and resampling: $(($runtime / 3600))hrs $((($runtime / 60) % 60))min $(($runtime % 60))sec"
 
 # Label spinal cord (only if it does not exist) in dir anat
 start=`date +%s`
@@ -198,9 +203,13 @@ for r_coef in ${R_COEFS[@]}; do
     file_label_r_t=${file_label_r_t}_crop
 
-    # Segment spinal cord (only if it does not exist)
+    # Segment spinal cord
     start=`date +%s`
-    sct_deepseg_sc -i ${file_r_t}.nii.gz -c ${contrast}
+    #sct_deepseg_sc -i ${file_r_t}.nii.gz -c ${contrast}
+    sct_deepseg -i ${file_r_t}.nii.gz -task seg_sc_contrast_agnostic  # -qc $PATH_QC -qc-subject ${SUBJECT}
+    # TODO: soft csa?
+    #sct_qc -i ${file_r_t}.nii.gz -s ${file_r_t}_seg.nii.gz -p sct_deepseg_sc -qc $PATH_QC -qc-subject ${SUBJECT}
+
     end=`date +%s`
     runtime=$((end-start))
     echo "+++++++++++ TIME: Duration of segmentation t${i_transfo}: $(($runtime / 3600))hrs $((($runtime / 60) % 60))min $(($runtime % 60))sec"
diff --git a/requirements.txt b/requirements_cc.txt
similarity index 51%
rename from requirements.txt
rename to requirements_cc.txt
index f110068..dacd8a0 100755
--- a/requirements.txt
+++ b/requirements_cc.txt
@@ -5,9 +5,9 @@ matplotlib
 scikit-image
 nibabel
 ruamel.yaml
+argparse
+setuptools
+PyYAML
+coloredlogs
+networkx
 yaml-1.3
-argparse~=1.4.0
-setuptools~=49.6.0
-PyYAML~=5.3.1
-coloredlogs~=14.0
-
diff --git a/requirements_manual_pip.txt b/requirements_manual_pip.txt
new file mode 100755
index 0000000..099fd2e
--- /dev/null
+++ b/requirements_manual_pip.txt
@@ -0,0 +1,2 @@
+networkx==3.2.1
+yaml-1.3
diff --git a/setup.py b/setup.py
index f0786f9..2ca613a 100644
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@
     ],
     keywords='',
     install_requires=install_reqs,
+    packages=[],
    entry_points={
         'console_scripts': [
             'affine_transfo=affine_transfo:main',