Merge pull request #10 from EbonyGunwhy/main

feat: update scripts to comply with modified Reproducibility study functionality
QIB-Sheffield · Sep 23, 2023 · a1bae54 · a1bae54
2 parents f49ed92 + b2f57ee
commit a1bae54
Show file tree

Hide file tree

Showing 6 changed files with 156 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -97,14 +97,14 @@ results/
 |    |     |
 |    |     |---- 01_model_outputs/
 |    |     |    |---- figures/
-|    |     |    |       |---- per_drug/
+|    |     |    |       |---- per_substudy/
 |    |     |    |       |---- per_rat/
 |    |     |    |---- relaxation_rates_and_signals
 |    |     |    |---- all_parameters.csv
-|    |     |---- 02_effect_sizes/
-|    |     |    |---- figures/
-|    |     |    |---- effect_sizes.csv
 |    |     |    |---- fit_errors.txt
+|    |     |---- 02_analyses/
+|    |     |    |---- figures/
+|    |     |    |---- repeatability/
 ```
 
 As the tracer kinetic model used in this study produces estimated parameter
@@ -113,7 +113,11 @@ Therefore, upon each execution of the code, a top-level directory named after
 the date the analysis was conducted is created for storing the results from
 that particular execution. For reference, the results and figures presented in
 the accompanying manuscript to the `Six Test Compounds` study were created using the
-outputs contained in `results/SixTestCompounds/2022-09-01`.
+outputs contained in `results/SixTestCompounds/2022-09-01`. Please note that the
+results contained in the Zenodo archive were created using a previous release of this
+software (https://github.com/QIB-Sheffield/TRISTAN-rat/releases/tag/v1.0.0), which
+output a slightly different results folder structure. Please see the previous version
+notes for more details.
 
 `01_model_outputs` contains all outputs generated as a result of the tracer kinetic
 model fitting. Within this, plotted signal time curves for each acquisition per 

diff --git a/src/Reproducibility.py b/src/Reproducibility.py
@@ -27,8 +27,11 @@ def main(study: str
     # Split control and treatment groups
     signal_dict = signals.split_groups(files, filenames)
     # Fit data and get all estimated parameter variables
-    all_parameters = signals.fit_data(study, filenames, files,
-                                      signal_dict, TristanRat)
+    all_parameters = signals.fit_data(study,
+                                      filenames,
+                                      files,
+                                      signal_dict,
+                                      TristanRat)
 
     # Get time curve averages per drug and per day
     subject_list = signals.get_subject_list(signal_dict)
@@ -363,7 +366,8 @@ def main(study: str
                     None,
                     'rocket',
                     95,
-                    ylabels=['$K_{trans}$', '$k_{bh}$'])
+                    ylabels=['$K_{trans}$', '$k_{bh}$'],
+                    sharey=False)
     # plot saline-rifampicin data
     print("saline-rifampicin: Plotting individual biomarker distributions between Day 1 and Day 2")
     plots.pairplots(study,
@@ -375,7 +379,8 @@ def main(study: str
                     None,
                     'rocket',
                     95,
-                    ylabels=['$K_{trans}$', '$k_{bh}$'])
+                    ylabels=['$K_{trans}$', '$k_{bh}$'],
+                    sharey=False)
 
     # MIXED ANOVA (saline retest data)
     print("Performing mixed ANOVA for saline-saline retest data")

diff --git a/src/SixTestCompounds.py b/src/SixTestCompounds.py
@@ -4,6 +4,9 @@
 the tracer kinetic modelling and generate all resulting reports
 and figures for a specific study of interest.
 """
+from pathlib import Path
+import os
+import shutil
 import argparse
 import itertools
 import data
@@ -20,8 +23,30 @@ def main(study: str
     Args:
         study: Study name of interest (e.g., 'SixTestCompounds')
     """
-    # Get files and filenames
-    files, filenames = data.get_files(study, '01_signals')
+    # Get original files and filenames
+    orig_files, orig_filenames = data.get_files(study, '01_signals')
+    # Modify filenames to include study number
+    # to comply with updated functionality used in Reproducibility study
+    renamed_folder = os.path.join(Path(orig_files[0]).parent.parent,
+                                  '01_signals_renamed')
+    data.make_dir(renamed_folder)
+    for n in orig_files:
+        if Path(n).stem.split('_')[0]=='Asunaprevir':
+            shutil.copyfile(n, os.path.join(renamed_folder, '5_' + Path(n).stem + '.csv'))
+        elif Path(n).stem.split('_')[0]=='Pioglitazone':
+            shutil.copyfile(n, os.path.join(renamed_folder, '6_' + Path(n).stem + '.csv'))
+        elif Path(n).stem.split('_')[0]=='Ketoconazole':
+            shutil.copyfile(n, os.path.join(renamed_folder, '7_' + Path(n).stem + '.csv'))
+        elif Path(n).stem.split('_')[0]=='Cyclosporine':
+            shutil.copyfile(n, os.path.join(renamed_folder, '8_' + Path(n).stem + '.csv'))
+        elif Path(n).stem.split('_')[0]=='Bosentan':
+            shutil.copyfile(n, os.path.join(renamed_folder, '10_' + Path(n).stem + '.csv'))
+        elif Path(n).stem.split('_')[0]=='BosentanHigh':
+            shutil.copyfile(n, os.path.join(renamed_folder, '9_' + Path(n).stem + '.csv'))
+        elif Path(n).stem.split('_')[0]=='Rifampicin':
+            shutil.copyfile(n, os.path.join(renamed_folder, '12_' + Path(n).stem + '.csv'))
+    # Get updated files and filenames
+    files, filenames = data.get_files(study, '01_signals_renamed')
     # Split control and treatment groups
     signal_dict = signals.split_groups(files, filenames)
     # Fit data and get all estimated parameter variables
@@ -36,31 +61,40 @@ def main(study: str
     for curve in ['Delta R1 Liver (s-1)', 'Delta R1 Liver fit (s-1)',
                   'Delta R1 Spleen (s-1)']:
         signals.get_average_curves(signal_dict, subject_list, curve)
-
-    # Update dictionary keys for average delta R1 plots
+
     fits = signal_dict
-    fits['G2 Ciclosporin'] = fits.pop('Cyclosporine')
-    fits['G2 Rifampicin'] = fits.pop('Rifampicin')
-    fits['D Ketoconazole'] = fits.pop('Ketoconazole')
-    fits['E Asunaprevir'] = fits.pop('Asunaprevir')
-    fits['E Pioglitazone'] = fits.pop('Pioglitazone')
-    fits['G1 Bosentan_2mg'] = fits.pop('Bosentan')
-    fits['G1 Bosentan_high'] = fits.pop('BosentanHigh')
-
     # Plot average delta R1 time curves per drug and per day
-    for drug, day in list(itertools.product(fits.keys(), [1, 2])):
-        print(f"{drug}, Liver fit: Saving average deltaR1 plot")
-        # For fitted liver data
-        plots.get_deltaR1_plots(fits, drug, 'Liver', study,
-                                is_fitted=True, YLIM=(-1.5, 4.5))
-        # For observed liver data only
-        print(f"{drug}, Liver: Saving average deltaR1 plot")
-        plots.get_deltaR1_plots(fits, drug, 'Liver', study,
-                                is_fitted=False, YLIM=(-1.5, 4.5))
-        # For observed spleen data only
-        print(f"{drug}, Spleen: Saving average deltaR1 plot")
-        plots.get_deltaR1_plots(fits, drug, 'Spleen', study,
-                                is_fitted=False, YLIM=(-0.25, 1))
+    for substudy, day in list(itertools.product(fits.keys(), [1, 2])):
+        try:
+            print(f"{substudy}, Liver fit: Saving average deltaR1 plot")
+            # For fitted liver data
+            plots.get_deltaR1_plots(fits,
+                                    substudy,
+                                    'Liver',
+                                    study,
+                                    is_fitted=True,
+                                    YLIM=(-1.5, 4.5))
+            # For observed liver data only
+            print(f"{substudy}, Liver: Saving average deltaR1 plot")
+            plots.get_deltaR1_plots(fits,
+                                    substudy,
+                                    'Liver',
+                                    study,
+                                    is_fitted=False,
+                                    YLIM=(-1.5, 4.5))
+            # For observed spleen data only
+            print(f"{substudy}, Spleen: Saving average deltaR1 plot")
+            plots.get_deltaR1_plots(fits,
+                                    substudy,
+                                    'Spleen',
+                                    study,
+                                    is_fitted=False,
+                                    YLIM=(-0.25, 1))
+        except KeyError:
+            continue
+
+    # Convert substudy string number labels to integers
+    all_parameters['Substudy'] = all_parameters['Substudy'].astype(int)
 
     # Create dictionary to rename sites into more comprehensive format
     site_names = {'Bosentan': 'Bosentan_2mg',
@@ -76,8 +110,12 @@ def main(study: str
     # Remove missing data
     # and computational fitting errors from all estimated parameter data
     print("Removing computational fitting errors and missing data")
-    all_parameters_cleaned = analyses.remove_data_errors(all_parameters,
-                                                         study)
+    all_parameters_cleaned = data.remove_data_errors(all_parameters,
+                                                     study)
+    # Remove subjects with insufficient number of observations
+    all_parameters_cleaned = (data
+                              .remove_insufficient_data(all_parameters_cleaned,
+                                                        study))
 
     # Create list of condition variables to group by
     variables = ['Drug', 'Symbol', 'Site']
@@ -86,17 +124,25 @@ def main(study: str
 
     # Obtain effect size summaries and save as csv
     print("Calculating average effect sizes")
-    analyses.save_effect_sizes(all_parameters_cleaned,
-                               params,
-                               variables,
-                               study)
+    # Get statistical summary for saline-saline data
+    single_subject, overall = (analyses.get_retest_results(study,
+                                                           'effects',
+                                                           all_parameters_cleaned.query("Symbol in @params")))
 
     # Plot biomarker distributions between Day 1 and Day 2 per rat
-    for biomarker in params:
-        print(f"{biomarker}: Plotting individual biomarker \
+    print("Plotting individual biomarker \
               distributions between Day 1 and Day 2")
-        plots.pairplots(all_parameters, str(biomarker), study)
-
+    plots.pairplots(study,
+                    'effects',
+                    all_parameters_cleaned.query("Symbol in @params"),
+                    'Day',
+                    'Rat',
+                    'Drug',
+                    'Symbol',
+                    'rocket',
+                    95,
+                    ylabels=['$K_{trans}$', '$k_{he}$', '$k_{bh}$'],
+                    sharey='row')
     print("Done!")
 
 

diff --git a/src/analyses.py b/src/analyses.py
@@ -339,15 +339,24 @@ def get_retest_results(study: str,
         substudy average levels, respectively.
     """
     # Pivot single-subject data
-    data_per_subject = pd.pivot_table(cleaned_parameter_data,
-                                      values='Value',
-                                      index=['Substudy',
-                                             'Symbol',
-                                             'Site',
-                                             'Fstrength',
-                                             'Time_period',
-                                             'Rat'],
-                                      columns='Day')
+    if study=='Reproducibility':
+        data_per_subject = pd.pivot_table(cleaned_parameter_data,
+                                        values='Value',
+                                        index=['Substudy',
+                                                'Symbol',
+                                                'Site',
+                                                'Fstrength',
+                                                'Time_period',
+                                                'Rat'],
+                                        columns='Day')
+    else:
+        data_per_subject = pd.pivot_table(cleaned_parameter_data,
+                                        values='Value',
+                                        index=['Substudy',
+                                                'Symbol',
+                                                'Site',
+                                                'Rat'],
+                                        columns='Day')
     # Get mean substudy values
     data_per_substudy = (cleaned_parameter_data
                          .groupby(['Substudy',

diff --git a/src/data.py b/src/data.py
@@ -103,8 +103,8 @@ def get_metadata(filename: str,
     within the file. Works only when filename is formatted as
     a string containing study descriptors (metadata) separated
     by underscores, i.e.,
-    filename = "compound_site_RatNumber_dayNumber_dataType"
-    e.g., "Asunaprevir_E_Rat2_2_Signals"
+    filename = "substudy_compound_site_RatNumber_dayNumber_dataType"
+    e.g., "5_Asunaprevir_E_Rat2_2_Signals"
 
     Args:
         filename: File name of interest.
@@ -258,17 +258,27 @@ def remove_insufficient_data(parameter_data: pd.DataFrame,
     Returns:
         Cleaned DataFrame.
     """
-    data_pivoted = pd.pivot_table(parameter_data,
-                                  values='Value',
-                                  columns=['Symbol'],
-                                  index=['Substudy',
-                                         'Drug',
-                                         'Site',
-                                         'Fstrength',
-                                         'Site_Fstrength',
-                                         'Time_period',
-                                         'Rat',
-                                         'Day'])
+    if study=='Reproducibility':
+        data_pivoted = pd.pivot_table(parameter_data,
+                                    values='Value',
+                                    columns=['Symbol'],
+                                    index=['Substudy',
+                                            'Drug',
+                                            'Site',
+                                            'Fstrength',
+                                            'Site_Fstrength',
+                                            'Time_period',
+                                            'Rat',
+                                            'Day'])
+    else:
+        data_pivoted = pd.pivot_table(parameter_data,
+                                    values='Value',
+                                    columns=['Symbol'],
+                                    index=['Substudy',
+                                            'Drug',
+                                            'Site',
+                                            'Rat',
+                                            'Day'])
     # Remove subjects with missing acquisition on day 1 or day 2
     missing_days_removed = (data_pivoted[data_pivoted
                                          .groupby(['Substudy',

diff --git a/src/plots.py b/src/plots.py
@@ -163,8 +163,8 @@ def get_deltaR1_plots(signals: dict,
         fig_name = f"{substudy}_{ROI}_deltaR1"
 
     plt.suptitle(f"Group mean {ROI} gadoxetate profiles in control and \
-                 inhibitory phases \n (error bars represent \
-                 standard deviation)")
+                 \n inhibitory phases \
+                 \n (error bars represent standard deviation)")
     g.set_title(f"{substudy}", weight='bold')
     g.set_xlabel("Time [min]", weight='bold')
     g.set_ylabel("\u0394 $R_{1}$ [$s^{-1}$]", weight='bold')
@@ -302,6 +302,7 @@ def pairplots(study: str,
               palette: str,
               error: int,
               ylabels: list,
+              sharey: 'str'
               ) -> None:
     """Plots paired data distributions per biomarker.
 
@@ -329,24 +330,28 @@ def pairplots(study: str,
                     col=col,
                     row=row,
                     kind="point",
-                    sharey=False,
+                    sharey=sharey,
                     palette=palette,
                     height=8,
                     aspect=1,
                     legend=False,
                     ci=error)
-
-    (g.set_titles("")
-     .axes[0, 1].set(ylim=([0, 0.4])))
-    g.axes[0, 0].set(ylim=([0, 1.5]))
+
+    if study=='Reproducibility':
+        (g.set_titles("")
+         .axes[0, 1].set(ylim=([0, 0.4])))
+        g.axes[0, 0].set(ylim=([0, 1.5]))
+        for i in range(len(ylabels)):
+            g.axes[0, i].set_ylabel(f"{ylabels[i]} [mL/min/mL]")
+    else:
+        g.set_titles(template='{col_name}')
+        for i in range(len(ylabels)):
+            g.axes[i, 0].set_ylabel(f"{ylabels[i]} [mL/min/mL]")
 
     for ax in g.axes.flatten():
         ax.tick_params(labelleft=True, labelbottom=True)
 
-    plt.legend(title='Rat', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
-
-    for i in range(len(ylabels)):
-        g.axes[0, i].set_ylabel(f"{ylabels[i]} [mL/min/mL]")
+    plt.legend(title=hue, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
 
     g.fig.tight_layout()
     save_name = data.get_results_folder(study,