From 1747d657b23d2b5f719d231d9c98e138dbe6ae6f Mon Sep 17 00:00:00 2001
From: balajtimate <51365402+balajtimate@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:02:07 +0200
Subject: [PATCH] Fix: cdna-generator input and fragment-selector output (#19)

* add header to structure-gen output csv #16

* refactor cli arguments #16

* minor changes to cdna-gen #16

* update bug issue template

* feat: add headers to fragment-selector output #17

* semantic fixes

* refactor: psp output to match cdna-gen input

* refactor: psp output to match cdna-gen input #16
---
 .github/ISSUE_TEMPLATE/bug_report.md         | 11 ----
 scRNAsim_toolz/cdna_generator/cdna.py        | 18 ++++---
 scRNAsim_toolz/cdna_generator/cli.py         | 22 ++++++--
 scRNAsim_toolz/fragment_selector/cli.py      |  4 +-
 scRNAsim_toolz/priming_site_predictor/psp.py | 57 +++++++++++++-------
 scRNAsim_toolz/structure_generator/main.py   |  7 +++
 6 files changed, 75 insertions(+), 44 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index dd84ea7..891c617 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen.
 **Screenshots**
 If applicable, add screenshots to help explain your problem.
 
-**Desktop (please complete the following information):**
- - OS: [e.g. iOS]
- - Browser [e.g. chrome, safari]
- - Version [e.g. 22]
-
-**Smartphone (please complete the following information):**
- - Device: [e.g. iPhone6]
- - OS: [e.g. iOS8.1]
- - Browser [e.g. stock browser, safari]
- - Version [e.g. 22]
-
 **Additional context**
 Add any other context about the problem here.
diff --git a/scRNAsim_toolz/cdna_generator/cdna.py b/scRNAsim_toolz/cdna_generator/cdna.py
index db41389..a4f2c29 100644
--- a/scRNAsim_toolz/cdna_generator/cdna.py
+++ b/scRNAsim_toolz/cdna_generator/cdna.py
@@ -173,15 +173,15 @@ def read_csv(self) -> None:
     def read_gtf(self) -> None:
         """Read and process the GTF file.
 
-        Reads a GTF file and determines copy numbers from \
-            normalized probabilities.
+        Reads a GTF file and determines copy numbers from
+        normalized probabilities.
 
         Returns: None
 
         """
-        # returns GTF with essential columns such as \
+        # returns GTF with essential columns such as
         # "feature", "seqname", "start", "end"
-        # alongside the names of any optional keys \
+        # alongside the names of any optional keys
         # which appeared in the attribute column
         gtf_df = read_gtf(self.gtf)
 
@@ -204,7 +204,7 @@ def read_gtf(self) -> None:
                 count += 1
             else:
                 count = 0  # reset count
-                # CVS transcript ID
+            # CSV transcript ID
             id_csv = str(row["seqname"]).split("_")[1]
             # Calculate Normalized_Binding_Probability and add to GTF dataframe
             gtf_df.loc[index, "Normalized_Binding_Probability"] = (
@@ -212,7 +212,7 @@ def read_gtf(self) -> None:
             )
             # Calculate Normalized_Binding_Probability and add to GTF dataframe
             csv_transcript_copy_number = self.csv_df.loc[
-                self.csv_df["ID of transcript"] == int(id_csv),
+                self.csv_df.iloc[:, 1] == int(id_csv),
                 "Transcript copy number",
             ].iloc[0]  # pop the first value in the frame
             gtf_df.loc[index, "Transcript_Copy_Number"] = round(
@@ -222,6 +222,9 @@ def read_gtf(self) -> None:
             gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}"
             prev_id = id_
 
+        gtf_df['Transcript_Copy_Number'] = gtf_df[
+            'Transcript_Copy_Number'
+        ].astype(int)
         self.gtf_df = gtf_df
 
     def write_fasta(self) -> None:
@@ -244,6 +247,7 @@ def write_csv(self) -> None:
 
         """
         df_to_save = self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]]
-        df_to_save.to_csv(self.output_csv, index=False)
+        # Write the CSV without a header row
+        df_to_save.to_csv(self.output_csv, index=False, header=False)
         LOG.info("Copy number csv file successfully written to: %s",
                  self.output_csv)
diff --git a/scRNAsim_toolz/cdna_generator/cli.py b/scRNAsim_toolz/cdna_generator/cli.py
index 55129db..567763c 100644
--- a/scRNAsim_toolz/cdna_generator/cli.py
+++ b/scRNAsim_toolz/cdna_generator/cli.py
@@ -26,11 +26,16 @@ def main():
         description="Generate cDNA sequences based on primer probabilities.",
     )
     parser.add_argument(
-        "-ifa", "--input_fasta", help="genome fasta file", required=True
+        "-ifa",
+        "--input_fasta",
+        help="genome fasta file",
+        required=True
     )
-    parser.add_argument("-igtf", "--input_gtf", help="gtf file", required=True)
     parser.add_argument(
-        "-ofa", "--output_fasta", help="output fasta file", required=True
+        "-igtf",
+        "--input_gtf",
+        help="gtf file",
+        required=True
     )
     parser.add_argument(
         "-icpn",
@@ -39,7 +44,16 @@ def main():
         required=True,
     )
     parser.add_argument(
-        "-ocsv", "--output_csv", help="output fasta file", required=True
+        "-ofa",
+        "--output_fasta",
+        help="output fasta file",
+        required=True
+    )
+    parser.add_argument(
+        "-ocsv",
+        "--output_csv",
+        help="output fasta file",
+        required=True
     )
     parser.add_argument(
         '-v', '--version', action='version',
diff --git a/scRNAsim_toolz/fragment_selector/cli.py b/scRNAsim_toolz/fragment_selector/cli.py
index d12f3ac..ed21fc3 100644
--- a/scRNAsim_toolz/fragment_selector/cli.py
+++ b/scRNAsim_toolz/fragment_selector/cli.py
@@ -49,8 +49,8 @@ def main():
 
         logger.info("Writing batch %s sequences to %s...", i, args.output)
         with open(args.output, 'a', encoding="utf-8") as out_file:
-            for line in term_frags:
-                out_file.write(f"{line}\n")
+            for i, line in enumerate(term_frags, 1):
+                out_file.write(f">Terminal fragment {i}\n{line}\n")
 
 
 def file_validation(fasta_file: str,
diff --git a/scRNAsim_toolz/priming_site_predictor/psp.py b/scRNAsim_toolz/priming_site_predictor/psp.py
index 2ec4fa7..190c18b 100644
--- a/scRNAsim_toolz/priming_site_predictor/psp.py
+++ b/scRNAsim_toolz/priming_site_predictor/psp.py
@@ -75,24 +75,45 @@ def create_pandas_df(self):
         """Create interaction df."""
         interaction_list = self.create_list_from_output()
         interaction_df = pd.DataFrame(interaction_list)
-        interaction_df['Number_of_interactions'] = int(0)
-        interaction_df['Interaction_Energy'] = float(0)
-        transcript = 3
-        energy = 5
+        # Assign column names to interaction_df
+        interaction_df.columns = [
+            'Id',
+            'Query_name',
+            'Query_length',
+            'Target_name',
+            'Target_length',
+            'Accessibility_Energy',
+            'Hybridization_Energy',
+            'Interaction_Energy',
+            'Query_start_bp',
+            'Query_end_bp',
+            'Target start',
+            'Target end']
+        interaction_df['Number_of_binding_sites'] = int(0)
+        interaction_df['Binding_Energy'] = float(0)
+        transcript = 'Target_name'
+        energy = 'Accessibility_Energy'
 
         for _ in interaction_df.index:
-            interaction_df['Number_of_interactions'] = interaction_df[
+            interaction_df['Number_of_binding_sites'] = interaction_df[
                 transcript
                 ].apply(
                 lambda x: interaction_df[transcript].value_counts()[x]
             )
-            interaction_df['Interaction_Energy'] = interaction_df[
+            interaction_df['Binding_Energy'] = interaction_df[
                 energy
                 ].apply(self.calculate_energy)
 
         LOG.info("Calculating normalised interaction energies...")
-        interaction_df['Normalised_interaction_energy'] = interaction_df[
-            'Interaction_Energy']/interaction_df['Number_of_interactions']
+        interaction_df['Binding_Probability'] = interaction_df[
+            'Binding_Energy']/interaction_df['Number_of_binding_sites']
+
+        # Round energy columns
+        column_indices = [5, 6, 7, 13, 14]
+        for index in column_indices:
+            interaction_df.iloc[:, index] = interaction_df.iloc[
+                :, index
+            ].astype(float).round(2)
 
         return interaction_df
 
@@ -101,19 +122,15 @@ def generate_gtf(self):
         interaction_df = self.create_pandas_df()
         result = str()
 
-        for index in interaction_df.index:
+        for _, row in interaction_df.iterrows():
             result += (
-                str(interaction_df.iloc[:, 3][index])
-                + '\tRIBlast\tPriming_site\t'
-                + str(interaction_df.iloc[:, 13][index])
-                + '\t'
-                + str(interaction_df.iloc[:, 12][index])
-                + '\t.\t+\t.\t'
-                + 'Interaction_Energy' + '\t'
-                + str(interaction_df[
-                    "Normalised_interaction_energy"
-                    ][index])
-                + '\n'
+                f'{row.iloc[3]}\tRIBlast\tPriming_site\t'
+                f'{row.iloc[10]}\t{row.iloc[11]}\t.\t+\t.\t'
+                'Accessibility_Energy ' + f'"{row.iloc[5]}"; '
+                'Hybridization_Energy ' + f'"{row.iloc[6]}"; '
+                'Interaction_Energy ' + f'"{row.iloc[7]}"; '
+                'Number_of_binding_sites ' + f'"{row.iloc[12]}"; '
+                'Binding_Probability ' + f'"{row.iloc[14]}"\n'
             )
 
         LOG.info("Generating output gtf file...")
diff --git a/scRNAsim_toolz/structure_generator/main.py b/scRNAsim_toolz/structure_generator/main.py
index 532af66..c53e6f7 100644
--- a/scRNAsim_toolz/structure_generator/main.py
+++ b/scRNAsim_toolz/structure_generator/main.py
@@ -439,6 +439,13 @@ def write_sequences(self, filename: str) -> None:
         """
         ids, _, counts = self.get_unique_inclusions()
         with open(filename, "a", encoding="utf_8") as file_handle:
+            # Write CSV header for cdna-generator only if the file is empty
+            if file_handle.tell() == 0:
+                file_handle.write(
+                    "ID of transcript,ID of parent transcript,"
+                    "Transcript copy number\n"
+                )
+
             for transcript_id, transcript_count in zip(ids, counts):
                 file_handle.write(
                     f"{transcript_id},{self.ts_id},{transcript_count}\n"