From 1747d657b23d2b5f719d231d9c98e138dbe6ae6f Mon Sep 17 00:00:00 2001 From: balajtimate <51365402+balajtimate@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:02:07 +0200 Subject: [PATCH] Fix: cdna-generator input and fragment-selector output (#19) * add header to structure-gen output csv #16 * refactor cli arguments #16 * minor changes to cdna-gen #16 * update bug issue template * feat: add headers to fragment-selector output #17 * semantic fixes * refactor: psp output to match cdna-gen input * refactor: psp output to match cdna-gen input #16 --- .github/ISSUE_TEMPLATE/bug_report.md | 11 ---- scRNAsim_toolz/cdna_generator/cdna.py | 18 ++++--- scRNAsim_toolz/cdna_generator/cli.py | 22 ++++++-- scRNAsim_toolz/fragment_selector/cli.py | 4 +- scRNAsim_toolz/priming_site_predictor/psp.py | 57 +++++++++++++------- scRNAsim_toolz/structure_generator/main.py | 7 +++ 6 files changed, 75 insertions(+), 44 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index dd84ea7..891c617 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - **Additional context** Add any other context about the problem here. diff --git a/scRNAsim_toolz/cdna_generator/cdna.py b/scRNAsim_toolz/cdna_generator/cdna.py index db41389..a4f2c29 100644 --- a/scRNAsim_toolz/cdna_generator/cdna.py +++ b/scRNAsim_toolz/cdna_generator/cdna.py @@ -173,15 +173,15 @@ def read_csv(self) -> None: def read_gtf(self) -> None: """Read and process the GTF file. - Reads a GTF file and determines copy numbers from \ - normalized probabilities. + Reads a GTF file and determines copy numbers from + normalized probabilities. Returns: None """ - # returns GTF with essential columns such as \ + # returns GTF with essential columns such as # "feature", "seqname", "start", "end" - # alongside the names of any optional keys \ + # alongside the names of any optional keys # which appeared in the attribute column gtf_df = read_gtf(self.gtf) @@ -204,7 +204,7 @@ def read_gtf(self) -> None: count += 1 else: count = 0 # reset count - # CVS transcript ID + # CSV transcript ID id_csv = str(row["seqname"]).split("_")[1] # Calculate Normalized_Binding_Probability and add to GTF dataframe gtf_df.loc[index, "Normalized_Binding_Probability"] = ( @@ -212,7 +212,7 @@ def read_gtf(self) -> None: ) # Calculate Normalized_Binding_Probability and add to GTF dataframe csv_transcript_copy_number = self.csv_df.loc[ - self.csv_df["ID of transcript"] == int(id_csv), + self.csv_df.iloc[:, 1] == int(id_csv), "Transcript copy number", ].iloc[0] # pop the first value in the frame gtf_df.loc[index, "Transcript_Copy_Number"] = round( @@ -222,6 +222,9 @@ def read_gtf(self) -> None: gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}" prev_id = id_ + gtf_df['Transcript_Copy_Number'] = gtf_df[ + 'Transcript_Copy_Number' + ].astype(int) self.gtf_df = gtf_df def write_fasta(self) -> None: @@ -244,6 +247,7 @@ def write_csv(self) -> None: """ df_to_save = self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]] - df_to_save.to_csv(self.output_csv, index=False) + # Stop outputting header + df_to_save.to_csv(self.output_csv, index=False, header=False) LOG.info("Copy number csv file successfully written to: %s", self.output_csv) diff --git a/scRNAsim_toolz/cdna_generator/cli.py b/scRNAsim_toolz/cdna_generator/cli.py index 55129db..567763c 100644 --- a/scRNAsim_toolz/cdna_generator/cli.py +++ b/scRNAsim_toolz/cdna_generator/cli.py @@ -26,11 +26,16 @@ def main(): description="Generate cDNA sequences based on primer probabilities.", ) parser.add_argument( - "-ifa", "--input_fasta", help="genome fasta file", required=True + "-ifa", + "--input_fasta", + help="genome fasta file", + required=True ) - parser.add_argument("-igtf", "--input_gtf", help="gtf file", required=True) parser.add_argument( - "-ofa", "--output_fasta", help="output fasta file", required=True + "-igtf", + "--input_gtf", + help="gtf file", + required=True ) parser.add_argument( "-icpn", @@ -39,7 +44,16 @@ def main(): required=True, ) parser.add_argument( - "-ocsv", "--output_csv", help="output fasta file", required=True + "-ofa", + "--output_fasta", + help="output fasta file", + required=True + ) + parser.add_argument( + "-ocsv", + "--output_csv", + help="output fasta file", + required=True ) parser.add_argument( '-v', '--version', action='version', diff --git a/scRNAsim_toolz/fragment_selector/cli.py b/scRNAsim_toolz/fragment_selector/cli.py index d12f3ac..ed21fc3 100644 --- a/scRNAsim_toolz/fragment_selector/cli.py +++ b/scRNAsim_toolz/fragment_selector/cli.py @@ -49,8 +49,8 @@ def main(): logger.info("Writing batch %s sequences to %s...", i, args.output) with open(args.output, 'a', encoding="utf-8") as out_file: - for line in term_frags: - out_file.write(f"{line}\n") + for i, line in enumerate(term_frags, 1): + out_file.write(f">Terminal fragment {i}\n{line}\n") def file_validation(fasta_file: str, diff --git a/scRNAsim_toolz/priming_site_predictor/psp.py b/scRNAsim_toolz/priming_site_predictor/psp.py index 2ec4fa7..190c18b 100644 --- a/scRNAsim_toolz/priming_site_predictor/psp.py +++ b/scRNAsim_toolz/priming_site_predictor/psp.py @@ -75,24 +75,45 @@ def create_pandas_df(self): """Create interaction df.""" interaction_list = self.create_list_from_output() interaction_df = pd.DataFrame(interaction_list) - interaction_df['Number_of_interactions'] = int(0) - interaction_df['Interaction_Energy'] = float(0) - transcript = 3 - energy = 5 + # Add header row to interaction_df + interaction_df.columns = [ + 'Id', + 'Query_name', + 'Query_length', + 'Target_name', + 'Target_length', + 'Accessibility_Energy', + 'Hybridization_Energy', + 'Interaction_Energy', + 'Query_start_bp', + 'Query_end_bp', + 'Target start', + 'Target end'] + interaction_df['Number_of_binding_sites'] = int(0) + interaction_df['Binding_Energy'] = float(0) + transcript = 'Target_name' + energy = 'Accessibility_Energy' for _ in interaction_df.index: - interaction_df['Number_of_interactions'] = interaction_df[ + interaction_df['Number_of_binding_sites'] = interaction_df[ transcript ].apply( lambda x: interaction_df[transcript].value_counts()[x] ) - interaction_df['Interaction_Energy'] = interaction_df[ + interaction_df['Binding_Energy'] = interaction_df[ energy ].apply(self.calculate_energy) LOG.info("Calculating normalised interaction energies...") - interaction_df['Normalised_interaction_energy'] = interaction_df[ - 'Interaction_Energy']/interaction_df['Number_of_interactions'] + interaction_df['Binding_Probability'] = interaction_df[ + 'Binding_Energy']/interaction_df['Number_of_binding_sites'] + + # Round energy columns + column_indices = [5, 6, 7, 13, 14] + for index in column_indices: + interaction_df.iloc[:, index] = interaction_df.iloc[ + :, index + ].astype(float).round(2) return interaction_df @@ -101,19 +122,15 @@ def generate_gtf(self): interaction_df = self.create_pandas_df() result = str() - for index in interaction_df.index: + for _, row in interaction_df.iterrows(): result += ( - str(interaction_df.iloc[:, 3][index]) - + '\tRIBlast\tPriming_site\t' - + str(interaction_df.iloc[:, 13][index]) - + '\t' - + str(interaction_df.iloc[:, 12][index]) - + '\t.\t+\t.\t' - + 'Interaction_Energy' + '\t' - + str(interaction_df[ - "Normalised_interaction_energy" - ][index]) - + '\n' + f'{row.iloc[3]}\tRIBlast\tPriming_site\t' + f'{row.iloc[10]}\t{row.iloc[11]}\t.\t+\t.\t' + 'Accessibility_Energy ' + f'"{row.iloc[5]}"; ' + 'Hybridization_Energy ' + f'"{row.iloc[6]}"; ' + 'Interaction_Energy ' + f'"{row.iloc[7]}"; ' + 'Number_of_binding_sites ' + f'"{row.iloc[12]}"; ' + 'Binding_Probability ' + f'"{row.iloc[14]}"\n' ) LOG.info("Generating output gtf file...") diff --git a/scRNAsim_toolz/structure_generator/main.py b/scRNAsim_toolz/structure_generator/main.py index 532af66..c53e6f7 100644 --- a/scRNAsim_toolz/structure_generator/main.py +++ b/scRNAsim_toolz/structure_generator/main.py @@ -439,6 +439,13 @@ def write_sequences(self, filename: str) -> None: """ ids, _, counts = self.get_unique_inclusions() with open(filename, "a", encoding="utf_8") as file_handle: + # Add header to output csv for cdna-generator + if file_handle.tell() == 0: + file_handle.write( + "ID of transcript,ID of parent transcript," + "Transcript copy number\n" + ) + for transcript_id, transcript_count in zip(ids, counts): file_handle.write( f"{transcript_id},{self.ts_id},{transcript_count}\n"