From 8da7bd0c57b46373252421b98251e616663bf24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 23 Oct 2023 14:16:08 +0200 Subject: [PATCH 1/8] add header to structure-gen output csv #16 --- scRNAsim_toolz/structure_generator/main.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scRNAsim_toolz/structure_generator/main.py b/scRNAsim_toolz/structure_generator/main.py index 532af66..1a453f7 100644 --- a/scRNAsim_toolz/structure_generator/main.py +++ b/scRNAsim_toolz/structure_generator/main.py @@ -439,6 +439,12 @@ def write_sequences(self, filename: str) -> None: """ ids, _, counts = self.get_unique_inclusions() with open(filename, "a", encoding="utf_8") as file_handle: + # Add header to output csv for cdna-generator + if file_handle.tell() == 0: + file_handle.write( + "ID of transcript,ID of parent transcript,Transcript copy number\n" + ) + for transcript_id, transcript_count in zip(ids, counts): file_handle.write( f"{transcript_id},{self.ts_id},{transcript_count}\n" From 31478737f004e9e82961680960332c783e711e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 23 Oct 2023 15:24:30 +0200 Subject: [PATCH 2/8] refactor cli arguments #16 --- scRNAsim_toolz/cdna_generator/cli.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/scRNAsim_toolz/cdna_generator/cli.py b/scRNAsim_toolz/cdna_generator/cli.py index 55129db..567763c 100644 --- a/scRNAsim_toolz/cdna_generator/cli.py +++ b/scRNAsim_toolz/cdna_generator/cli.py @@ -26,11 +26,16 @@ def main(): description="Generate cDNA sequences based on primer probabilities.", ) parser.add_argument( - "-ifa", "--input_fasta", help="genome fasta file", required=True + "-ifa", + "--input_fasta", + help="genome fasta file", + required=True ) - parser.add_argument("-igtf", "--input_gtf", help="gtf file", required=True) parser.add_argument( - "-ofa", "--output_fasta", help="output fasta file", required=True + "-igtf", + "--input_gtf", + help="gtf file", + required=True ) parser.add_argument( "-icpn", @@ -39,7 +44,16 @@ def main(): required=True, ) parser.add_argument( - "-ocsv", "--output_csv", help="output fasta file", required=True + "-ofa", + "--output_fasta", + help="output fasta file", + required=True + ) + parser.add_argument( + "-ocsv", + "--output_csv", + help="output fasta file", + required=True ) parser.add_argument( '-v', '--version', action='version', From eccef31580d615c65ebcba1750453e9d85beae5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 23 Oct 2023 15:33:52 +0200 Subject: [PATCH 3/8] minor changes to cdna-gen #16 --- scRNAsim_toolz/cdna_generator/cdna.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/scRNAsim_toolz/cdna_generator/cdna.py b/scRNAsim_toolz/cdna_generator/cdna.py index db41389..c2e2bed 100644 --- a/scRNAsim_toolz/cdna_generator/cdna.py +++ b/scRNAsim_toolz/cdna_generator/cdna.py @@ -173,15 +173,15 @@ def read_csv(self) -> None: def read_gtf(self) -> None: """Read and process the GTF file. - Reads a GTF file and determines copy numbers from \ - normalized probabilities. + Reads a GTF file and determines copy numbers from + normalized probabilities. Returns: None """ - # returns GTF with essential columns such as \ + # returns GTF with essential columns such as # "feature", "seqname", "start", "end" - # alongside the names of any optional keys \ + # alongside the names of any optional keys # which appeared in the attribute column gtf_df = read_gtf(self.gtf) @@ -204,7 +204,7 @@ def read_gtf(self) -> None: count += 1 else: count = 0 # reset count - # CVS transcript ID + # CSV transcript ID id_csv = str(row["seqname"]).split("_")[1] # Calculate Normalized_Binding_Probability and add to GTF dataframe gtf_df.loc[index, "Normalized_Binding_Probability"] = ( @@ -212,16 +212,17 @@ def read_gtf(self) -> None: ) # Calculate Normalized_Binding_Probability and add to GTF dataframe csv_transcript_copy_number = self.csv_df.loc[ - self.csv_df["ID of transcript"] == int(id_csv), - "Transcript copy number", + self.csv_df.iloc[:, 1] == int(id_csv), "Transcript copy number", ].iloc[0] # pop the first value in the frame gtf_df.loc[index, "Transcript_Copy_Number"] = round( csv_transcript_copy_number * gtf_df.loc[index, "Normalized_Binding_Probability"] ) + gtf_df.loc[index, "Transcript_Copy_Number"] gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}" prev_id = id_ + gtf_df['Transcript_Copy_Number'] = gtf_df['Transcript_Copy_Number'].astype(int) self.gtf_df = gtf_df def write_fasta(self) -> None: @@ -244,6 +245,7 @@ def write_csv(self) -> None: """ df_to_save = self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]] - df_to_save.to_csv(self.output_csv, index=False) + # Stop outputting header + df_to_save.to_csv(self.output_csv, index=False, header=False) LOG.info("Copy number csv file successfully written to: %s", self.output_csv) From fe4788d692dda7a0baf569416089639b3ff36f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 23 Oct 2023 16:15:00 +0200 Subject: [PATCH 4/8] update bug issue template --- .github/ISSUE_TEMPLATE/bug_report.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index dd84ea7..891c617 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - **Additional context** Add any other context about the problem here. From 572a10aa8943d33755057646a435a0652f7433a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 25 Oct 2023 11:14:17 +0200 Subject: [PATCH 5/8] feat: add headers to fragment-selector output #17 --- scRNAsim_toolz/fragment_selector/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scRNAsim_toolz/fragment_selector/cli.py b/scRNAsim_toolz/fragment_selector/cli.py index d12f3ac..ed21fc3 100644 --- a/scRNAsim_toolz/fragment_selector/cli.py +++ b/scRNAsim_toolz/fragment_selector/cli.py @@ -49,8 +49,8 @@ def main(): logger.info("Writing batch %s sequences to %s...", i, args.output) with open(args.output, 'a', encoding="utf-8") as out_file: - for line in term_frags: - out_file.write(f"{line}\n") + for i, line in enumerate(term_frags, 1): + out_file.write(f">Terminal fragment {i}\n{line}\n") def file_validation(fasta_file: str, From 695ef43a16d62074ad0ea1154f3def3f98ae3339 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 25 Oct 2023 11:22:04 +0200 Subject: [PATCH 6/8] semantic fixes --- scRNAsim_toolz/cdna_generator/cdna.py | 8 +++++--- scRNAsim_toolz/structure_generator/main.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scRNAsim_toolz/cdna_generator/cdna.py b/scRNAsim_toolz/cdna_generator/cdna.py index c2e2bed..a4f2c29 100644 --- a/scRNAsim_toolz/cdna_generator/cdna.py +++ b/scRNAsim_toolz/cdna_generator/cdna.py @@ -212,17 +212,19 @@ def read_gtf(self) -> None: ) # Calculate Normalized_Binding_Probability and add to GTF dataframe csv_transcript_copy_number = self.csv_df.loc[ - self.csv_df.iloc[:, 1] == int(id_csv), "Transcript copy number", + self.csv_df.iloc[:, 1] == int(id_csv), + "Transcript copy number", ].iloc[0] # pop the first value in the frame gtf_df.loc[index, "Transcript_Copy_Number"] = round( csv_transcript_copy_number * gtf_df.loc[index, "Normalized_Binding_Probability"] ) - gtf_df.loc[index, "Transcript_Copy_Number"] gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}" prev_id = id_ - gtf_df['Transcript_Copy_Number'] = gtf_df['Transcript_Copy_Number'].astype(int) + gtf_df['Transcript_Copy_Number'] = gtf_df[ + 'Transcript_Copy_Number' + ].astype(int) self.gtf_df = gtf_df def write_fasta(self) -> None: diff --git a/scRNAsim_toolz/structure_generator/main.py b/scRNAsim_toolz/structure_generator/main.py index 1a453f7..c53e6f7 100644 --- a/scRNAsim_toolz/structure_generator/main.py +++ b/scRNAsim_toolz/structure_generator/main.py @@ -442,7 +442,8 @@ def write_sequences(self, filename: str) -> None: # Add header to output csv for cdna-generator if file_handle.tell() == 0: file_handle.write( - "ID of transcript,ID of parent transcript,Transcript copy number\n" + "ID of transcript,ID of parent transcript," + "Transcript copy number\n" ) for transcript_id, transcript_count in zip(ids, counts): From a2b3b973dfc878290d140fde308fe1194c3297d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 25 Oct 2023 14:44:37 +0200 Subject: [PATCH 7/8] refactor: psp output to match cdna-gen input --- scRNAsim_toolz/priming_site_predictor/psp.py | 57 +++++++++++++------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/scRNAsim_toolz/priming_site_predictor/psp.py b/scRNAsim_toolz/priming_site_predictor/psp.py index 2ec4fa7..190c18b 100644 --- a/scRNAsim_toolz/priming_site_predictor/psp.py +++ b/scRNAsim_toolz/priming_site_predictor/psp.py @@ -75,24 +75,45 @@ def create_pandas_df(self): """Create interaction df.""" interaction_list = self.create_list_from_output() interaction_df = pd.DataFrame(interaction_list) - interaction_df['Number_of_interactions'] = int(0) - interaction_df['Interaction_Energy'] = float(0) - transcript = 3 - energy = 5 + # Add header row to interaction_df + interaction_df.columns = [ + 'Id', + 'Query_name', + 'Query_length', + 'Target_name', + 'Target_length', + 'Accessibility_Energy', + 'Hybridization_Energy', + 'Interaction_Energy', + 'Query_start_bp', + 'Query_end_bp', + 'Target start', + 'Target end'] + interaction_df['Number_of_binding_sites'] = int(0) + interaction_df['Binding_Energy'] = float(0) + transcript = 'Target_name' + energy = 'Accessibility_Energy' for _ in interaction_df.index: - interaction_df['Number_of_interactions'] = interaction_df[ + interaction_df['Number_of_binding_sites'] = interaction_df[ transcript ].apply( lambda x: interaction_df[transcript].value_counts()[x] ) - interaction_df['Interaction_Energy'] = interaction_df[ + interaction_df['Binding_Energy'] = interaction_df[ energy ].apply(self.calculate_energy) LOG.info("Calculating normalised interaction energies...") - interaction_df['Normalised_interaction_energy'] = interaction_df[ - 'Interaction_Energy']/interaction_df['Number_of_interactions'] + interaction_df['Binding_Probability'] = interaction_df[ + 'Binding_Energy']/interaction_df['Number_of_binding_sites'] + + # Round energy columns + column_indices = [5, 6, 7, 13, 14] + for index in column_indices: + interaction_df.iloc[:, index] = interaction_df.iloc[ + :, index + ].astype(float).round(2) return interaction_df @@ -101,19 +122,15 @@ def generate_gtf(self): interaction_df = self.create_pandas_df() result = str() - for index in interaction_df.index: + for _, row in interaction_df.iterrows(): result += ( - str(interaction_df.iloc[:, 3][index]) - + '\tRIBlast\tPriming_site\t' - + str(interaction_df.iloc[:, 13][index]) - + '\t' - + str(interaction_df.iloc[:, 12][index]) - + '\t.\t+\t.\t' - + 'Interaction_Energy' + '\t' - + str(interaction_df[ - "Normalised_interaction_energy" - ][index]) - + '\n' + f'{row.iloc[3]}\tRIBlast\tPriming_site\t' + f'{row.iloc[10]}\t{row.iloc[11]}\t.\t+\t.\t' + 'Accessibility_Energy ' + f'"{row.iloc[5]}"; ' + 'Hybridization_Energy ' + f'"{row.iloc[6]}"; ' + 'Interaction_Energy ' + f'"{row.iloc[7]}"; ' + 'Number_of_binding_sites ' + f'"{row.iloc[12]}"; ' + 'Binding_Probability ' + f'"{row.iloc[14]}"\n' ) LOG.info("Generating output gtf file...") From 0176e73ed4747015de4b8ee3c796ec0a27c13899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 25 Oct 2023 14:44:37 +0200 Subject: [PATCH 8/8] refactor: psp output to match cdna-gen input #16 --- scRNAsim_toolz/priming_site_predictor/psp.py | 57 +++++++++++++------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/scRNAsim_toolz/priming_site_predictor/psp.py b/scRNAsim_toolz/priming_site_predictor/psp.py index 2ec4fa7..190c18b 100644 --- a/scRNAsim_toolz/priming_site_predictor/psp.py +++ b/scRNAsim_toolz/priming_site_predictor/psp.py @@ -75,24 +75,45 @@ def create_pandas_df(self): """Create interaction df.""" interaction_list = self.create_list_from_output() interaction_df = pd.DataFrame(interaction_list) - interaction_df['Number_of_interactions'] = int(0) - interaction_df['Interaction_Energy'] = float(0) - transcript = 3 - energy = 5 + # Add header row to interaction_df + interaction_df.columns = [ + 'Id', + 'Query_name', + 'Query_length', + 'Target_name', + 'Target_length', + 'Accessibility_Energy', + 'Hybridization_Energy', + 'Interaction_Energy', + 'Query_start_bp', + 'Query_end_bp', + 'Target start', + 'Target end'] + interaction_df['Number_of_binding_sites'] = int(0) + interaction_df['Binding_Energy'] = float(0) + transcript = 'Target_name' + energy = 'Accessibility_Energy' for _ in interaction_df.index: - interaction_df['Number_of_interactions'] = interaction_df[ + interaction_df['Number_of_binding_sites'] = interaction_df[ transcript ].apply( lambda x: interaction_df[transcript].value_counts()[x] ) - interaction_df['Interaction_Energy'] = interaction_df[ + interaction_df['Binding_Energy'] = interaction_df[ energy ].apply(self.calculate_energy) LOG.info("Calculating normalised interaction energies...") - interaction_df['Normalised_interaction_energy'] = interaction_df[ - 'Interaction_Energy']/interaction_df['Number_of_interactions'] + interaction_df['Binding_Probability'] = interaction_df[ + 'Binding_Energy']/interaction_df['Number_of_binding_sites'] + + # Round energy columns + column_indices = [5, 6, 7, 13, 14] + for index in column_indices: + interaction_df.iloc[:, index] = interaction_df.iloc[ + :, index + ].astype(float).round(2) return interaction_df @@ -101,19 +122,15 @@ def generate_gtf(self): interaction_df = self.create_pandas_df() result = str() - for index in interaction_df.index: + for _, row in interaction_df.iterrows(): result += ( - str(interaction_df.iloc[:, 3][index]) - + '\tRIBlast\tPriming_site\t' - + str(interaction_df.iloc[:, 13][index]) - + '\t' - + str(interaction_df.iloc[:, 12][index]) - + '\t.\t+\t.\t' - + 'Interaction_Energy' + '\t' - + str(interaction_df[ - "Normalised_interaction_energy" - ][index]) - + '\n' + f'{row.iloc[3]}\tRIBlast\tPriming_site\t' + f'{row.iloc[10]}\t{row.iloc[11]}\t.\t+\t.\t' + 'Accessibility_Energy ' + f'"{row.iloc[5]}"; ' + 'Hybridization_Energy ' + f'"{row.iloc[6]}"; ' + 'Interaction_Energy ' + f'"{row.iloc[7]}"; ' + 'Number_of_binding_sites ' + f'"{row.iloc[12]}"; ' + 'Binding_Probability ' + f'"{row.iloc[14]}"\n' ) LOG.info("Generating output gtf file...")