Skip to content

Commit

Permalink
Fix: cdna-generator input and fragment-selector output (#19)
Browse files Browse the repository at this point in the history
* add header to structure-gen output csv #16

* refactor cli arguments #16

* minor changes to cdna-gen #16

* update bug issue template

* feat: add headers to fragment-selector output #17

* semantic fixes

* refactor: psp output to match cdna-gen input

* refactor: psp output to match cdna-gen input #16
  • Loading branch information
balajtimate authored Oct 25, 2023
1 parent e246915 commit 1747d65
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 44 deletions.
11 changes: 0 additions & 11 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
18 changes: 11 additions & 7 deletions scRNAsim_toolz/cdna_generator/cdna.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,15 @@ def read_csv(self) -> None:
def read_gtf(self) -> None:
"""Read and process the GTF file.
Reads a GTF file and determines copy numbers from \
normalized probabilities.
Reads a GTF file and determines copy numbers from
normalized probabilities.
Returns: None
"""
# returns GTF with essential columns such as \
# returns GTF with essential columns such as
# "feature", "seqname", "start", "end"
# alongside the names of any optional keys \
# alongside the names of any optional keys
# which appeared in the attribute column
gtf_df = read_gtf(self.gtf)

Expand All @@ -204,15 +204,15 @@ def read_gtf(self) -> None:
count += 1
else:
count = 0 # reset count
# CVS transcript ID
# CSV transcript ID
id_csv = str(row["seqname"]).split("_")[1]
# Calculate Normalized_Binding_Probability and add to GTF dataframe
gtf_df.loc[index, "Normalized_Binding_Probability"] = (
row["Binding_Probability"] / df_norm_bind_prob[id_]
)
# Look up the transcript copy number for this CSV transcript ID
csv_transcript_copy_number = self.csv_df.loc[
self.csv_df["ID of transcript"] == int(id_csv),
self.csv_df.iloc[:, 1] == int(id_csv),
"Transcript copy number",
].iloc[0] # pop the first value in the frame
gtf_df.loc[index, "Transcript_Copy_Number"] = round(
Expand All @@ -222,6 +222,9 @@ def read_gtf(self) -> None:
gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}"
prev_id = id_

gtf_df['Transcript_Copy_Number'] = gtf_df[
'Transcript_Copy_Number'
].astype(int)
self.gtf_df = gtf_df

def write_fasta(self) -> None:
Expand All @@ -244,6 +247,7 @@ def write_csv(self) -> None:
"""
df_to_save = self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]]
df_to_save.to_csv(self.output_csv, index=False)
# Stop outputting header
df_to_save.to_csv(self.output_csv, index=False, header=False)
LOG.info("Copy number csv file successfully written to: %s",
self.output_csv)
22 changes: 18 additions & 4 deletions scRNAsim_toolz/cdna_generator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@ def main():
description="Generate cDNA sequences based on primer probabilities.",
)
parser.add_argument(
"-ifa", "--input_fasta", help="genome fasta file", required=True
"-ifa",
"--input_fasta",
help="genome fasta file",
required=True
)
parser.add_argument("-igtf", "--input_gtf", help="gtf file", required=True)
parser.add_argument(
"-ofa", "--output_fasta", help="output fasta file", required=True
"-igtf",
"--input_gtf",
help="gtf file",
required=True
)
parser.add_argument(
"-icpn",
Expand All @@ -39,7 +44,16 @@ def main():
required=True,
)
parser.add_argument(
"-ocsv", "--output_csv", help="output fasta file", required=True
"-ofa",
"--output_fasta",
help="output fasta file",
required=True
)
parser.add_argument(
"-ocsv",
"--output_csv",
help="output fasta file",
required=True
)
parser.add_argument(
'-v', '--version', action='version',
Expand Down
4 changes: 2 additions & 2 deletions scRNAsim_toolz/fragment_selector/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ def main():

logger.info("Writing batch %s sequences to %s...", i, args.output)
with open(args.output, 'a', encoding="utf-8") as out_file:
for line in term_frags:
out_file.write(f"{line}\n")
for i, line in enumerate(term_frags, 1):
out_file.write(f">Terminal fragment {i}\n{line}\n")


def file_validation(fasta_file: str,
Expand Down
57 changes: 37 additions & 20 deletions scRNAsim_toolz/priming_site_predictor/psp.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,45 @@ def create_pandas_df(self):
"""Create interaction df."""
interaction_list = self.create_list_from_output()
interaction_df = pd.DataFrame(interaction_list)
interaction_df['Number_of_interactions'] = int(0)
interaction_df['Interaction_Energy'] = float(0)
transcript = 3
energy = 5
# Add header row to interaction_df
interaction_df.columns = [
'Id',
'Query_name',
'Query_length',
'Target_name',
'Target_length',
'Accessibility_Energy',
'Hybridization_Energy',
'Interaction_Energy',
'Query_start_bp',
'Query_end_bp',
'Target start',
'Target end']
interaction_df['Number_of_binding_sites'] = int(0)
interaction_df['Binding_Energy'] = float(0)
transcript = 'Target_name'
energy = 'Accessibility_Energy'

for _ in interaction_df.index:
interaction_df['Number_of_interactions'] = interaction_df[
interaction_df['Number_of_binding_sites'] = interaction_df[
transcript
].apply(
lambda x: interaction_df[transcript].value_counts()[x]
)
interaction_df['Interaction_Energy'] = interaction_df[
interaction_df['Binding_Energy'] = interaction_df[
energy
].apply(self.calculate_energy)

LOG.info("Calculating normalised interaction energies...")
interaction_df['Normalised_interaction_energy'] = interaction_df[
'Interaction_Energy']/interaction_df['Number_of_interactions']
interaction_df['Binding_Probability'] = interaction_df[
'Binding_Energy']/interaction_df['Number_of_binding_sites']

# Round energy columns
column_indices = [5, 6, 7, 13, 14]
for index in column_indices:
interaction_df.iloc[:, index] = interaction_df.iloc[
:, index
].astype(float).round(2)

return interaction_df

Expand All @@ -101,19 +122,15 @@ def generate_gtf(self):
interaction_df = self.create_pandas_df()
result = str()

for index in interaction_df.index:
for _, row in interaction_df.iterrows():
result += (
str(interaction_df.iloc[:, 3][index])
+ '\tRIBlast\tPriming_site\t'
+ str(interaction_df.iloc[:, 13][index])
+ '\t'
+ str(interaction_df.iloc[:, 12][index])
+ '\t.\t+\t.\t'
+ 'Interaction_Energy' + '\t'
+ str(interaction_df[
"Normalised_interaction_energy"
][index])
+ '\n'
f'{row.iloc[3]}\tRIBlast\tPriming_site\t'
f'{row.iloc[10]}\t{row.iloc[11]}\t.\t+\t.\t'
'Accessibility_Energy ' + f'"{row.iloc[5]}"; '
'Hybridization_Energy ' + f'"{row.iloc[6]}"; '
'Interaction_Energy ' + f'"{row.iloc[7]}"; '
'Number_of_binding_sites ' + f'"{row.iloc[12]}"; '
'Binding_Probability ' + f'"{row.iloc[14]}"\n'
)

LOG.info("Generating output gtf file...")
Expand Down
7 changes: 7 additions & 0 deletions scRNAsim_toolz/structure_generator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,13 @@ def write_sequences(self, filename: str) -> None:
"""
ids, _, counts = self.get_unique_inclusions()
with open(filename, "a", encoding="utf_8") as file_handle:
# Add header to output csv for cdna-generator
if file_handle.tell() == 0:
file_handle.write(
"ID of transcript,ID of parent transcript,"
"Transcript copy number\n"
)

for transcript_id, transcript_count in zip(ids, counts):
file_handle.write(
f"{transcript_id},{self.ts_id},{transcript_count}\n"
Expand Down

0 comments on commit 1747d65

Please sign in to comment.