Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: cdna-generator input and fragment-selector output #19

Merged
merged 9 commits into from
Oct 25, 2023
11 changes: 0 additions & 11 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
18 changes: 11 additions & 7 deletions scRNAsim_toolz/cdna_generator/cdna.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,15 @@ def read_csv(self) -> None:
def read_gtf(self) -> None:
"""Read and process the GTF file.

Reads a GTF file and determines copy numbers from \
normalized probabilities.
Reads a GTF file and determines copy numbers from
normalized probabilities.

Returns: None

"""
# returns GTF with essential columns such as \
# returns GTF with essential columns such as
# "feature", "seqname", "start", "end"
# alongside the names of any optional keys \
# alongside the names of any optional keys
# which appeared in the attribute column
gtf_df = read_gtf(self.gtf)

Expand All @@ -204,15 +204,15 @@ def read_gtf(self) -> None:
count += 1
else:
count = 0 # reset count
# CVS transcript ID
# CSV transcript ID
id_csv = str(row["seqname"]).split("_")[1]
# Calculate Normalized_Binding_Probability and add to GTF dataframe
gtf_df.loc[index, "Normalized_Binding_Probability"] = (
row["Binding_Probability"] / df_norm_bind_prob[id_]
)
# Look up the transcript copy number for this CSV transcript ID
csv_transcript_copy_number = self.csv_df.loc[
self.csv_df["ID of transcript"] == int(id_csv),
self.csv_df.iloc[:, 1] == int(id_csv),
"Transcript copy number",
].iloc[0] # pop the first value in the frame
gtf_df.loc[index, "Transcript_Copy_Number"] = round(
Expand All @@ -222,6 +222,9 @@ def read_gtf(self) -> None:
gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}"
prev_id = id_

gtf_df['Transcript_Copy_Number'] = gtf_df[
'Transcript_Copy_Number'
].astype(int)
self.gtf_df = gtf_df

def write_fasta(self) -> None:
Expand All @@ -244,6 +247,7 @@ def write_csv(self) -> None:

"""
df_to_save = self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]]
df_to_save.to_csv(self.output_csv, index=False)
# Stop outputting header
df_to_save.to_csv(self.output_csv, index=False, header=False)
LOG.info("Copy number csv file successfully written to: %s",
self.output_csv)
22 changes: 18 additions & 4 deletions scRNAsim_toolz/cdna_generator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@ def main():
description="Generate cDNA sequences based on primer probabilities.",
)
parser.add_argument(
"-ifa", "--input_fasta", help="genome fasta file", required=True
"-ifa",
"--input_fasta",
help="genome fasta file",
required=True
)
parser.add_argument("-igtf", "--input_gtf", help="gtf file", required=True)
parser.add_argument(
"-ofa", "--output_fasta", help="output fasta file", required=True
"-igtf",
"--input_gtf",
help="gtf file",
required=True
)
parser.add_argument(
"-icpn",
Expand All @@ -39,7 +44,16 @@ def main():
required=True,
)
parser.add_argument(
"-ocsv", "--output_csv", help="output fasta file", required=True
"-ofa",
"--output_fasta",
help="output fasta file",
required=True
)
parser.add_argument(
"-ocsv",
"--output_csv",
help="output fasta file",
required=True
)
parser.add_argument(
'-v', '--version', action='version',
Expand Down
4 changes: 2 additions & 2 deletions scRNAsim_toolz/fragment_selector/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ def main():

logger.info("Writing batch %s sequences to %s...", i, args.output)
with open(args.output, 'a', encoding="utf-8") as out_file:
for line in term_frags:
out_file.write(f"{line}\n")
for i, line in enumerate(term_frags, 1):
out_file.write(f">Terminal fragment {i}\n{line}\n")


def file_validation(fasta_file: str,
Expand Down
57 changes: 37 additions & 20 deletions scRNAsim_toolz/priming_site_predictor/psp.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,45 @@ def create_pandas_df(self):
"""Create interaction df."""
interaction_list = self.create_list_from_output()
interaction_df = pd.DataFrame(interaction_list)
interaction_df['Number_of_interactions'] = int(0)
interaction_df['Interaction_Energy'] = float(0)
transcript = 3
energy = 5
# Add header row to interaction_df
interaction_df.columns = [
'Id',
'Query_name',
'Query_length',
'Target_name',
'Target_length',
'Accessibility_Energy',
'Hybridization_Energy',
'Interaction_Energy',
'Query_start_bp',
'Query_end_bp',
'Target start',
'Target end']
interaction_df['Number_of_binding_sites'] = int(0)
interaction_df['Binding_Energy'] = float(0)
transcript = 'Target_name'
energy = 'Accessibility_Energy'

for _ in interaction_df.index:
interaction_df['Number_of_interactions'] = interaction_df[
interaction_df['Number_of_binding_sites'] = interaction_df[
transcript
].apply(
lambda x: interaction_df[transcript].value_counts()[x]
)
interaction_df['Interaction_Energy'] = interaction_df[
interaction_df['Binding_Energy'] = interaction_df[
energy
].apply(self.calculate_energy)

LOG.info("Calculating normalised interaction energies...")
interaction_df['Normalised_interaction_energy'] = interaction_df[
'Interaction_Energy']/interaction_df['Number_of_interactions']
interaction_df['Binding_Probability'] = interaction_df[
'Binding_Energy']/interaction_df['Number_of_binding_sites']

# Round energy columns
column_indices = [5, 6, 7, 13, 14]
for index in column_indices:
interaction_df.iloc[:, index] = interaction_df.iloc[
:, index
].astype(float).round(2)

return interaction_df

Expand All @@ -101,19 +122,15 @@ def generate_gtf(self):
interaction_df = self.create_pandas_df()
result = str()

for index in interaction_df.index:
for _, row in interaction_df.iterrows():
result += (
str(interaction_df.iloc[:, 3][index])
+ '\tRIBlast\tPriming_site\t'
+ str(interaction_df.iloc[:, 13][index])
+ '\t'
+ str(interaction_df.iloc[:, 12][index])
+ '\t.\t+\t.\t'
+ 'Interaction_Energy' + '\t'
+ str(interaction_df[
"Normalised_interaction_energy"
][index])
+ '\n'
f'{row.iloc[3]}\tRIBlast\tPriming_site\t'
f'{row.iloc[10]}\t{row.iloc[11]}\t.\t+\t.\t'
'Accessibility_Energy ' + f'"{row.iloc[5]}"; '
'Hybridization_Energy ' + f'"{row.iloc[6]}"; '
'Interaction_Energy ' + f'"{row.iloc[7]}"; '
'Number_of_binding_sites ' + f'"{row.iloc[12]}"; '
'Binding_Probability ' + f'"{row.iloc[14]}"\n'
)

LOG.info("Generating output gtf file...")
Expand Down
7 changes: 7 additions & 0 deletions scRNAsim_toolz/structure_generator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,13 @@ def write_sequences(self, filename: str) -> None:
"""
ids, _, counts = self.get_unique_inclusions()
with open(filename, "a", encoding="utf_8") as file_handle:
# Add header to output csv for cdna-generator
if file_handle.tell() == 0:
file_handle.write(
"ID of transcript,ID of parent transcript,"
"Transcript copy number\n"
)

for transcript_id, transcript_count in zip(ids, counts):
file_handle.write(
f"{transcript_id},{self.ts_id},{transcript_count}\n"
Expand Down
Loading