Skip to content

Commit

Permalink
fix: transcript sampler (#15)
Browse files Browse the repository at this point in the history
* feat: update ci

* feat: update envs

* feat: update ci

* feat: update ci

* feat: update ci

* feat: update ci

* feat: update ci

* feat: update ci

* feat: update ci

* update ci, remove dev env

* update ci, remove dev env

* refactor: major refactoring of transcript-sampler

* update and add test files
  • Loading branch information
balajtimate authored Oct 20, 2023
1 parent b8d858b commit e246915
Show file tree
Hide file tree
Showing 8 changed files with 601 additions and 283 deletions.
12 changes: 3 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,10 @@ jobs:
with:
mamba-version: "*"
auto-update-conda: true
activate-environment: scrnasim-toolz
activate-environment: "scrnasim-toolz"
environment-file: environment.yml
auto-activate-base: false

- name: Update scrnasim-toolz env with dev packages
run: mamba env update -n scrnasim-toolz -f environment-dev.yml


- name: Display environment info
run: |
conda info -a
Expand Down Expand Up @@ -62,13 +59,10 @@ jobs:
with:
mamba-version: "*"
auto-update-conda: true
activate-environment: scrnasim-toolz
activate-environment: "scrnasim-toolz"
environment-file: environment.yml
auto-activate-base: false

- name: Update scrnasim-toolz env with dev packages
run: mamba env update -n scrnasim-toolz -f environment-dev.yml

- name: Display environment info
run: |
conda info -a
Expand Down
20 changes: 0 additions & 20 deletions environment-dev.yml

This file was deleted.

10 changes: 9 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,16 @@ dependencies:
- polars==0.16.17
- gtfparse
- numpy>=1.23.3
- nextflow
- pandas>=1.4.4
- python>=3.6, <=3.10
- pyarrow
- black
- coverage
- flake8
- flake8-docstrings
- mypy
- pylint
- pytest
- pip:
- -e .
- '-e .'
2 changes: 1 addition & 1 deletion pylint.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[MESSAGES CONTROL]
disable=C0103,E1101,I1101,R0801
disable=C0103,E1101,I1101,R0801,R0914
141 changes: 44 additions & 97 deletions scRNAsim_toolz/transcript_sampler/find_reptrans.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Find representative transcripts."""
import logging
from typing import Union
import pandas as pd # type: ignore

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -95,125 +96,71 @@ def get_rep_trans(self, file_name: str) -> dict:
Raises:
ValueError: If an unexpected entry is encountered in the GTF file.
"""
# setting default variables
rep_transcripts: dict = {}
cur_g_id = ""
cur_t_id = ""
pot_best_trans: list = []
cur_best_trans: list = []
# [transcript_id, transcript_support_level, transcript_length]
cur_best_trans = ["", 100, 0]
result_list = []

with open(file_name, "r", encoding="utf-8") as file:
for line in file:
entry = line.split("\t")
entry = line.strip().split("\t")

# removes expected but unneeded entries
if len(entry) == 1 or entry[2] in [
"CDS", "stop_codon",
"five_prime_utr", "three_prime_utr",
"start_codon", "Selenocysteine"
]:
]:
continue

# this function turns the less organized part of the entry
# into a readable list
attributes = self.attributes_converter(entry[8])

# looking for and processing exons entries
if entry[2] == "exon":
if cur_g_id != attributes[1]:
LOG.error("Exon from an unexpected gene")
raise ValueError("Exon from an unexpected gene")
if (
self.find_in_attributes(
attributes, "transcript_id"
) != cur_t_id
):
LOG.error("Exon from an unexpected transcript")
raise ValueError("Exon from an unexpected transcript")

# adding the length of the exon to the appropriate list and
# checking for changes in best transcript
if pot_best_trans:
pot_best_trans[2] += int(entry[4]) - int(entry[3])
if pot_best_trans[2] > cur_best_trans[2]:
cur_best_trans = pot_best_trans
else:
cur_best_trans[2] += int(entry[4]) - int(entry[3])
# 1. GENE entries
if entry[2] == "gene":
cur_g_id = self.find_in_attributes(attributes, "gene_id")

# looking for and processing transcript entries
# 2. TRANSCRIPT entries
elif entry[2] == "transcript":
# verify that the gen is correct
if cur_g_id != attributes[1]:
LOG.error("Transcript from an unexpected gene")
raise ValueError("Transcript from an unexpected gene")

# finding the transcript id and the support level
cur_t_id = self.find_in_attributes(
attributes, "transcript_id"
)
t_supp_lvl: Union[int, str] = self.find_in_attributes(
tsl: Union[int, str] = self.find_in_attributes(
attributes, "transcript_support_level"
)

# If there is no transcript support level or the level is
# given as NA it is nomed as 100. else the transcript
# support level is turned into int
if t_supp_lvl == "NA":
t_supp_lvl = 100
else:
if isinstance(
t_supp_lvl, str
) and t_supp_lvl.isdigit():
t_supp_lvl = int(t_supp_lvl)
else:
t_supp_lvl = 100

# decides if the transcript has potential to become the
# representative transcript
if (
t_supp_lvl < cur_best_trans[1] or
cur_best_trans[0] == ""
):
cur_best_trans = [cur_t_id, t_supp_lvl, 0]
elif t_supp_lvl == cur_best_trans[1]:
pot_best_trans = [cur_t_id, t_supp_lvl, 0]

# looking for and processing gene entries
elif entry[2] == "gene":
# updating rep_transcripts dict
if cur_g_id in rep_transcripts:
if (rep_transcripts[cur_g_id][1] > cur_best_trans[1]
or (rep_transcripts[cur_g_id][1] ==
cur_best_trans[1]
and rep_transcripts[cur_g_id][2] <
cur_best_trans[2])):
rep_transcripts[cur_g_id] = cur_best_trans
if tsl == "NA":
tsl = 100
else:
rep_transcripts[cur_g_id] = cur_best_trans
tsl = int(tsl)

# updating cur_g_id and resetting cur_best_trans
cur_g_id = attributes[1]
cur_best_trans = ["", 100, 0]
result_list.append(
{"gene_id": cur_g_id, "transcript_id": cur_t_id,
"tsl": tsl, "exon_length": 0}
)

# raises an error for unidentifiable entries
else:
LOG.error("This entry could not be identified")
raise ValueError("This entry could not be identified")

# adding the final gene to the dictionary
if cur_g_id in rep_transcripts:
if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] or
(rep_transcripts[cur_g_id][1] == cur_best_trans[1] and
rep_transcripts[cur_g_id][2] < cur_best_trans[2])):
rep_transcripts[cur_g_id] = cur_best_trans
else:
rep_transcripts[cur_g_id] = cur_best_trans

del rep_transcripts[""]
rep_transcripts = self.reformat_reptrans(rep_transcripts)
return rep_transcripts
# EXON entries
elif entry[2] == "exon":
exon_length = int(entry[4]) - int(entry[3])

for item in result_list:
if (
item["gene_id"] == cur_g_id
and item["transcript_id"] == cur_t_id
):
item["exon_length"] += exon_length # type: ignore

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(result_list)

# Create dictionary of representative transcripts
sorted_df = df.sort_values(
by=['gene_id', 'tsl', 'exon_length'], ascending=[True, True, False]
)

# Keep the first row for each gene_id (best representative transcript)
best_rep_df = sorted_df.drop_duplicates(subset='gene_id', keep='first')

# Create a dictionary with gene_id as keys and transcript_id as values
best_rep_transcripts = best_rep_df.set_index(
'gene_id'
)['transcript_id'].to_dict()

return best_rep_transcripts

def gtf_file_writer(self, original_file: str,
rep_transcript_dict: dict, output_file: str):
Expand Down
5 changes: 4 additions & 1 deletion tests/transcript_sampler/inputs/expression.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
ENST00000472194,0.8914783511010855
ENST00000308647,1.0887715239511602
ENST00000442483,0.8381441606416928
ENST00000624431,0.8381441606416928
ENST00000511072,0.9145581387636652
ENST00000378391,1.5383487213482392
ENST00000514189,0.6293910822183929
ENST00000667159,1.1524934812301821
Loading

0 comments on commit e246915

Please sign in to comment.