fix: transcript sampler (#15)

* feat: update ci * feat: update envs * feat: update ci * feat: update ci * feat: update ci * feat: update ci * feat: update ci * feat: update ci * feat: update ci * update ci, remove dev env * update ci, remove dev env * refactor: major refactoring of transcript-sampler * update and add test files
zavolanlab · Oct 20, 2023 · e246915 · e246915
1 parent b8d858b
commit e246915
Show file tree

Hide file tree

Showing 8 changed files with 601 additions and 283 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -25,13 +25,10 @@ jobs:
         with:
           mamba-version: "*"
           auto-update-conda: true
-          activate-environment: scrnasim-toolz
+          activate-environment: "scrnasim-toolz"
           environment-file: environment.yml
           auto-activate-base: false
-
-      - name: Update scrnasim-toolz env with dev packages
-        run: mamba env update -n scrnasim-toolz -f environment-dev.yml
-
+
       - name: Display environment info
         run: |
           conda info -a
@@ -62,13 +59,10 @@ jobs:
         with:
           mamba-version: "*"
           auto-update-conda: true
-          activate-environment: scrnasim-toolz
+          activate-environment: "scrnasim-toolz"
           environment-file: environment.yml
           auto-activate-base: false
 
-      - name: Update scrnasim-toolz env with dev packages
-        run: mamba env update -n scrnasim-toolz -f environment-dev.yml
-
       - name: Display environment info
         run: |
           conda info -a

diff --git a/environment-dev.yml b/environment-dev.yml
diff --git a/environment.yml b/environment.yml
@@ -8,8 +8,16 @@ dependencies:
   - polars==0.16.17
   - gtfparse
   - numpy>=1.23.3
+  - nextflow
   - pandas>=1.4.4
   - python>=3.6, <=3.10
   - pyarrow
+  - black
+  - coverage
+  - flake8
+  - flake8-docstrings
+  - mypy
+  - pylint
+  - pytest
   - pip:
-    - -e .
+    - '-e .'
diff --git a/pylint.cfg b/pylint.cfg
@@ -1,2 +1,2 @@
 [MESSAGES CONTROL]
-disable=C0103,E1101,I1101,R0801
+disable=C0103,E1101,I1101,R0801,R0914
diff --git a/scRNAsim_toolz/transcript_sampler/find_reptrans.py b/scRNAsim_toolz/transcript_sampler/find_reptrans.py
@@ -1,6 +1,7 @@
 """Find representative transcripts."""
 import logging
 from typing import Union
+import pandas as pd  # type: ignore
 
 LOG = logging.getLogger(__name__)
 
@@ -95,125 +96,71 @@ def get_rep_trans(self, file_name: str) -> dict:
         Raises:
             ValueError: If an unexpected entry is encountered in the GTF file.
         """
-        # setting default variables
-        rep_transcripts: dict = {}
-        cur_g_id = ""
-        cur_t_id = ""
-        pot_best_trans: list = []
-        cur_best_trans: list = []
-        # [transcript_id, transcript_support_level, transcript_length]
-        cur_best_trans = ["", 100, 0]
+        result_list = []
 
         with open(file_name, "r", encoding="utf-8") as file:
             for line in file:
-                entry = line.split("\t")
+                entry = line.strip().split("\t")
 
-                # removes expected but unneeded entries
                 if len(entry) == 1 or entry[2] in [
                     "CDS", "stop_codon",
                     "five_prime_utr", "three_prime_utr",
                     "start_codon", "Selenocysteine"
-                        ]:
+                ]:
                     continue
 
-                # this function turns the less organized part of the entry
-                # into a readable list
                 attributes = self.attributes_converter(entry[8])
 
-                # looking for and processing exons entries
-                if entry[2] == "exon":
-                    if cur_g_id != attributes[1]:
-                        LOG.error("Exon from an unexpected gene")
-                        raise ValueError("Exon from an unexpected gene")
-                    if (
-                        self.find_in_attributes(
-                            attributes, "transcript_id"
-                        ) != cur_t_id
-                    ):
-                        LOG.error("Exon from an unexpected transcript")
-                        raise ValueError("Exon from an unexpected transcript")
-
-                    # adding the length of the exon to the appropriate list and
-                    # checking for changes in best transcript
-                    if pot_best_trans:
-                        pot_best_trans[2] += int(entry[4]) - int(entry[3])
-                        if pot_best_trans[2] > cur_best_trans[2]:
-                            cur_best_trans = pot_best_trans
-                    else:
-                        cur_best_trans[2] += int(entry[4]) - int(entry[3])
+                # 1. GENE entries
+                if entry[2] == "gene":
+                    cur_g_id = self.find_in_attributes(attributes, "gene_id")
 
-                # looking for and processing transcript entries
+                # 2. TRANSCRIPT entries
                 elif entry[2] == "transcript":
-                    # verify that the gen is correct
-                    if cur_g_id != attributes[1]:
-                        LOG.error("Transcript from an unexpected gene")
-                        raise ValueError("Transcript from an unexpected gene")
-
-                    # finding the transcript id and the support level
                     cur_t_id = self.find_in_attributes(
                         attributes, "transcript_id"
                         )
-                    t_supp_lvl: Union[int, str] = self.find_in_attributes(
+                    tsl: Union[int, str] = self.find_in_attributes(
                         attributes, "transcript_support_level"
                         )
-
-                    # If there is no transcript support level or the level is
-                    # given as NA it is nomed as 100. else the transcript
-                    # support level is turned into int
-                    if t_supp_lvl == "NA":
-                        t_supp_lvl = 100
-                    else:
-                        if isinstance(
-                            t_supp_lvl, str
-                        ) and t_supp_lvl.isdigit():
-                            t_supp_lvl = int(t_supp_lvl)
-                        else:
-                            t_supp_lvl = 100
-
-                    # decides if the transcript has potential to become the
-                    # representative transcript
-                    if (
-                        t_supp_lvl < cur_best_trans[1] or
-                        cur_best_trans[0] == ""
-                    ):
-                        cur_best_trans = [cur_t_id, t_supp_lvl, 0]
-                    elif t_supp_lvl == cur_best_trans[1]:
-                        pot_best_trans = [cur_t_id, t_supp_lvl, 0]
-
-                # looking for and processing gene entries
-                elif entry[2] == "gene":
-                    # updating rep_transcripts dict
-                    if cur_g_id in rep_transcripts:
-                        if (rep_transcripts[cur_g_id][1] > cur_best_trans[1]
-                            or (rep_transcripts[cur_g_id][1] ==
-                                cur_best_trans[1]
-                                and rep_transcripts[cur_g_id][2] <
-                                cur_best_trans[2])):
-                            rep_transcripts[cur_g_id] = cur_best_trans
+                    if tsl == "NA":
+                        tsl = 100
                     else:
-                        rep_transcripts[cur_g_id] = cur_best_trans
+                        tsl = int(tsl)
 
-                    # updating cur_g_id and resetting cur_best_trans
-                    cur_g_id = attributes[1]
-                    cur_best_trans = ["", 100, 0]
+                    result_list.append(
+                        {"gene_id": cur_g_id, "transcript_id": cur_t_id,
+                         "tsl": tsl, "exon_length": 0}
+                    )
 
-                # raises an error for unidentifiable entries
-                else:
-                    LOG.error("This entry could not be identified")
-                    raise ValueError("This entry could not be identified")
-
-            # adding the final gene to the dictionary
-            if cur_g_id in rep_transcripts:
-                if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] or
-                        (rep_transcripts[cur_g_id][1] == cur_best_trans[1] and
-                         rep_transcripts[cur_g_id][2] < cur_best_trans[2])):
-                    rep_transcripts[cur_g_id] = cur_best_trans
-            else:
-                rep_transcripts[cur_g_id] = cur_best_trans
-
-            del rep_transcripts[""]
-            rep_transcripts = self.reformat_reptrans(rep_transcripts)
-            return rep_transcripts
+                # EXON entries
+                elif entry[2] == "exon":
+                    exon_length = int(entry[4]) - int(entry[3])
+
+                    for item in result_list:
+                        if (
+                            item["gene_id"] == cur_g_id
+                            and item["transcript_id"] == cur_t_id
+                        ):
+                            item["exon_length"] += exon_length  # type: ignore
+
+        # Convert the list of dictionaries into a DataFrame
+        df = pd.DataFrame(result_list)
+
+        # Create dictionary of representative transcripts
+        sorted_df = df.sort_values(
+            by=['gene_id', 'tsl', 'exon_length'], ascending=[True, True, False]
+            )
+
+        # Keep the first row for each gene_id (best representative transcript)
+        best_rep_df = sorted_df.drop_duplicates(subset='gene_id', keep='first')
+
+        # Create a dictionary with gene_id as keys and transcript_id as values
+        best_rep_transcripts = best_rep_df.set_index(
+            'gene_id'
+        )['transcript_id'].to_dict()
+
+        return best_rep_transcripts
 
     def gtf_file_writer(self, original_file: str,
                         rep_transcript_dict: dict, output_file: str):

diff --git a/tests/transcript_sampler/inputs/expression.csv b/tests/transcript_sampler/inputs/expression.csv
@@ -1,4 +1,7 @@
 ENST00000472194,0.8914783511010855
 ENST00000308647,1.0887715239511602
-ENST00000442483,0.8381441606416928
+ENST00000624431,0.8381441606416928
 ENST00000511072,0.9145581387636652
+ENST00000378391,1.5383487213482392
+ENST00000514189,0.6293910822183929
+ENST00000667159,1.1524934812301821