perf: use estimated counts instead of TPM (#170)

* feat: use estimated counts instead of tpm for total expression * final transcript list * feat: final transcript list * feat: final transcript list * update docstrings for get_source_expression
zavolanlab · Oct 29, 2024 · ef82690 · ef82690
1 parent a081c35
commit ef82690
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 16 deletions.
diff --git a/data/transcripts.fasta.gz b/data/transcripts.fasta.gz
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
@@ -164,7 +164,7 @@ def get_source(
         )
 
         # process expression levels
-        tpm_df = self.get_source_expression(
+        counts_df = self.get_source_expression(
             kallisto_dir=kallisto_dir,
         )
 
@@ -173,7 +173,7 @@ def get_source(
             Path(self.out_dir) / f"library_source_{fastq.name}.json"
         )
         LOGGER.debug(f"Writing results to file: {filename}")
-        tpm_df.to_json(
+        counts_df.to_json(
             filename,
             orient='split',
             index=False,
@@ -182,13 +182,13 @@ def get_source(
 
         # validate results
         if validate_top_score(
-            vector=tpm_df['tpm'].to_list(),
+            vector=counts_df['est_counts'].to_list(),
             min_value=self.min_match_pct,
             min_ratio=self.min_freq_ratio,
             rev_sorted=True,
             accept_zero=True,
         ):
-            source.short_name, taxon_id = tpm_df.iloc[0]['source_ids']
+            source.short_name, taxon_id = counts_df.iloc[0]['source_ids']
             source.taxon_id = int(taxon_id)
 
         LOGGER.debug(f"Source: {source}")
@@ -247,17 +247,17 @@ def run_kallisto_quantification(
     def get_source_expression(
         kallisto_dir: Path,
     ) -> DataFrame:
-        """Return percentages of total expression per read source.
+        """Return percentages of total estimated counts per read source.
 
         Args:
             kallisto_dir: Directory containing Kallisto quantification results.
 
         Returns:
             Data frame with columns `source_ids` (a tuple of source short name
-                and taxon identifier, e.g., `("hsapiens", 9606)`) and `tpm`,
-                signifying the percentages of total expression per read source.
-                The data frame is sorted by total expression in descending
-                order.
+                and taxon identifier, e.g., `("hsapiens", 9606)`) and
+                `est_counts`, signifying the percentages of total estimated
+                counts per read source. The data frame is sorted by total
+                estimated counts in descending order.
 
         Raises:
             FileProblem: Kallisto quantification results could not be
@@ -283,9 +283,9 @@ def get_source_expression(
             )
 
         # handle case where no alignments are found
-        dat.tpm.fillna(0, inplace=True)
+        dat.est_counts.fillna(0, inplace=True)
 
-        # aggregate expression by source identifiers
+        # aggregate counts by source identifiers
         dat[[
             'gene_symbol',
             'gene_id',
@@ -294,17 +294,17 @@ def get_source_expression(
             'taxon_id'
         ]] = dat.target_id.str.split('|', n=4, expand=True)
         dat['source_ids'] = list(zip(dat.short_name, dat.taxon_id))
-        total_tpm = dat.tpm.sum()
-        dat_agg = dat.groupby(['source_ids'])[['tpm']].agg('sum')
+        total_counts = dat.est_counts.sum()
+        dat_agg = dat.groupby(['source_ids'])[['est_counts']].agg('sum')
         dat_agg['source_ids'] = dat_agg.index
         dat_agg.reset_index(drop=True, inplace=True)
 
         # calculate percentages
-        if total_tpm != 0:
-            dat_agg.tpm = dat_agg.tpm / total_tpm * 100
+        if total_counts != 0:
+            dat_agg.est_counts = dat_agg.est_counts / total_counts * 100
 
         # return as dictionary
-        return dat_agg.sort_values(["tpm"], ascending=False)
+        return dat_agg.sort_values(["est_counts"], ascending=False)
 
     @staticmethod
     def get_source_name(