Skip to content

Commit

Permalink
perf: use estimated counts instead of TPM (#170)
Browse files Browse the repository at this point in the history
* feat: use estimated counts instead of tpm for total expression

* final transcript list

* feat: final transcript list

* feat: final transcript list

* update docstrings for get_source_expression
  • Loading branch information
balajtimate authored Oct 29, 2024
1 parent a081c35 commit ef82690
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 16 deletions.
Binary file modified data/transcripts.fasta.gz
Binary file not shown.
32 changes: 16 additions & 16 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def get_source(
)

# process expression levels
tpm_df = self.get_source_expression(
counts_df = self.get_source_expression(
kallisto_dir=kallisto_dir,
)

Expand All @@ -173,7 +173,7 @@ def get_source(
Path(self.out_dir) / f"library_source_{fastq.name}.json"
)
LOGGER.debug(f"Writing results to file: {filename}")
tpm_df.to_json(
counts_df.to_json(
filename,
orient='split',
index=False,
Expand All @@ -182,13 +182,13 @@ def get_source(

# validate results
if validate_top_score(
vector=tpm_df['tpm'].to_list(),
vector=counts_df['est_counts'].to_list(),
min_value=self.min_match_pct,
min_ratio=self.min_freq_ratio,
rev_sorted=True,
accept_zero=True,
):
source.short_name, taxon_id = tpm_df.iloc[0]['source_ids']
source.short_name, taxon_id = counts_df.iloc[0]['source_ids']
source.taxon_id = int(taxon_id)

LOGGER.debug(f"Source: {source}")
Expand Down Expand Up @@ -247,17 +247,17 @@ def run_kallisto_quantification(
def get_source_expression(
kallisto_dir: Path,
) -> DataFrame:
"""Return percentages of total expression per read source.
"""Return percentages of total estimated counts per read source.
Args:
kallisto_dir: Directory containing Kallisto quantification results.
Returns:
Data frame with columns `source_ids` (a tuple of source short name
and taxon identifier, e.g., `("hsapiens", 9606)`) and `tpm`,
signifying the percentages of total expression per read source.
The data frame is sorted by total expression in descending
order.
and taxon identifier, e.g., `("hsapiens", 9606)`) and
`est_counts`, signifying the percentages of total estimated
counts per read source. The data frame is sorted by total
estimated counts in descending order.
Raises:
FileProblem: Kallisto quantification results could not be
Expand All @@ -283,9 +283,9 @@ def get_source_expression(
)

# handle case where no alignments are found
dat.tpm.fillna(0, inplace=True)
dat.est_counts.fillna(0, inplace=True)

# aggregate expression by source identifiers
# aggregate counts by source identifiers
dat[[
'gene_symbol',
'gene_id',
Expand All @@ -294,17 +294,17 @@ def get_source_expression(
'taxon_id'
]] = dat.target_id.str.split('|', n=4, expand=True)
dat['source_ids'] = list(zip(dat.short_name, dat.taxon_id))
total_tpm = dat.tpm.sum()
dat_agg = dat.groupby(['source_ids'])[['tpm']].agg('sum')
total_counts = dat.est_counts.sum()
dat_agg = dat.groupby(['source_ids'])[['est_counts']].agg('sum')
dat_agg['source_ids'] = dat_agg.index
dat_agg.reset_index(drop=True, inplace=True)

# calculate percentages
if total_tpm != 0:
dat_agg.tpm = dat_agg.tpm / total_tpm * 100
if total_counts != 0:
dat_agg.est_counts = dat_agg.est_counts / total_counts * 100

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)
return dat_agg.sort_values(["est_counts"], ascending=False)

@staticmethod
def get_source_name(
Expand Down

0 comments on commit ef82690

Please sign in to comment.