diff --git a/data/transcripts.fasta.gz b/data/transcripts.fasta.gz index 2332507..67464d5 100644 Binary files a/data/transcripts.fasta.gz and b/data/transcripts.fasta.gz differ diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py index 15a8c83..5fcf945 100644 --- a/htsinfer/get_library_source.py +++ b/htsinfer/get_library_source.py @@ -164,7 +164,7 @@ def get_source( ) # process expression levels - tpm_df = self.get_source_expression( + counts_df = self.get_source_expression( kallisto_dir=kallisto_dir, ) @@ -173,7 +173,7 @@ def get_source( Path(self.out_dir) / f"library_source_{fastq.name}.json" ) LOGGER.debug(f"Writing results to file: {filename}") - tpm_df.to_json( + counts_df.to_json( filename, orient='split', index=False, @@ -182,13 +182,13 @@ def get_source( # validate results if validate_top_score( - vector=tpm_df['tpm'].to_list(), + vector=counts_df['est_counts'].to_list(), min_value=self.min_match_pct, min_ratio=self.min_freq_ratio, rev_sorted=True, accept_zero=True, ): - source.short_name, taxon_id = tpm_df.iloc[0]['source_ids'] + source.short_name, taxon_id = counts_df.iloc[0]['source_ids'] source.taxon_id = int(taxon_id) LOGGER.debug(f"Source: {source}") @@ -247,17 +247,17 @@ def run_kallisto_quantification( def get_source_expression( kallisto_dir: Path, ) -> DataFrame: - """Return percentages of total expression per read source. + """Return percentages of total estimated counts per read source. Args: kallisto_dir: Directory containing Kallisto quantification results. Returns: Data frame with columns `source_ids` (a tuple of source short name - and taxon identifier, e.g., `("hsapiens", 9606)`) and `tpm`, - signifying the percentages of total expression per read source. - The data frame is sorted by total expression in descending - order. + and taxon identifier, e.g., `("hsapiens", 9606)`) and + `est_counts`, signifying the percentages of total estimated + counts per read source. The data frame is sorted by total + estimated counts in descending order. Raises: FileProblem: Kallisto quantification results could not be @@ -283,9 +283,9 @@ def get_source_expression( ) # handle case where no alignments are found - dat.tpm.fillna(0, inplace=True) + dat.est_counts.fillna(0, inplace=True) - # aggregate expression by source identifiers + # aggregate counts by source identifiers dat[[ 'gene_symbol', 'gene_id', @@ -294,17 +294,17 @@ def get_source_expression( 'taxon_id' ]] = dat.target_id.str.split('|', n=4, expand=True) dat['source_ids'] = list(zip(dat.short_name, dat.taxon_id)) - total_tpm = dat.tpm.sum() - dat_agg = dat.groupby(['source_ids'])[['tpm']].agg('sum') + total_counts = dat.est_counts.sum() + dat_agg = dat.groupby(['source_ids'])[['est_counts']].agg('sum') dat_agg['source_ids'] = dat_agg.index dat_agg.reset_index(drop=True, inplace=True) # calculate percentages - if total_tpm != 0: - dat_agg.tpm = dat_agg.tpm / total_tpm * 100 + if total_counts != 0: + dat_agg.est_counts = dat_agg.est_counts / total_counts * 100 # return as dictionary - return dat_agg.sort_values(["tpm"], ascending=False) + return dat_agg.sort_values(["est_counts"], ascending=False) @staticmethod def get_source_name(