From 652923daf4346b59f4ebbe61649853c1e3f1619e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <mate.balajti@unibas.ch>
Date: Tue, 14 Nov 2023 17:28:48 +0100
Subject: [PATCH] refactor get_library_source

---
 htsinfer/exceptions.py         |  6 +++++
 htsinfer/get_library_source.py | 47 ++++++++++------------------------
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/htsinfer/exceptions.py b/htsinfer/exceptions.py
index e40278c3..f526841a 100644
--- a/htsinfer/exceptions.py
+++ b/htsinfer/exceptions.py
@@ -44,3 +44,9 @@ class TranscriptsFastaProblem(Exception):
 
 class CutadaptProblem(Exception):
     """Exception raised when running cutadapt commands."""
+
+
+class UnsupportedSampleSourceException(Exception):
+    """Exception raised when taxonomy ID is not found in the source
+    organism list.
+    """
diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
index 3fb70f84..25eecb51 100644
--- a/htsinfer/get_library_source.py
+++ b/htsinfer/get_library_source.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 import subprocess as sp
 import tempfile
-from typing import Optional
 
 from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
@@ -14,6 +13,7 @@
     FileProblem,
     KallistoProblem,
     TranscriptsFastaProblem,
+    UnsupportedSampleSourceException,
 )
 from htsinfer.models import (
     ResultsSource,
@@ -82,34 +82,11 @@ def evaluate(self) -> ResultsSource:
                 self.tax_id,
                 self.transcripts_file
             )
+            source.file_1.short_name = src_name
 
-            if src_name is not None:
-                source.file_1.short_name = src_name
-
-                if self.paths[1] is not None:
-                    source.file_2.taxon_id = self.tax_id
-                    source.file_2.short_name = source.file_1.short_name
-
-            else:
-                LOGGER.warning(
-                    f"Taxon ID '{self.tax_id}' not found in "
-                    "organism dictionary, inferring source organism..."
-                )
-                index = self.create_kallisto_index()
-                library_source = self.get_source(
-                    fastq=self.paths[0],
-                    index=index,
-                )
-                source.file_1.short_name = library_source.short_name
-                source.file_1.taxon_id = library_source.taxon_id
-
-                if self.paths[1] is not None:
-                    library_source = self.get_source(
-                        fastq=self.paths[1],
-                        index=index,
-                    )
-                    source.file_2.short_name = library_source.short_name
-                    source.file_2.taxon_id = library_source.taxon_id
+            if self.paths[1] is not None:
+                source.file_2.taxon_id = self.tax_id
+                source.file_2.short_name = source.file_1.short_name
 
         else:
             index = self.create_kallisto_index()
@@ -333,7 +310,7 @@ def get_source_expression(
     def get_source_name(
         taxon_id: int,
         transcripts_file: Path,
-    ) -> Optional[str]:
+    ) -> str:
         """Return name of the source organism, based on tax ID.
 
         Args:
@@ -344,10 +321,11 @@ def get_source_name(
             Short name of the organism belonging to the given tax ID.
 
         Raises:
-            Could not process input FASTA file.
+            FileProblem: Could not process input FASTA file.
+            UnsupportedSampleSourceException: Taxon ID is not supported.
         """
         src_dict = {}
-        # Construct dictionary of taxonomy ID's and short names
+
         try:
             for record in list(SeqIO.parse(
                     handle=transcripts_file,
@@ -363,7 +341,10 @@ def get_source_name(
                 f"Could not process file '{transcripts_file}'"
             ) from exc
 
-        if taxon_id in src_dict:
+        try:
             return src_dict[taxon_id]
 
-        return None
+        except KeyError as exc:
+            raise UnsupportedSampleSourceException(
+                f'Taxon ID "{taxon_id}" is not supported by HTSinfer.'
+            ) from exc