feat(130): Introduce SEQREPO_FD_CACHE_SIZE env var to override the in…

…ternal fd_cache_size to allow for increased performance without forcing code changes to any SeqRepo clients.
biocommons · Jan 9, 2024 · 8b70167 · 8b70167
1 parent 90e1b41
commit 8b70167
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -151,6 +151,10 @@ SEQREPO_LRU_CACHE_MAXSIZE sets the lru_cache maxsize for the sqlite
 query response caching. It defaults to 1 million but can also be set to
 "none" to be unlimited.
 
+SEQREPO_FD_CACHE_SIZE sets the lru_cache size for file handle caching during FASTA sequence retrievals.
+It defaults to 0 to disable any caching, but can be set to a specific value or "none" to be unlimited. Using
+a moderate value will greatly increase performance of sequence retrieval.
+
 ## Developing
 
 Here's how to get started developing:

diff --git a/src/biocommons/seqrepo/config.py b/src/biocommons/seqrepo/config.py
@@ -1,5 +1,7 @@
 import os
 
+SEQREPO_FD_CACHE_SIZE_ENV_NAME = "SEQREPO_FD_CACHE_SIZE"
+
 try:
     seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
     SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var)

diff --git a/src/biocommons/seqrepo/fastadir/fastadir.py b/src/biocommons/seqrepo/fastadir/fastadir.py
@@ -81,17 +81,18 @@ def __init__(self, root_dir, writeable=False, check_same_thread=True, fd_cache_s
                     schema_version, expected_schema_version
                 )
             )
-            
+
         if fd_cache_size == 0:
             _logger.info(f"File descriptor caching disabled")
-            def _open_for_reading(path):
-                _logger.debug("Opening for reading uncached: " + path)
-                return FabgzReader(path)
         else:
             _logger.warning(f"File descriptor caching enabled (size={fd_cache_size})")
-            @functools.lru_cache(maxsize=fd_cache_size)
-            def _open_for_reading(path):
-                return FabgzReader(path)
+
+        @functools.lru_cache(maxsize=fd_cache_size)
+        def _open_for_reading(path):
+            if fd_cache_size == 0:
+                _logger.debug("Opening for reading uncached: " + path)
+            return FabgzReader(path)
+
         self._open_for_reading = _open_for_reading
 
     def __del__(self):

diff --git a/src/biocommons/seqrepo/seqrepo.py b/src/biocommons/seqrepo/seqrepo.py
@@ -10,6 +10,7 @@
 from .config import SEQREPO_LRU_CACHE_MAXSIZE
 from .fastadir import FastaDir
 from .seqaliasdb import SeqAliasDB
+from .utils import resolve_fd_cache_size
 
 _logger = logging.getLogger(__name__)
 
@@ -123,7 +124,7 @@ def __init__(
             self._seq_path,
             writeable=self._writeable,
             check_same_thread=self._check_same_thread,
-            fd_cache_size=fd_cache_size
+            fd_cache_size=resolve_fd_cache_size(fd_cache_size)
         )
         self.aliases = SeqAliasDB(
             self._db_path,

diff --git a/src/biocommons/seqrepo/utils.py b/src/biocommons/seqrepo/utils.py
@@ -1,9 +1,42 @@
+import os
 import re
+from typing import Optional
+
+from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME
 
 ncbi_defline_re = re.compile(r"(?P<namespace>ref)\|(?P<alias>[^|]+)")
 invalid_alias_chars_re = re.compile(r"[^-+./_\w]")
 
 
+def resolve_fd_cache_size(internal_fd_cache_size: Optional[int]) -> Optional[int]:
+    """
+    Determines what the fd_cache_size should be set to. If the SEQREPO_FD_CACHE_SIZE env var
+    is set, that value takes priority, otherwise whatever passed into the SeqRepo init is used. If
+    nothing is set, it'll end up being 0. Setting this value helps performance of reading the
+    fasta files, but one must be careful of resource exhaustion.
+    Details:
+        0 - No cache at all
+        None - Unbounded caching
+        >=1 - Specific cache size
+    """
+    env_fd_cache_size = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME)
+    # If the env var is not set, use what is defined in the code
+    if env_fd_cache_size is None:
+        return internal_fd_cache_size
+
+    # Else parse out what is in the env var
+    if env_fd_cache_size.lower() == "none":
+        return None
+    try:
+        env_fd_cache_size_i = int(env_fd_cache_size)
+    except ValueError:
+        raise ValueError(
+            f"{SEQREPO_FD_CACHE_SIZE_ENV_NAME} must be a valid int, none, or not set, "
+            "currently it is " + env_fd_cache_size
+        )
+    return env_fd_cache_size_i
+
+
 def parse_defline(defline, namespace):
     """parse fasta defline, returning a list of zero or more dicts
     like [{namespace: , alias: }]

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,6 +1,35 @@
+import os
+
 import pytest
 
-from biocommons.seqrepo.utils import parse_defline, validate_aliases
+from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME
+from biocommons.seqrepo.utils import parse_defline, validate_aliases, resolve_fd_cache_size
+
+
+def test_resolve_fd_cache_size():
+    # Preserve any data for this env var before we try different values
+    orig_env = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME)
+    if orig_env:
+        del os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME]
+
+    instance_fd_cache_size = 10
+    # With no env var set, resolve_fd_cache_size should pass through its input value
+    assert resolve_fd_cache_size(instance_fd_cache_size) == instance_fd_cache_size
+    # Otherwise any env var will override the instance value
+    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "none"
+    assert resolve_fd_cache_size(instance_fd_cache_size) is None
+    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "100"
+    assert resolve_fd_cache_size(instance_fd_cache_size) == 100
+    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "0"
+    assert resolve_fd_cache_size(instance_fd_cache_size) == 0
+
+    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "foo"
+    with pytest.raises(ValueError):
+        assert resolve_fd_cache_size(instance_fd_cache_size)
+
+    # Restore original env var
+    if orig_env:
+        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = orig_env
 
 
 def test_parse_defline():