From 1d4a5be2fb857965bbef624ddc8f320c67211123 Mon Sep 17 00:00:00 2001
From: Kevin Kazmierczak <kevin.kazmierczak@invitae.com>
Date: Tue, 9 Jan 2024 10:51:47 -0500
Subject: [PATCH] feat(130): Cleans up logic around env var parsing and changes
 env var name

---
 README.md                         |  4 ++--
 src/biocommons/seqrepo/config.py  | 22 +++++++++++++--------
 src/biocommons/seqrepo/seqrepo.py |  5 ++---
 src/biocommons/seqrepo/utils.py   | 33 -------------------------------
 tests/test_config.py              |  6 ++++++
 tests/test_utils.py               | 31 +----------------------------
 6 files changed, 25 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index 74a2a54..8a4eb00 100644
--- a/README.md
+++ b/README.md
@@ -151,9 +151,9 @@ SEQREPO_LRU_CACHE_MAXSIZE sets the lru_cache maxsize for the sqlite
 query response caching. It defaults to 1 million but can also be set to
 "none" to be unlimited.
 
-SEQREPO_FD_CACHE_SIZE sets the lru_cache size for file handler caching during FASTA sequence retrievals. 
+SEQREPO_FD_CACHE_MAXSIZE sets the lru_cache size for file handler caching during FASTA sequence retrievals. 
 It defaults to 0 to disable any caching, but can be set to a specific value or "none" to be unlimited. Using 
-a moderate value will greatly increase performance of sequence retrieval.
+a moderate value (>10) will greatly increase performance of sequence retrieval.
 
 ## Developing
 
diff --git a/src/biocommons/seqrepo/config.py b/src/biocommons/seqrepo/config.py
index 33b612c..fb40b1f 100644
--- a/src/biocommons/seqrepo/config.py
+++ b/src/biocommons/seqrepo/config.py
@@ -1,15 +1,21 @@
 import os
 
-SEQREPO_FD_CACHE_SIZE_ENV_NAME = "SEQREPO_FD_CACHE_SIZE"
 
-try:
-    seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
-    SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var)
-except ValueError:
+def parse_caching_env_var(env_name, env_default):
+    seqrepo_env_var = os.environ.get(env_name, env_default)
     if seqrepo_env_var.lower() == "none":
-        SEQREPO_LRU_CACHE_MAXSIZE = None
-    else:
+        return None
+
+    try:
+        seqrepo_env_var_int = int(seqrepo_env_var)
+    except ValueError:
         raise ValueError(
-            "SEQREPO_LRU_CACHE_MAXSIZE must be a valid int, none, or not set, "
+            f"{env_name} must be a valid int, none, or not set, "
             "currently it is " + seqrepo_env_var
         )
+    return seqrepo_env_var_int
+
+
+SEQREPO_LRU_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
+# Using a default value here of -1 to differentiate not setting this value and an explicit None (unbounded cache)
+SEQREPO_FD_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_FD_CACHE_MAXSIZE", "-1")
diff --git a/src/biocommons/seqrepo/seqrepo.py b/src/biocommons/seqrepo/seqrepo.py
index ed30c20..a5482b6 100644
--- a/src/biocommons/seqrepo/seqrepo.py
+++ b/src/biocommons/seqrepo/seqrepo.py
@@ -7,10 +7,9 @@
 import bioutils.digests
 from bioutils.digests import seq_seqhash as sha512t24u
 
-from .config import SEQREPO_LRU_CACHE_MAXSIZE
+from .config import SEQREPO_LRU_CACHE_MAXSIZE, SEQREPO_FD_CACHE_MAXSIZE
 from .fastadir import FastaDir
 from .seqaliasdb import SeqAliasDB
-from .utils import resolve_fd_cache_size
 
 _logger = logging.getLogger(__name__)
 
@@ -124,7 +123,7 @@ def __init__(
             self._seq_path,
             writeable=self._writeable,
             check_same_thread=self._check_same_thread,
-            fd_cache_size=resolve_fd_cache_size(fd_cache_size)
+            fd_cache_size=SEQREPO_FD_CACHE_MAXSIZE if SEQREPO_FD_CACHE_MAXSIZE != -1 else fd_cache_size
         )
         self.aliases = SeqAliasDB(
             self._db_path,
diff --git a/src/biocommons/seqrepo/utils.py b/src/biocommons/seqrepo/utils.py
index c137d60..bc6b8d1 100644
--- a/src/biocommons/seqrepo/utils.py
+++ b/src/biocommons/seqrepo/utils.py
@@ -1,42 +1,9 @@
-import os
 import re
-from typing import Optional
-
-from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME
 
 ncbi_defline_re = re.compile(r"(?P<namespace>ref)\|(?P<alias>[^|]+)")
 invalid_alias_chars_re = re.compile(r"[^-+./_\w]")
 
 
-def resolve_fd_cache_size(internal_fd_cache_size: Optional[int]) -> Optional[int]:
-    """
-    Determines what the fd_cache_size should be set to. If the SEQREPO_FD_CACHE_SIZE env var
-    is set, that value takes priority, otherwise whatever passed into the SeqRepo init is used. If
-    nothing is set, it'll end up being 0. Setting this value helps performance of reading the
-    fasta files, but one must be careful of resource exhaustion.
-    Details:
-        0 - No cache at all
-        None - Unbounded caching
-        >=1 - Specific cache size
-    """
-    env_fd_cache_size = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME)
-    # If the env var is not set, use what is defined in the code
-    if env_fd_cache_size is None:
-        return internal_fd_cache_size
-
-    # Else parse out what is in the env var
-    if env_fd_cache_size.lower() == "none":
-        return None
-    try:
-        env_fd_cache_size_i = int(env_fd_cache_size)
-    except ValueError:
-        raise ValueError(
-            f"{SEQREPO_FD_CACHE_SIZE_ENV_NAME} must be a valid int, none, or not set, "
-            "currently it is " + env_fd_cache_size
-        )
-    return env_fd_cache_size_i
-
-
 def parse_defline(defline, namespace):
     """parse fasta defline, returning a list of zero or more dicts
     like [{namespace: , alias: }]
diff --git a/tests/test_config.py b/tests/test_config.py
index f240eab..067f339 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -5,6 +5,12 @@
 from biocommons.seqrepo import config
 
 
+def test_SEQREPO_FD_CACHE_MAXSIZE_default(monkeypatch):
+    monkeypatch.delenv("SEQREPO_FD_CACHE_MAXSIZE", raising=False)
+    reload(config)
+    assert config.SEQREPO_FD_CACHE_MAXSIZE == -1
+
+
 def test_SEQREPO_LRU_CACHE_MAXSIZE_default(monkeypatch):
     monkeypatch.delenv("SEQREPO_LRU_CACHE_MAXSIZE", raising=False)
     reload(config)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c21bbc7..48221ca 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,35 +1,6 @@
-import os
-
 import pytest
 
-from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME
-from biocommons.seqrepo.utils import parse_defline, validate_aliases, resolve_fd_cache_size
-
-
-def test_resolve_fd_cache_size():
-    # Preserve any data for this env var before we try different values
-    orig_env = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME)
-    if orig_env:
-        del os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME]
-
-    instance_fd_cache_size = 10
-    # With no env var set, resolve_fd_cache_size should pass through its input value
-    assert resolve_fd_cache_size(instance_fd_cache_size) == instance_fd_cache_size
-    # Otherwise any env var will override the instance value
-    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "none"
-    assert resolve_fd_cache_size(instance_fd_cache_size) is None
-    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "100"
-    assert resolve_fd_cache_size(instance_fd_cache_size) == 100
-    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "0"
-    assert resolve_fd_cache_size(instance_fd_cache_size) == 0
-
-    os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "foo"
-    with pytest.raises(ValueError):
-        assert resolve_fd_cache_size(instance_fd_cache_size)
-
-    # Restore original env var
-    if orig_env:
-        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = orig_env
+from biocommons.seqrepo.utils import parse_defline, validate_aliases
 
 
 def test_parse_defline():