From 1d4a5be2fb857965bbef624ddc8f320c67211123 Mon Sep 17 00:00:00 2001 From: Kevin Kazmierczak Date: Tue, 9 Jan 2024 10:51:47 -0500 Subject: [PATCH] feat(130): Cleans up logic around env var parsing and changes env var name --- README.md | 4 ++-- src/biocommons/seqrepo/config.py | 22 +++++++++++++-------- src/biocommons/seqrepo/seqrepo.py | 5 ++--- src/biocommons/seqrepo/utils.py | 33 ------------------------------- tests/test_config.py | 6 ++++++ tests/test_utils.py | 31 +---------------------------- 6 files changed, 25 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 74a2a54..8a4eb00 100644 --- a/README.md +++ b/README.md @@ -151,9 +151,9 @@ SEQREPO_LRU_CACHE_MAXSIZE sets the lru_cache maxsize for the sqlite query response caching. It defaults to 1 million but can also be set to "none" to be unlimited. -SEQREPO_FD_CACHE_SIZE sets the lru_cache size for file handler caching during FASTA sequence retrievals. +SEQREPO_FD_CACHE_MAXSIZE sets the lru_cache size for file handler caching during FASTA sequence retrievals. It defaults to 0 to disable any caching, but can be set to a specific value or "none" to be unlimited. Using -a moderate value will greatly increase performance of sequence retrieval. +a moderate value (>10) will greatly increase performance of sequence retrieval. ## Developing diff --git a/src/biocommons/seqrepo/config.py b/src/biocommons/seqrepo/config.py index 33b612c..fb40b1f 100644 --- a/src/biocommons/seqrepo/config.py +++ b/src/biocommons/seqrepo/config.py @@ -1,15 +1,21 @@ import os -SEQREPO_FD_CACHE_SIZE_ENV_NAME = "SEQREPO_FD_CACHE_SIZE" -try: - seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000") - SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var) -except ValueError: +def parse_caching_env_var(env_name, env_default): + seqrepo_env_var = os.environ.get(env_name, env_default) if seqrepo_env_var.lower() == "none": - SEQREPO_LRU_CACHE_MAXSIZE = None - else: + return None + + try: + seqrepo_env_var_int = int(seqrepo_env_var) + except ValueError: raise ValueError( - "SEQREPO_LRU_CACHE_MAXSIZE must be a valid int, none, or not set, " + f"{env_name} must be a valid int, none, or not set, " "currently it is " + seqrepo_env_var ) + return seqrepo_env_var_int + + +SEQREPO_LRU_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_LRU_CACHE_MAXSIZE", "1000000") +# Using a default value here of -1 to differentiate not setting this value and an explicit None (unbounded cache) +SEQREPO_FD_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_FD_CACHE_MAXSIZE", "-1") diff --git a/src/biocommons/seqrepo/seqrepo.py b/src/biocommons/seqrepo/seqrepo.py index ed30c20..a5482b6 100644 --- a/src/biocommons/seqrepo/seqrepo.py +++ b/src/biocommons/seqrepo/seqrepo.py @@ -7,10 +7,9 @@ import bioutils.digests from bioutils.digests import seq_seqhash as sha512t24u -from .config import SEQREPO_LRU_CACHE_MAXSIZE +from .config import SEQREPO_LRU_CACHE_MAXSIZE, SEQREPO_FD_CACHE_MAXSIZE from .fastadir import FastaDir from .seqaliasdb import SeqAliasDB -from .utils import resolve_fd_cache_size _logger = logging.getLogger(__name__) @@ -124,7 +123,7 @@ def __init__( self._seq_path, writeable=self._writeable, check_same_thread=self._check_same_thread, - fd_cache_size=resolve_fd_cache_size(fd_cache_size) + fd_cache_size=SEQREPO_FD_CACHE_MAXSIZE if SEQREPO_FD_CACHE_MAXSIZE != -1 else fd_cache_size ) self.aliases = SeqAliasDB( self._db_path, diff --git a/src/biocommons/seqrepo/utils.py b/src/biocommons/seqrepo/utils.py index c137d60..bc6b8d1 100644 --- a/src/biocommons/seqrepo/utils.py +++ b/src/biocommons/seqrepo/utils.py @@ -1,42 +1,9 @@ -import os import re -from typing import Optional - -from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME ncbi_defline_re = re.compile(r"(?Pref)\|(?P[^|]+)") invalid_alias_chars_re = re.compile(r"[^-+./_\w]") -def resolve_fd_cache_size(internal_fd_cache_size: Optional[int]) -> Optional[int]: - """ - Determines what the fd_cache_size should be set to. If the SEQREPO_FD_CACHE_SIZE env var - is set, that value takes priority, otherwise whatever passed into the SeqRepo init is used. If - nothing is set, it'll end up being 0. Setting this value helps performance of reading the - fasta files, but one must be careful of resource exhaustion. - Details: - 0 - No cache at all - None - Unbounded caching - >=1 - Specific cache size - """ - env_fd_cache_size = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME) - # If the env var is not set, use what is defined in the code - if env_fd_cache_size is None: - return internal_fd_cache_size - - # Else parse out what is in the env var - if env_fd_cache_size.lower() == "none": - return None - try: - env_fd_cache_size_i = int(env_fd_cache_size) - except ValueError: - raise ValueError( - f"{SEQREPO_FD_CACHE_SIZE_ENV_NAME} must be a valid int, none, or not set, " - "currently it is " + env_fd_cache_size - ) - return env_fd_cache_size_i - - def parse_defline(defline, namespace): """parse fasta defline, returning a list of zero or more dicts like [{namespace: , alias: }] diff --git a/tests/test_config.py b/tests/test_config.py index f240eab..067f339 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -5,6 +5,12 @@ from biocommons.seqrepo import config +def test_SEQREPO_FD_CACHE_MAXSIZE_default(monkeypatch): + monkeypatch.delenv("SEQREPO_FD_CACHE_MAXSIZE", raising=False) + reload(config) + assert config.SEQREPO_FD_CACHE_MAXSIZE == -1 + + def test_SEQREPO_LRU_CACHE_MAXSIZE_default(monkeypatch): monkeypatch.delenv("SEQREPO_LRU_CACHE_MAXSIZE", raising=False) reload(config) diff --git a/tests/test_utils.py b/tests/test_utils.py index c21bbc7..48221ca 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,35 +1,6 @@ -import os - import pytest -from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME -from biocommons.seqrepo.utils import parse_defline, validate_aliases, resolve_fd_cache_size - - -def test_resolve_fd_cache_size(): - # Preserve any data for this env var before we try different values - orig_env = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME) - if orig_env: - del os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] - - instance_fd_cache_size = 10 - # With no env var set, resolve_fd_cache_size should pass through its input value - assert resolve_fd_cache_size(instance_fd_cache_size) == instance_fd_cache_size - # Otherwise any env var will override the instance value - os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "none" - assert resolve_fd_cache_size(instance_fd_cache_size) is None - os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "100" - assert resolve_fd_cache_size(instance_fd_cache_size) == 100 - os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "0" - assert resolve_fd_cache_size(instance_fd_cache_size) == 0 - - os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "foo" - with pytest.raises(ValueError): - assert resolve_fd_cache_size(instance_fd_cache_size) - - # Restore original env var - if orig_env: - os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = orig_env +from biocommons.seqrepo.utils import parse_defline, validate_aliases def test_parse_defline():