Skip to content

Commit

Permalink
feat(130): Cleans up logic around env var parsing and changes env var…
Browse files Browse the repository at this point in the history
… name
  • Loading branch information
kazmiekr committed Jan 9, 2024
1 parent 8b70167 commit 1d4a5be
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 76 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,9 @@ SEQREPO_LRU_CACHE_MAXSIZE sets the lru_cache maxsize for the sqlite
query response caching. It defaults to 1 million but can also be set to
"none" to be unlimited.

SEQREPO_FD_CACHE_SIZE sets the lru_cache size for file handler caching during FASTA sequence retrievals.
SEQREPO_FD_CACHE_MAXSIZE sets the lru_cache size for file handle caching during FASTA sequence retrievals.
It defaults to 0 to disable any caching, but can be set to a specific value or "none" to be unlimited. Using
a moderate value will greatly increase performance of sequence retrieval.
a moderate value (>10) will greatly increase performance of sequence retrieval.

## Developing

Expand Down
22 changes: 14 additions & 8 deletions src/biocommons/seqrepo/config.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
import os

SEQREPO_FD_CACHE_SIZE_ENV_NAME = "SEQREPO_FD_CACHE_SIZE"

try:
seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var)
except ValueError:
def parse_caching_env_var(env_name, env_default):
    """Read a cache-size setting from the environment.

    Args:
        env_name: Name of the environment variable to read.
        env_default: String value used when the variable is not set.

    Returns:
        None when the value is "none" (compared case-insensitively),
        otherwise the value converted to an int.

    Raises:
        ValueError: If the value is neither "none" nor a valid int.
    """
    raw_value = os.environ.get(env_name, env_default)
    if raw_value.lower() == "none":
        return None

    try:
        return int(raw_value)
    except ValueError as err:
        # Chain the original conversion error so the traceback shows the root cause.
        raise ValueError(
            f"{env_name} must be a valid int, none, or not set, "
            f"currently it is {raw_value}"
        ) from err


# Parsed once at import time: defaults to 1000000 when the env var is unset, and is None when it is set to "none".
SEQREPO_LRU_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
# The default of -1 acts as a sentinel for "env var not set", which keeps that case
# distinguishable from an explicit None (unbounded cache).
SEQREPO_FD_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_FD_CACHE_MAXSIZE", "-1")
5 changes: 2 additions & 3 deletions src/biocommons/seqrepo/seqrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
import bioutils.digests
from bioutils.digests import seq_seqhash as sha512t24u

from .config import SEQREPO_LRU_CACHE_MAXSIZE
from .config import SEQREPO_LRU_CACHE_MAXSIZE, SEQREPO_FD_CACHE_MAXSIZE
from .fastadir import FastaDir
from .seqaliasdb import SeqAliasDB
from .utils import resolve_fd_cache_size

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -124,7 +123,7 @@ def __init__(
self._seq_path,
writeable=self._writeable,
check_same_thread=self._check_same_thread,
fd_cache_size=resolve_fd_cache_size(fd_cache_size)
fd_cache_size=SEQREPO_FD_CACHE_MAXSIZE if SEQREPO_FD_CACHE_MAXSIZE != -1 else fd_cache_size
)
self.aliases = SeqAliasDB(
self._db_path,
Expand Down
33 changes: 0 additions & 33 deletions src/biocommons/seqrepo/utils.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,9 @@
import os
import re
from typing import Optional

from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME

ncbi_defline_re = re.compile(r"(?P<namespace>ref)\|(?P<alias>[^|]+)")
invalid_alias_chars_re = re.compile(r"[^-+./_\w]")


def resolve_fd_cache_size(internal_fd_cache_size: Optional[int]) -> Optional[int]:
    """
    Determine what the fd_cache_size should be set to.

    If the SEQREPO_FD_CACHE_SIZE env var is set, that value takes priority;
    otherwise the value passed in (the one given to the SeqRepo init) is
    returned unchanged. Setting this value helps performance of reading the
    fasta files, but one must be careful of resource exhaustion.

    Meaning of the resulting value (as documented for the fd cache;
    the interpretation happens in the consumer, not here):
        0    - No cache at all
        None - Unbounded caching
        >=1  - Specific cache size

    Args:
        internal_fd_cache_size: Fallback used when the env var is not set.

    Returns:
        None when the env var is "none" (case-insensitive), the env var
        converted to int when it is set, else ``internal_fd_cache_size``.

    Raises:
        ValueError: If the env var is set to something other than "none"
            or a valid int.
    """
    env_fd_cache_size = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME)
    # If the env var is not set, use what is defined in the code
    if env_fd_cache_size is None:
        return internal_fd_cache_size

    # Else parse out what is in the env var
    if env_fd_cache_size.lower() == "none":
        return None
    try:
        return int(env_fd_cache_size)
    except ValueError as err:
        # Chain the conversion error so the root cause stays in the traceback.
        raise ValueError(
            f"{SEQREPO_FD_CACHE_SIZE_ENV_NAME} must be a valid int, none, or not set, "
            f"currently it is {env_fd_cache_size}"
        ) from err


def parse_defline(defline, namespace):
"""parse fasta defline, returning a list of zero or more dicts
like [{namespace: , alias: }]
Expand Down
6 changes: 6 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
from biocommons.seqrepo import config


def test_SEQREPO_FD_CACHE_MAXSIZE_default(monkeypatch):
    """With the env var absent, the config value is the -1 default."""
    # Make sure the variable is unset, then reload so the module-level
    # parsing runs again against the patched environment.
    monkeypatch.delenv("SEQREPO_FD_CACHE_MAXSIZE", raising=False)
    reload(config)
    observed = config.SEQREPO_FD_CACHE_MAXSIZE
    assert observed == -1


def test_SEQREPO_LRU_CACHE_MAXSIZE_default(monkeypatch):
monkeypatch.delenv("SEQREPO_LRU_CACHE_MAXSIZE", raising=False)
reload(config)
Expand Down
31 changes: 1 addition & 30 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,6 @@
import os

import pytest

from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME
from biocommons.seqrepo.utils import parse_defline, validate_aliases, resolve_fd_cache_size


def test_resolve_fd_cache_size():
    """Check env-var precedence and parsing in resolve_fd_cache_size.

    The environment variable is removed for the duration of the test and put
    back exactly as it was (or removed again if it was unset) in a ``finally``
    block, so a failing assertion cannot leak state into other tests.
    """
    # Pop rather than get: this both records the prior value (None when unset,
    # including distinguishing it from an empty string) and clears it.
    orig_env = os.environ.pop(SEQREPO_FD_CACHE_SIZE_ENV_NAME, None)

    instance_fd_cache_size = 10
    try:
        # With no env var set, resolve_fd_cache_size should pass through its input value
        assert resolve_fd_cache_size(instance_fd_cache_size) == instance_fd_cache_size

        # Otherwise any env var will override the instance value
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "none"
        assert resolve_fd_cache_size(instance_fd_cache_size) is None
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "100"
        assert resolve_fd_cache_size(instance_fd_cache_size) == 100
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "0"
        assert resolve_fd_cache_size(instance_fd_cache_size) == 0

        # Anything that is neither "none" nor an int must be rejected
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "foo"
        with pytest.raises(ValueError):
            resolve_fd_cache_size(instance_fd_cache_size)
    finally:
        # Restore the original environment, removing the test value if the
        # variable was not set before the test started.
        if orig_env is None:
            os.environ.pop(SEQREPO_FD_CACHE_SIZE_ENV_NAME, None)
        else:
            os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = orig_env
from biocommons.seqrepo.utils import parse_defline, validate_aliases


def test_parse_defline():
Expand Down

0 comments on commit 1d4a5be

Please sign in to comment.