Skip to content

Commit

Permalink
feat(130): Introduce SEQREPO_FD_CACHE_SIZE env var to override the in…
Browse files Browse the repository at this point in the history
…ternal fd_cache_size to allow for increased performance without forcing code changes to any SeqRepo clients.
  • Loading branch information
kazmiekr committed Jan 9, 2024
1 parent 90e1b41 commit 8b70167
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 9 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ SEQREPO_LRU_CACHE_MAXSIZE sets the lru_cache maxsize for the sqlite
query response caching. It defaults to 1 million but can also be set to
"none" to be unlimited.

SEQREPO_FD_CACHE_SIZE sets the lru_cache size for file handle caching during FASTA sequence retrievals.
It defaults to 0, which disables caching, but can be set to a specific value or to "none" for an unlimited cache. Using
a moderate value will greatly increase the performance of sequence retrieval.

## Developing

Here's how to get started developing:
Expand Down
2 changes: 2 additions & 0 deletions src/biocommons/seqrepo/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os

SEQREPO_FD_CACHE_SIZE_ENV_NAME = "SEQREPO_FD_CACHE_SIZE"

try:
seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var)
Expand Down
15 changes: 8 additions & 7 deletions src/biocommons/seqrepo/fastadir/fastadir.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,18 @@ def __init__(self, root_dir, writeable=False, check_same_thread=True, fd_cache_s
schema_version, expected_schema_version
)
)

if fd_cache_size == 0:
_logger.info(f"File descriptor caching disabled")
def _open_for_reading(path):
_logger.debug("Opening for reading uncached: " + path)
return FabgzReader(path)
else:
_logger.warning(f"File descriptor caching enabled (size={fd_cache_size})")
@functools.lru_cache(maxsize=fd_cache_size)
def _open_for_reading(path):
return FabgzReader(path)

@functools.lru_cache(maxsize=fd_cache_size)
def _open_for_reading(path):
if fd_cache_size == 0:
_logger.debug("Opening for reading uncached: " + path)
return FabgzReader(path)

self._open_for_reading = _open_for_reading

def __del__(self):
Expand Down
3 changes: 2 additions & 1 deletion src/biocommons/seqrepo/seqrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .config import SEQREPO_LRU_CACHE_MAXSIZE
from .fastadir import FastaDir
from .seqaliasdb import SeqAliasDB
from .utils import resolve_fd_cache_size

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -123,7 +124,7 @@ def __init__(
self._seq_path,
writeable=self._writeable,
check_same_thread=self._check_same_thread,
fd_cache_size=fd_cache_size
fd_cache_size=resolve_fd_cache_size(fd_cache_size)
)
self.aliases = SeqAliasDB(
self._db_path,
Expand Down
33 changes: 33 additions & 0 deletions src/biocommons/seqrepo/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,42 @@
import os
import re
from typing import Optional

from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME

ncbi_defline_re = re.compile(r"(?P<namespace>ref)\|(?P<alias>[^|]+)")
invalid_alias_chars_re = re.compile(r"[^-+./_\w]")


def resolve_fd_cache_size(internal_fd_cache_size: Optional[int]) -> Optional[int]:
    """Determine what the fd_cache_size should be set to.

    If the SEQREPO_FD_CACHE_SIZE env var is set, that value takes priority;
    otherwise whatever was passed into the SeqRepo init is returned unchanged.
    Setting this value helps performance of reading the fasta files, but one
    must be careful of resource exhaustion.

    Meaning of the resolved value:
        0    - No cache at all
        None - Unbounded caching
        >=1  - Specific cache size

    Args:
        internal_fd_cache_size: Value defined in code; used only when the
            env var is not set.

    Returns:
        The cache size to use, or None for an unbounded cache.

    Raises:
        ValueError: If the env var is set to something that is neither an
            integer nor "none" (case-insensitive, surrounding whitespace
            ignored).
    """
    env_fd_cache_size = os.environ.get(SEQREPO_FD_CACHE_SIZE_ENV_NAME)
    # If the env var is not set, use what is defined in the code
    if env_fd_cache_size is None:
        return internal_fd_cache_size

    # Else parse out what is in the env var. int() already tolerates
    # surrounding whitespace, so give "none" the same leniency.
    if env_fd_cache_size.strip().lower() == "none":
        return None
    try:
        return int(env_fd_cache_size)
    except ValueError as err:
        # Chain explicitly so the traceback shows the parse failure as the cause
        raise ValueError(
            f"{SEQREPO_FD_CACHE_SIZE_ENV_NAME} must be a valid int, none, or not set, "
            "currently it is " + env_fd_cache_size
        ) from err


def parse_defline(defline, namespace):
"""parse fasta defline, returning a list of zero or more dicts
like [{namespace: , alias: }]
Expand Down
31 changes: 30 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,35 @@
import os

import pytest

from biocommons.seqrepo.utils import parse_defline, validate_aliases
from biocommons.seqrepo.config import SEQREPO_FD_CACHE_SIZE_ENV_NAME
from biocommons.seqrepo.utils import parse_defline, validate_aliases, resolve_fd_cache_size


def test_resolve_fd_cache_size():
    """Check that the SEQREPO_FD_CACHE_SIZE env var overrides the in-code value."""
    # Preserve any data for this env var before we try different values.
    # pop() both remembers and clears it; None means "was not set at all",
    # which is distinct from an empty string.
    orig_env = os.environ.pop(SEQREPO_FD_CACHE_SIZE_ENV_NAME, None)
    instance_fd_cache_size = 10

    try:
        # With no env var set, resolve_fd_cache_size should pass through its input value
        assert resolve_fd_cache_size(instance_fd_cache_size) == instance_fd_cache_size

        # Otherwise any env var will override the instance value
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "none"
        assert resolve_fd_cache_size(instance_fd_cache_size) is None
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "100"
        assert resolve_fd_cache_size(instance_fd_cache_size) == 100
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "0"
        assert resolve_fd_cache_size(instance_fd_cache_size) == 0

        # Anything that is neither an int nor "none" must be rejected
        os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = "foo"
        with pytest.raises(ValueError):
            resolve_fd_cache_size(instance_fd_cache_size)
    finally:
        # Restore the original environment even if an assertion failed, so
        # the bogus "foo" value never leaks into other tests.
        if orig_env is None:
            os.environ.pop(SEQREPO_FD_CACHE_SIZE_ENV_NAME, None)
        else:
            os.environ[SEQREPO_FD_CACHE_SIZE_ENV_NAME] = orig_env


def test_parse_defline():
Expand Down

0 comments on commit 8b70167

Please sign in to comment.