Skip to content

Commit

Permalink
feat(130): Introduces SEQREPO_FD_CACHE_MAXSIZE env var (#131)
Browse files Browse the repository at this point in the history
* feat(130): Introduce SEQREPO_FD_CACHE_SIZE env var to override the internal fd_cache_size to allow for increased performance without forcing code changes to any SeqRepo clients.

* feat(130): Cleans up logic around env var parsing and changes env var name

* feat(130): Type hinting and cleanup
  • Loading branch information
kazmiekr authored Jan 23, 2024
1 parent 5fadcfc commit 4d1a349
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 18 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ SEQREPO_LRU_CACHE_MAXSIZE sets the lru_cache maxsize for the sqlite
query response caching. It defaults to 1 million but can also be set to
"none" to be unlimited.

SEQREPO_FD_CACHE_MAXSIZE sets the lru_cache size for file handle caching during FASTA sequence retrievals.
It defaults to 0, which disables caching, but can be set to a specific value or to "none" to be unlimited. Using
a moderate value (>10) will greatly increase the performance of sequence retrieval.

## Developing

Here's how to get started developing:
Expand Down
27 changes: 18 additions & 9 deletions src/biocommons/seqrepo/config.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
import os
from typing import Optional

try:
seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var)
except ValueError:
if seqrepo_env_var.lower() == "none":
SEQREPO_LRU_CACHE_MAXSIZE = None
else:

def parse_caching_env_var(env_name: str, env_default: str) -> Optional[int]:
    """Read a cache-size setting from the environment.

    The value of ``env_name`` (or ``env_default`` when the variable is not
    set) is interpreted as follows: the word "none", in any letter case,
    yields ``None``; anything else must parse as an integer.

    Args:
        env_name: Name of the environment variable to read.
        env_default: String used when ``env_name`` is not set.

    Returns:
        The parsed integer, or ``None`` when the value is "none".

    Raises:
        ValueError: If the value is neither "none" nor a valid integer.
    """
    caching_env_var = os.environ.get(env_name, env_default)
    # Surrounding whitespace is ignored so that " none " is treated like
    # "none", matching how int() already tolerates padded numbers.
    normalized = caching_env_var.strip()
    if normalized.lower() == "none":
        return None

    try:
        return int(normalized)
    except ValueError as err:
        # Report the offending variable by name and keep the original parse
        # error attached as the cause.
        raise ValueError(
            f"{env_name} must be a valid int, none, or not set, "
            "currently it is " + caching_env_var
        ) from err


# Size limit for the lru_cache applied to sqlite query responses. Defaults to
# 1000000 when the env var is not set; "none" is parsed to None.
SEQREPO_LRU_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_LRU_CACHE_MAXSIZE", "1000000")
# Using a default value here of -1 to differentiate between this env var not being set
# and an explicit None (unbounded cache). SeqRepo treats -1 as "not set" and falls back
# to its own fd_cache_size argument in that case.
SEQREPO_FD_CACHE_MAXSIZE = parse_caching_env_var("SEQREPO_FD_CACHE_MAXSIZE", "-1")
15 changes: 8 additions & 7 deletions src/biocommons/seqrepo/fastadir/fastadir.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,18 @@ def __init__(self, root_dir, writeable=False, check_same_thread=True, fd_cache_s
schema_version, expected_schema_version
)
)

if fd_cache_size == 0:
_logger.info(f"File descriptor caching disabled")
def _open_for_reading(path):
_logger.debug("Opening for reading uncached: " + path)
return FabgzReader(path)
else:
_logger.warning(f"File descriptor caching enabled (size={fd_cache_size})")
@functools.lru_cache(maxsize=fd_cache_size)
def _open_for_reading(path):
return FabgzReader(path)

@functools.lru_cache(maxsize=fd_cache_size)
def _open_for_reading(path):
if fd_cache_size == 0:
_logger.debug("Opening for reading uncached: " + path)
return FabgzReader(path)

self._open_for_reading = _open_for_reading

def __del__(self):
Expand Down
4 changes: 2 additions & 2 deletions src/biocommons/seqrepo/seqrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import bioutils.digests
from bioutils.digests import seq_seqhash as sha512t24u

from .config import SEQREPO_LRU_CACHE_MAXSIZE
from .config import SEQREPO_LRU_CACHE_MAXSIZE, SEQREPO_FD_CACHE_MAXSIZE
from .fastadir import FastaDir
from .seqaliasdb import SeqAliasDB

Expand Down Expand Up @@ -123,7 +123,7 @@ def __init__(
self._seq_path,
writeable=self._writeable,
check_same_thread=self._check_same_thread,
fd_cache_size=fd_cache_size
fd_cache_size=SEQREPO_FD_CACHE_MAXSIZE if SEQREPO_FD_CACHE_MAXSIZE != -1 else fd_cache_size
)
self.aliases = SeqAliasDB(
self._db_path,
Expand Down
6 changes: 6 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
from biocommons.seqrepo import config


def test_SEQREPO_FD_CACHE_MAXSIZE_default(monkeypatch):
    """With the env var unset, the reloaded config reports -1."""
    # raising=False makes the removal a no-op when the variable is absent.
    monkeypatch.delenv("SEQREPO_FD_CACHE_MAXSIZE", raising=False)
    reload(config)
    expected_default = -1
    assert config.SEQREPO_FD_CACHE_MAXSIZE == expected_default


def test_SEQREPO_LRU_CACHE_MAXSIZE_default(monkeypatch):
monkeypatch.delenv("SEQREPO_LRU_CACHE_MAXSIZE", raising=False)
reload(config)
Expand Down

0 comments on commit 4d1a349

Please sign in to comment.