Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bed file updating, and partial processing #72

Merged
merged 10 commits into from
Dec 19, 2024
12 changes: 7 additions & 5 deletions bbconf/bbagent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,20 @@ class BedBaseAgent(object):
def __init__(
self,
config: Union[Path, str],
init_ml: bool = True,
):
"""
Initialize connection to the pep_db database. You can use the basic connection parameters
or libpq connection string.

:param config: path to the configuration file
:param init_ml: initialize ML models for search (default: True)
"""
_LOGGER.info(f"Initializing BedBaseConfig object")
self.config = BedBaseConfig(config)
_LOGGER.info(f"Initializing BedBaseAgent object")

self.config = BedBaseConfig(config, init_ml)

self._bed = BedAgentBedFile(self.config, self)
_LOGGER.info(f"Initializing BedAgentBedSet object")
self._bedset = BedAgentBedSet(self.config)
_LOGGER.info(f"Initializing BBObjects object")
self._objects = BBObjects(self.config)

@property
Expand Down
52 changes: 37 additions & 15 deletions bbconf/config_parser/bedbaseconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,28 +46,34 @@ class BedBaseConfig(object):
Class to handle BEDbase configuration file and create objects for different modules.
"""

def __init__(self, config: Union[Path, str], init_search_interfaces: bool = True):
_LOGGER.info(f"Loading configuration file: {config}")
def __init__(self, config: Union[Path, str], init_ml: bool = True):
"""
Initialize BedBaseConfig object

:param config: path to the configuration file
:param init_ml: initialize machine learning models used for search
"""

self.cfg_path = get_bedbase_cfg(config)
self._config = self._read_config_file(self.cfg_path)

_LOGGER.info(f"Initializing database engine...")
self._db_engine = self._init_db_engine()
_LOGGER.info(f"Initializing qdrant engine...")
self._qdrant_engine = self._init_qdrant_backend()

_LOGGER.info(f"Initializing qdrant text engine...")
self._qdrant_engine = self._init_qdrant_backend()
self._qdrant_text_engine = self._init_qdrant_text_backend()

if init_search_interfaces:
_LOGGER.info(f"Initializing search interfaces...")
if init_ml:
self._b2bsi = self._init_b2bsi_object()
_LOGGER.info(f"Initializing R2V object...")
self._r2v = self._init_r2v_object()
_LOGGER.info(f"Initializing Bivec object...")
self._bivec = self._init_bivec_object()
else:
_LOGGER.info(
f"Skipping initialization of ML models, init_ml parameter set to False."
)

self._b2bsi = None
self._r2v = None
self._bivec = None

_LOGGER.info(f"Initializing PEPHub client...")
self._phc = self._init_pephubclient()
self._boto3_client = self._init_boto3_client()

Expand Down Expand Up @@ -197,6 +203,11 @@ def zarr_root(self) -> Union[Z_GROUP, None]:
return zarr.group(store=cache, overwrite=False)

def _init_db_engine(self) -> BaseEngine:
"""
Create database engine object using credentials provided in config file
"""

_LOGGER.info(f"Initializing database engine...")
return BaseEngine(
host=self._config.database.host,
port=self._config.database.port,
Expand All @@ -212,6 +223,8 @@ def _init_qdrant_backend(self) -> QdrantBackend:

:return: QdrantClient
"""

_LOGGER.info(f"Initializing qdrant engine...")
try:
return QdrantBackend(
collection=self._config.qdrant.file_collection,
Expand All @@ -225,21 +238,27 @@ def _init_qdrant_backend(self) -> QdrantBackend:
f"error in Connection to qdrant! skipping... Error: {err}", UserWarning
)

def _init_qdrant_text_backend(self) -> QdrantBackend:
def _init_qdrant_text_backend(self) -> Union[QdrantBackend, None]:
"""
Create qdrant client text embedding object using credentials provided in config file

:return: QdrantClient
"""

_LOGGER.info(f"Initializing qdrant text engine...")
try:
return QdrantBackend(
dim=TEXT_EMBEDDING_DIMENSION,
collection=self.config.qdrant.text_collection,
qdrant_host=self.config.qdrant.host,
qdrant_api_key=self.config.qdrant.api_key,
)
except Exception as e:
_LOGGER.error(f"Error while connecting to qdrant text engine: {e}")
except Exception as _:
khoroshevskyi marked this conversation as resolved.
Show resolved Hide resolved
_LOGGER.error("Error in Connection to qdrant text! skipping...")
warnings.warn(
"Error in Connection to qdrant text! skipping...", UserWarning
)
return None

def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]:
"""
Expand All @@ -266,6 +285,7 @@ def _init_b2bsi_object(self) -> Union[BED2BEDSearchInterface, None]:
:return: Bed2BEDSearchInterface object
"""
try:
_LOGGER.info(f"Initializing search interfaces...")
return BED2BEDSearchInterface(
backend=self.qdrant_engine,
query2vec=BED2Vec(model=self._config.path.region2vec),
Expand All @@ -286,6 +306,7 @@ def _init_pephubclient() -> Union[PEPHubClient, None]:
:return: PephubClient
"""
try:
_LOGGER.info(f"Initializing PEPHub client...")
return PEPHubClient()
except Exception as e:
_LOGGER.error(f"Error in creating PephubClient object: {e}")
Expand Down Expand Up @@ -317,6 +338,7 @@ def _init_r2v_object(self) -> Union[Region2VecExModel, None]:
Create Region2VecExModel object using credentials provided in config file
"""
try:
_LOGGER.info(f"Initializing R2V object...")
return Region2VecExModel(self.config.path.region2vec)
except Exception as e:
_LOGGER.error(f"Error in creating Region2VecExModel object: {e}")
Expand Down
16 changes: 14 additions & 2 deletions bbconf/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ class Bed(Base):
ref_classifier: Mapped["GenomeRefStats"] = relationship(
"GenomeRefStats", back_populates="bed", cascade="all, delete-orphan"
)
processed: Mapped[bool] = mapped_column(
default=False, comment="Whether the bed file was processed"
)


class BedMetadata(Base):
Expand Down Expand Up @@ -255,6 +258,11 @@ class Files(Base):
bedfile: Mapped["Bed"] = relationship("Bed", back_populates="files")
bedset: Mapped["BedSets"] = relationship("BedSets", back_populates="files")

__table_args__ = (
UniqueConstraint("name", "bedfile_id"),
UniqueConstraint("name", "bedset_id"),
)


class BedFileBedSetRelation(Base):
__tablename__ = "bedfile_bedset_relation"
Expand Down Expand Up @@ -303,6 +311,10 @@ class BedSets(Base):
author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset")
source: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset")

processed: Mapped[bool] = mapped_column(
default=False, comment="Whether the bedset was processed"
)


class Universes(Base):
__tablename__ = "universes"
Expand Down Expand Up @@ -339,7 +351,7 @@ class TokenizedBed(Base):
nullable=False,
)
universe_id: Mapped[str] = mapped_column(
ForeignKey("universes.id", ondelete="CASCADE", passive_deletes=True),
ForeignKey("universes.id", ondelete="CASCADE"),
primary_key=True,
index=True,
nullable=False,
Expand All @@ -352,7 +364,7 @@ class TokenizedBed(Base):
universe: Mapped["Universes"] = relationship(
"Universes",
back_populates="tokenized",
passive_deletes=True,
passive_deletes="all",
khoroshevskyi marked this conversation as resolved.
Show resolved Hide resolved
)


Expand Down
2 changes: 2 additions & 0 deletions bbconf/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def get_bedbase_cfg(cfg: str = None) -> str:
Optional, the $BEDBASE config env var will be used if not provided
:return str: absolute configuration file path
"""

_LOGGER.info(f"Loading configuration file: {cfg}")
selected_cfg = select_config(config_filepath=cfg, config_env_vars=CFG_ENV_VARS)
if not selected_cfg:
raise BedBaseConnectionError(
Expand Down
8 changes: 7 additions & 1 deletion bbconf/models/bed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class BedStatsModel(BaseModel):


class BedPEPHub(BaseModel):
sample_name: str
sample_name: str = ""
genome: str = ""
organism: str = ""
species_id: str = ""
Expand Down Expand Up @@ -233,3 +233,9 @@ class RefGenValidModel(BaseModel):
tier_ranking: int

model_config = ConfigDict(extra="forbid")


class RefGenValidReturnModel(BaseModel):
id: str
provided_genome: Union[str, None] = None
compared_genome: List[RefGenValidModel]
Loading
Loading