Skip to content

Commit

Permalink
Merge pull request #62 from databio/dev
Browse files Browse the repository at this point in the history
Release 0.8.0
  • Loading branch information
khoroshevskyi authored Oct 24, 2024
2 parents 1e83da8 + e7f4136 commit 25beff9
Show file tree
Hide file tree
Showing 13 changed files with 169 additions and 60 deletions.
2 changes: 1 addition & 1 deletion bbconf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.1"
__version__ = "0.8.0"
74 changes: 43 additions & 31 deletions bbconf/config_parser/bedbaseconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@
import zarr
from botocore.exceptions import BotoCoreError, EndpointConnectionError
from geniml.region2vec.main import Region2VecExModel
from geniml.search import BED2BEDSearchInterface, QdrantBackend, Text2BEDSearchInterface
from geniml.search.query2vec import BED2Vec, Text2Vec
from geniml.search import BED2BEDSearchInterface
from geniml.search.backends import BiVectorBackend, QdrantBackend
from geniml.search.interfaces import BiVectorSearchInterface
from geniml.search.query2vec import BED2Vec
from pephubclient import PEPHubClient
from zarr import Group as Z_GROUP

from bbconf.config_parser.const import (
S3_BEDSET_PATH_FOLDER,
S3_FILE_PATH_FOLDER,
S3_PLOTS_PATH_FOLDER,
TEXT_EMBEDDING_DIMENSION,
)
from bbconf.config_parser.models import ConfigFile
from bbconf.const import PKG_NAME, ZARR_TOKENIZED_FOLDER
Expand All @@ -45,9 +48,10 @@ def __init__(self, config: Union[Path, str]):

self._db_engine = self._init_db_engine()
self._qdrant_engine = self._init_qdrant_backend()
self._t2bsi = self._init_t2bsi_object()
self._qdrant_text_engine = self._init_qdrant_text_backend()
self._b2bsi = self._init_b2bsi_object()
self._r2v = self._init_r2v_object()
self._bivec = self._init_bivec_object()

self._phc = self._init_pephubclient()
self._boto3_client = self._init_boto3_client()
Expand Down Expand Up @@ -94,15 +98,6 @@ def db_engine(self) -> BaseEngine:
"""
return self._db_engine

@property
def t2bsi(self) -> Union[Text2BEDSearchInterface, None]:
"""
Get text2bednn object
:return: text2bednn object
"""
return self._t2bsi

@property
def b2bsi(self) -> Union[BED2BEDSearchInterface, None]:
"""
Expand All @@ -121,6 +116,16 @@ def r2v(self) -> Region2VecExModel:
"""
return self._r2v

@property
def bivec(self) -> Union[BiVectorSearchInterface, None]:
"""
Get the bi-vector search interface object created in __init__

:return: object stored in `self._bivec`; typed as optional because
`_init_bivec_object` is declared to return Union[BiVectorSearchInterface, None]
"""

return self._bivec

@property
def qdrant_engine(self) -> QdrantBackend:
"""
Expand Down Expand Up @@ -194,7 +199,7 @@ def _init_qdrant_backend(self) -> QdrantBackend:
"""
try:
return QdrantBackend(
collection=self._config.qdrant.collection,
collection=self._config.qdrant.file_collection,
qdrant_host=self._config.qdrant.host,
qdrant_port=self._config.qdrant.port,
qdrant_api_key=self._config.qdrant.api_key,
Expand All @@ -205,28 +210,35 @@ def _init_qdrant_backend(self) -> QdrantBackend:
f"error in Connection to qdrant! skipping... Error: {err}", UserWarning
)

def _init_t2bsi_object(self) -> Union[Text2BEDSearchInterface, None]:
def _init_qdrant_text_backend(self) -> QdrantBackend:
"""
Create Text 2 BED search interface and return this object
Create qdrant client text embedding object using credentials provided in config file
:return: Text2BEDSearchInterface object
:return: QdrantBackend
"""

try:
return Text2BEDSearchInterface(
backend=self.qdrant_engine,
query2vec=Text2Vec(
hf_repo=self._config.path.text2vec,
v2v=self._config.path.vec2vec,
),
)
except Exception as e:
_LOGGER.error("Error in creating Text2BEDSearchInterface object: " + str(e))
warnings.warn(
"Error in creating Text2BEDSearchInterface object: " + str(e),
UserWarning,
)
return None
return QdrantBackend(
dim=TEXT_EMBEDDING_DIMENSION,
collection=self.config.qdrant.text_collection,
qdrant_host=self.config.qdrant.host,
qdrant_api_key=self.config.qdrant.api_key,
)

def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]:
    """
    Create BiVectorSearchInterface object on top of the two qdrant backends

    The text backend (`self._qdrant_text_engine`) and the bed file backend
    (`self._qdrant_engine`) are combined into one BiVectorBackend, which is
    wrapped in a BiVectorSearchInterface together with the text2vec model
    taken from `config.path.text2vec`.

    :return: BiVectorSearchInterface, or None if the object could not be created
    """

    # Same best-effort policy as the qdrant backend initialisation: a failure here
    # must not abort construction of the whole config object, so the error is
    # reported as a warning and None is returned (as the return annotation allows).
    try:
        search_backend = BiVectorBackend(
            metadata_backend=self._qdrant_text_engine,
            bed_backend=self._qdrant_engine,
        )
        return BiVectorSearchInterface(
            backend=search_backend,
            query2vec=self.config.path.text2vec,
        )
    except Exception as err:
        warnings.warn(
            f"Error in creating BiVectorSearchInterface object: {err}",
            UserWarning,
        )
        return None

def _init_b2bsi_object(self) -> Union[BED2BEDSearchInterface, None]:
"""
Expand Down
4 changes: 3 additions & 1 deletion bbconf/config_parser/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
DEFAULT_QDRANT_HOST = "localhost"
DEFAULT_QDRANT_PORT = 6333
DEFAULT_QDRANT_COLLECTION_NAME = "bedbase"
DEFAULT_QDRANT_TEXT_COLLECTION_NAME = "bed_text"
DEFAULT_QDRANT_API_KEY = None

DEFAULT_SERVER_PORT = 80
DEFAULT_SERVER_HOST = "0.0.0.0"

DEFAULT_TEXT2VEC_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_VEC2VEC_MODEL = "databio/v2v-MiniLM-v2-ATAC-hg38"
DEFAULT_REGION2_VEC_MODEL = "databio/r2v-ChIP-atlas-hg38"

DEFAULT_PEPHUB_NAMESPACE = "databio"
Expand All @@ -25,3 +25,5 @@
S3_FILE_PATH_FOLDER = "files"
S3_PLOTS_PATH_FOLDER = "stats"
S3_BEDSET_PATH_FOLDER = "bedsets"

TEXT_EMBEDDING_DIMENSION = 384
9 changes: 5 additions & 4 deletions bbconf/config_parser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pydantic import BaseModel, ConfigDict, computed_field, field_validator
from yacman import load_yaml

from bbconf.config_parser.const import (
from bbconf.config_parser.const import ( # DEFAULT_VEC2VEC_MODEL,
DEFAULT_DB_DIALECT,
DEFAULT_DB_DRIVER,
DEFAULT_DB_NAME,
Expand All @@ -15,12 +15,12 @@
DEFAULT_PEPHUB_TAG,
DEFAULT_QDRANT_COLLECTION_NAME,
DEFAULT_QDRANT_PORT,
DEFAULT_QDRANT_TEXT_COLLECTION_NAME,
DEFAULT_REGION2_VEC_MODEL,
DEFAULT_S3_BUCKET,
DEFAULT_SERVER_HOST,
DEFAULT_SERVER_PORT,
DEFAULT_TEXT2VEC_MODEL,
DEFAULT_VEC2VEC_MODEL,
)

_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -52,7 +52,8 @@ class ConfigQdrant(BaseModel):
host: str
port: int = DEFAULT_QDRANT_PORT
api_key: Optional[str] = None
collection: str = DEFAULT_QDRANT_COLLECTION_NAME
file_collection: str = DEFAULT_QDRANT_COLLECTION_NAME
text_collection: Optional[str] = DEFAULT_QDRANT_TEXT_COLLECTION_NAME


class ConfigServer(BaseModel):
Expand All @@ -62,7 +63,7 @@ class ConfigServer(BaseModel):

class ConfigPath(BaseModel):
region2vec: str = DEFAULT_REGION2_VEC_MODEL
vec2vec: str = DEFAULT_VEC2VEC_MODEL
# vec2vec: str = DEFAULT_VEC2VEC_MODEL
text2vec: str = DEFAULT_TEXT2VEC_MODEL


Expand Down
2 changes: 1 addition & 1 deletion bbconf/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
ForeignKey,
Result,
Select,
UniqueConstraint,
event,
select,
UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import JSON
from sqlalchemy.engine import URL, Engine, create_engine
Expand Down
2 changes: 1 addition & 1 deletion bbconf/models/bed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class BedClassification(BaseModel):


class BedStatsModel(BaseModel):
number_of_regions: Optional[float] = Field(None, alias="regions_no")
number_of_regions: Optional[float] = None
gc_content: Optional[float] = None
median_tss_dist: Optional[float] = None
mean_region_width: Optional[float] = None
Expand Down
22 changes: 21 additions & 1 deletion bbconf/models/bedset_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Union

from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, model_validator

from .base_models import FileModel
from .bed_models import BedMetadataBasic, BedStatsModel
Expand Down Expand Up @@ -37,3 +37,23 @@ class BedSetListResult(BaseModel):
class BedSetBedFiles(BaseModel):
count: int
results: List[BedMetadataBasic]


class BedSetPEP(BaseModel):
    """
    Model of a single bed file record inside a bedset PEP.
    NOTE(review): purpose inferred from the class and field names -- confirm.

    `sample_name` and `original_name` are required. All other declared fields
    accept a string or None and default to an empty string. Keys that are not
    declared here are kept on the model (extra="allow").
    """

    model_config = ConfigDict(extra="allow")

    sample_name: str
    original_name: str
    genome_alias: Union[str, None] = ""
    genome_digest: Union[str, None] = ""
    bed_type: Union[str, None] = ""
    bed_format: Union[str, None] = ""
    description: Union[str, None] = ""
    url: Union[str, None] = ""

    @model_validator(mode="before")
    @classmethod
    def remove_underscore_keys(cls, values):
        """
        Remove keys that start with an underscore before the model is validated

        NOTE(review): presumably these are sqlalchemy-internal attributes that
        should not be stored in the PEP -- confirm against the caller.

        :param values: raw input passed to the model
        :return: input without underscore-prefixed keys; input that is not a dict
            is returned unchanged so that pydantic reports the validation error
        """
        # A "before" validator receives the raw input, which is not guaranteed
        # to be a dict, and dict keys are not guaranteed to be strings.
        if not isinstance(values, dict):
            return values
        return {
            key: value
            for key, value in values.items()
            if not (isinstance(key, str) and key.startswith("_"))
        }
28 changes: 13 additions & 15 deletions bbconf/modules/bedfiles.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import os
from logging import getLogger
from typing import Dict, Union
from pydantic import BaseModel

import numpy as np
from geniml.bbclient import BBClient
from geniml.io import RegionSet
from gtars.tokenizers import RegionSet as GRegionSet
from pephubclient.exceptions import ResponseError
from pydantic import BaseModel
from qdrant_client.models import Distance, PointIdsList, VectorParams
from sqlalchemy import and_, delete, func, select
from sqlalchemy.orm import Session
Expand All @@ -17,12 +17,12 @@
from bbconf.const import DEFAULT_LICENSE, PKG_NAME, ZARR_TOKENIZED_FOLDER
from bbconf.db_utils import (
Bed,
BedMetadata,
BedStats,
Files,
GenomeRefStats,
TokenizedBed,
Universes,
BedMetadata,
GenomeRefStats,
)
from bbconf.exceptions import (
BedBaseConfError,
Expand All @@ -48,11 +48,11 @@
BedStatsModel,
FileModel,
QdrantSearchResult,
RefGenValidModel,
StandardMeta,
TokenizedBedResponse,
TokenizedPathResponse,
UniverseMetadata,
StandardMeta,
RefGenValidModel,
)

_LOGGER = getLogger(PKG_NAME)
Expand Down Expand Up @@ -335,7 +335,7 @@ def get_embedding(self, identifier: str) -> BedEmbeddingResult:
if not self.exists(identifier):
raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.")
result = self._qdrant_engine.qd_client.retrieve(
collection_name=self._config.config.qdrant.collection,
collection_name=self._config.config.qdrant.file_collection,
ids=[identifier],
with_vectors=True,
with_payload=True,
Expand All @@ -362,14 +362,13 @@ def get_ids_list(
:param offset: offset to start from
:param genome: filter by genome
:param bed_type: filter by bed type. e.g. 'bed6+4'
:param full: if True, return full metadata, including statistics, files, and raw metadata from pephub
:return: list of bed file identifiers
"""
statement = select(Bed)
count_statement = select(func.count(Bed.id))

# TODO: make it generic, like in pephub
# TODO: make it generic, like in PEPhub
if genome:
statement = statement.where(and_(Bed.genome_alias == genome))
count_statement = count_statement.where(and_(Bed.genome_alias == genome))
Expand Down Expand Up @@ -769,7 +768,7 @@ def _embed_file(self, bed_file: Union[str, RegionSet]) -> np.ndarray:
"""
Create embedding for bed file
:param bed_id: bed file id
:param bed_file: bed file path or regionset
:param bed_file: path to the bed file, or RegionSet object
:return np array of embeddings
Expand Down Expand Up @@ -806,9 +805,8 @@ def text_to_bed_search(
:return: list of bed file metadata
"""
_LOGGER.info(f"Looking for: {query}")
_LOGGER.info(f"Using backend: {self._config.t2bsi}")

results = self._config.t2bsi.query_search(query, limit=limit, offset=offset)
results = self._config.bivec.query_search(query, limit=limit, offset=offset)
results_list = []
for result in results:
result_id = result["id"].replace("-", "")
Expand Down Expand Up @@ -908,7 +906,7 @@ def delete_qdrant_point(self, identifier: str) -> None:
"""

result = self._config.qdrant_engine.qd_client.delete(
collection_name=self._config.config.qdrant.collection,
collection_name=self._config.config.qdrant.file_collection,
points_selector=PointIdsList(
points=[identifier],
),
Expand All @@ -920,7 +918,7 @@ def create_qdrant_collection(self) -> bool:
Create qdrant collection for bed files.
"""
return self._config.qdrant_engine.qd_client.create_collection(
collection_name=self._config.config.qdrant.collection,
collection_name=self._config.config.qdrant.file_collection,
vectors_config=VectorParams(size=100, distance=Distance.DOT),
)

Expand Down Expand Up @@ -1071,7 +1069,7 @@ def _add_zarr_s3(
"Set overwrite to True to overwrite it."
)

return os.path.join(ZARR_TOKENIZED_FOLDER, path)
return str(os.path.join(ZARR_TOKENIZED_FOLDER, path))

def get_tokenized(self, bed_id: str, universe_id: str) -> TokenizedBedResponse:
"""
Expand Down Expand Up @@ -1139,7 +1137,7 @@ def _get_tokenized_path(self, bed_id: str, universe_id: str) -> str:
),
)
tokenized_object = session.scalar(statement)
return tokenized_object.path
return str(tokenized_object.path)

def exist_tokenized(self, bed_id: str, universe_id: str) -> bool:
"""
Expand Down
Loading

0 comments on commit 25beff9

Please sign in to comment.