separating schemas from bedms
saanikat committed Oct 7, 2024
1 parent 24a5d77 commit 03ad055
Showing 5 changed files with 79 additions and 174 deletions.
3 changes: 3 additions & 0 deletions bedms/__init__.py
@@ -3,3 +3,6 @@
"""

from .attr_standardizer import AttrStandardizer
from .train import AttrStandardizerTrainer

__all__ = ["AttrStandardizer", "AttrStandardizerTrainer"]
209 changes: 75 additions & 134 deletions bedms/attr_standardizer.py
@@ -3,6 +3,9 @@
"""

import logging
import glob
import os
import yaml
from typing import Dict, Tuple, Union, Optional
import pickle
import peppy
@@ -14,33 +17,11 @@
from .const import (
AVAILABLE_SCHEMAS,
CONFIDENCE_THRESHOLD,
DROPOUT_PROB,
EMBEDDING_SIZE,
HIDDEN_SIZE,
INPUT_SIZE_BOW_BEDBASE,
INPUT_SIZE_BOW_ENCODE,
INPUT_SIZE_BOW_FAIRTRACKS,
OUTPUT_SIZE_BEDBASE,
OUTPUT_SIZE_ENCODE,
OUTPUT_SIZE_FAIRTRACKS,
PROJECT_NAME,
SENTENCE_TRANSFORMER_MODEL,
REPO_ID,
ENCODE_VECTORIZER_FILENAME,
ENCODE_LABEL_ENCODER_FILENAME,
FAIRTRACKS_VECTORIZER_FILENAME,
FAIRTRACKS_LABEL_ENCODER_FILENAME,
BEDBASE_VECTORIZER_FILENAME,
BEDBASE_LABEL_ENCODER_FILENAME,
)
from .model import BoWSTModel
from .utils import (
data_encoding,
data_preprocessing,
fetch_from_pephub,
get_any_pep,
load_from_huggingface,
)
from .utils import data_encoding, data_preprocessing, fetch_from_pephub, get_any_pep


logging.basicConfig(level=logging.INFO)
@@ -54,82 +35,53 @@ class AttrStandardizer:

def __init__(
self,
schema: str,
repo_id: str,
model_name: str,
custom_param: Optional[str] = None,
confidence: int = CONFIDENCE_THRESHOLD,
) -> None:
"""
        Initializes the attribute standardizer with the user-provided schema and loads the model.
        :param str schema: User-provided schema; one of "ENCODE", "FAIRTRACKS", "BEDBASE", or "CUSTOM"
        :param str repo_id: HuggingFace repository ID
        :param str model_name: Name of the schema model
        :param str custom_param: User-provided config file with
            custom parameters, used when the "CUSTOM" schema is chosen.
"""
self.schema = schema
self.repo_id = repo_id
self.model_name = model_name
self.conf_threshold = confidence
self.custom_param = custom_param

if self.schema == "CUSTOM" and self.custom_param:
self.custom_param = self._load_custom_param(self.custom_param)
self.model, self.vectorizer, self.label_encoder = self._load_model()

def _load_custom_param(self, config_pth: str) -> Dict[str, Tuple]:
"""
Loads the custom parameters from the config file provided by the user.
:param str config_pth: Path to the config file which has the custom parameters.
        :return Dict[str, Tuple]: Dictionary of custom parameters.
"""
with open(config_pth, "r", encoding="utf-8") as file:
return yaml.safe_load(file)

def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
"""
Get the model parameters as per the chosen schema.
        :return Tuple[int, int, int, int, int, float]: Tuple containing the model parameters.
"""
if self.schema == "ENCODE":
return (
INPUT_SIZE_BOW_ENCODE,
EMBEDDING_SIZE,
EMBEDDING_SIZE,
HIDDEN_SIZE,
OUTPUT_SIZE_ENCODE,
DROPOUT_PROB,
)
if self.schema == "FAIRTRACKS":
return (
INPUT_SIZE_BOW_FAIRTRACKS,
EMBEDDING_SIZE,
EMBEDDING_SIZE,
HIDDEN_SIZE,
OUTPUT_SIZE_FAIRTRACKS,
DROPOUT_PROB,
)
if self.schema == "BEDBASE":
return (
INPUT_SIZE_BOW_BEDBASE,
EMBEDDING_SIZE,
EMBEDDING_SIZE,
HIDDEN_SIZE,
OUTPUT_SIZE_BEDBASE,
DROPOUT_PROB,
)
if self.schema == "CUSTOM":
return (
self.custom_param["model"]["input_size_bow"],
self.custom_param["model"]["input_size_embeddings"],
self.custom_param["model"]["input_size_embeddings"],
self.custom_param["model"]["hidden_size"],
self.custom_param["model"]["output_size"],
self.custom_param["model"]["dropout_prob"],
)

        raise ValueError(
            f"Schema not available: {self.schema}. "
            "Presently, four schemas are available: ENCODE, FAIRTRACKS, BEDBASE, CUSTOM"
        )
config_filename = f"config_{self.model_name}.yaml"
config_pth = hf_hub_download(
repo_id=self.repo_id,
filename=os.path.join(self.model_name, config_filename),
)
        with open(config_pth, "r", encoding="utf-8") as file:
config = yaml.safe_load(file)

input_size_bow = config["params"]["input_size_bow"]
embedding_size = config["params"]["embedding_size"]
hidden_size = config["params"]["hidden_size"]
output_size = config["params"]["output_size"]
dropout_prob = config["params"]["dropout_prob"]

return (
input_size_bow,
embedding_size,
embedding_size,
hidden_size,
output_size,
dropout_prob,
)
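
For reference, a sketch of the config_<model_name>.yaml layout this loader expects; the keys mirror the lookups above, and the values are illustrative only, borrowed from the ENCODE constants this commit deletes from bedms/const.py:

    import yaml

    # Hypothetical config contents; a real config ships inside each
    # model's directory in the HuggingFace repo.
    config = {
        "params": {
            "input_size_bow": 10459,  # BoW vocabulary size
            "embedding_size": 384,    # sentence-transformer embedding dimension
            "hidden_size": 32,
            "output_size": 18,        # number of target attribute labels
            "dropout_prob": 0.113,
        }
    }

    with open("config_encode.yaml", "w", encoding="utf-8") as f:
        yaml.safe_dump(config, f)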

def _load_model(self) -> Tuple[nn.Module, object, object]:
@@ -140,65 +92,54 @@ def _load_model(self) -> Tuple[nn.Module, object, object]:
        :return object: The scikit-learn vectorizer for bag-of-words encoding.
:return object: Label encoder object for the labels (y).
"""
try:
if self.schema == "ENCODE":
filename_vc = ENCODE_VECTORIZER_FILENAME
filename_lb = ENCODE_LABEL_ENCODER_FILENAME
elif self.schema == "FAIRTRACKS":
filename_vc = FAIRTRACKS_VECTORIZER_FILENAME
filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME
elif self.schema == "BEDBASE":
filename_vc = BEDBASE_VECTORIZER_FILENAME
filename_lb = BEDBASE_LABEL_ENCODER_FILENAME
elif self.schema == "CUSTOM":
vc_path = self.custom_param["paths"]["vectorizer_pth"]
lb_path = self.custom_param["paths"]["label_encoder_pth"]
state_dict = torch.load(self.custom_param["paths"]["model_pth"])
else:
raise ValueError(f"Schema not available: {self.schema}")

if self.schema != "CUSTOM":
vc_path = hf_hub_download(
repo_id=REPO_ID,
filename=filename_vc,
)
lb_path = hf_hub_download(
repo_id=REPO_ID,
filename=filename_lb,
)
model = load_from_huggingface(self.schema)
state_dict = torch.load(model)
model_filename = f"model_{self.model_name}.pth"
label_encoder_filename = f"label_encoder_{self.model_name}.pkl"
vectorizer_filename = f"vectorizer_{self.model_name}.pkl"

with open(vc_path, "rb") as f:
vectorizer = pickle.load(f)
model_pth = hf_hub_download(
repo_id=self.repo_id, filename=os.path.join(self.model_name, model_filename)
)

with open(lb_path, "rb") as f:
label_encoder = pickle.load(f)
vc_path = hf_hub_download(
repo_id=self.repo_id,
filename=os.path.join(self.model_name, vectorizer_filename),
)

(
input_size_values,
input_size_values_embeddings,
input_size_headers,
hidden_size,
output_size,
dropout_prob,
) = self._get_parameters()

model = BoWSTModel(
input_size_values,
input_size_values_embeddings,
input_size_headers,
hidden_size,
output_size,
dropout_prob,
)
model.load_state_dict(state_dict)
model.eval()
return model, vectorizer, label_encoder
lb_path = hf_hub_download(
repo_id=self.repo_id,
filename=os.path.join(self.model_name, label_encoder_filename),
)

except Exception as e:
logger.error(f"Error loading the model: {str(e)}")
raise
with open(vc_path, "rb") as f:
vectorizer = pickle.load(f)

with open(lb_path, "rb") as f:
label_encoder = pickle.load(f)

state_dict = torch.load(model_pth)

(
input_size_values,
input_size_values_embeddings,
input_size_headers,
hidden_size,
output_size,
dropout_prob,
) = self._get_parameters()

model = BoWSTModel(
input_size_values,
input_size_values_embeddings,
input_size_headers,
hidden_size,
output_size,
dropout_prob,
)

model.load_state_dict(state_dict)
model.eval()

return model, vectorizer, label_encoder
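
Taken together, the hf_hub_download calls above imply a per-model directory layout in the HuggingFace repo, roughly the following (inferred from the os.path.join filenames; "encode" is a hypothetical model name):

    encode/
        config_encode.yaml
        model_encode.pth
        vectorizer_encode.pkl
        label_encoder_encode.pkl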

def standardize(
self, pep: Union[str, peppy.Project]
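End to end, callers now select a schema by pointing at a repository and a model directory rather than a hard-coded schema name. A hedged usage sketch, assuming the post-change constructor takes repo_id and model_name as the diff suggests; the repo ID reuses the REPO_ID constant this commit removes, and the model name and PEP path are placeholders:

    from bedms import AttrStandardizer

    standardizer = AttrStandardizer(
        repo_id="databio/attribute-standardizer-model6",  # assumed; the old REPO_ID
        model_name="encode",  # hypothetical model directory in the repo
    )
    # standardize() accepts a path to a PEP (yaml or csv) or a peppy.Project.
    results = standardizer.standardize(pep="path/to/sample_table.yaml")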
19 changes: 0 additions & 19 deletions bedms/const.py
@@ -6,25 +6,6 @@

AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"]
PEP_FILE_TYPES = ["yaml", "csv"]
REPO_ID = "databio/attribute-standardizer-model6"
MODEL_ENCODE = "model_encode.pth"
MODEL_FAIRTRACKS = "model_fairtracks.pth"
MODEL_BEDBASE = "model_bedbase.pth"
ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl"
FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl"
BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl"
ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl"
FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl"
BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl"
SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"
HIDDEN_SIZE = 32
DROPOUT_PROB = 0.113
CONFIDENCE_THRESHOLD = 0.70
EMBEDDING_SIZE = 384
INPUT_SIZE_BOW_ENCODE = 10459
INPUT_SIZE_BOW_FAIRTRACKS = 13617
OUTPUT_SIZE_FAIRTRACKS = 15
OUTPUT_SIZE_ENCODE = 18
NUM_CLUSTERS = 3
INPUT_SIZE_BOW_BEDBASE = 13708
OUTPUT_SIZE_BEDBASE = 12
2 changes: 1 addition & 1 deletion bedms/train.py
@@ -29,7 +29,7 @@
logger = logging.getLogger(PROJECT_NAME)


class TrainStandardizer:
class AttrStandardizerTrainer:
"""
This is the training class responsible for
managing the training process for the standardizer model.
20 changes: 0 additions & 20 deletions bedms/utils.py
@@ -17,11 +17,7 @@
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from .const import (
MODEL_BEDBASE,
MODEL_ENCODE,
MODEL_FAIRTRACKS,
NUM_CLUSTERS,
REPO_ID,
PEP_FILE_TYPES,
PROJECT_NAME,
)
@@ -50,22 +46,6 @@ def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame:
return csv_file_df


def load_from_huggingface(schema: str) -> Optional[Any]:
"""
Load a model from HuggingFace based on the schema of choice.
:param str schema: Schema Type
:return Optional[Any]: Loaded model object
"""
if schema == "ENCODE":
model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE)
elif schema == "FAIRTRACKS":
model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS)
elif schema == "BEDBASE":
model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE)
return model


def data_preprocessing(
df: pd.DataFrame,
) -> Tuple[List[List[str]], List[str], List[List[str]], int]:
