diff --git a/bedms/__init__.py b/bedms/__init__.py
index d0d13a5..99bc695 100644
--- a/bedms/__init__.py
+++ b/bedms/__init__.py
@@ -3,3 +3,6 @@
 """
 
 from .attr_standardizer import AttrStandardizer
+from .train import AttrStandardizerTrainer
+
+__all__ = ["AttrStandardizer", "AttrStandardizerTrainer"]
diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index 7b93f1d..c823890 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -3,6 +3,9 @@
 """
 
 import logging
+import glob
+import os
+import yaml
 from typing import Dict, Tuple, Union, Optional
 import pickle
 import peppy
@@ -14,33 +17,11 @@
 from .const import (
     AVAILABLE_SCHEMAS,
     CONFIDENCE_THRESHOLD,
-    DROPOUT_PROB,
-    EMBEDDING_SIZE,
-    HIDDEN_SIZE,
-    INPUT_SIZE_BOW_BEDBASE,
-    INPUT_SIZE_BOW_ENCODE,
-    INPUT_SIZE_BOW_FAIRTRACKS,
-    OUTPUT_SIZE_BEDBASE,
-    OUTPUT_SIZE_ENCODE,
-    OUTPUT_SIZE_FAIRTRACKS,
     PROJECT_NAME,
     SENTENCE_TRANSFORMER_MODEL,
-    REPO_ID,
-    ENCODE_VECTORIZER_FILENAME,
-    ENCODE_LABEL_ENCODER_FILENAME,
-    FAIRTRACKS_VECTORIZER_FILENAME,
-    FAIRTRACKS_LABEL_ENCODER_FILENAME,
-    BEDBASE_VECTORIZER_FILENAME,
-    BEDBASE_LABEL_ENCODER_FILENAME,
 )
 from .model import BoWSTModel
-from .utils import (
-    data_encoding,
-    data_preprocessing,
-    fetch_from_pephub,
-    get_any_pep,
-    load_from_huggingface,
-)
+from .utils import data_encoding, data_preprocessing, fetch_from_pephub, get_any_pep
 
 logging.basicConfig(level=logging.INFO)
 
@@ -54,82 +35,53 @@ class AttrStandardizer:
     def __init__(
         self,
-        schema: str,
+        repo_id: str,
+        model_name: str,
         custom_param: Optional[str] = None,
         confidence: int = CONFIDENCE_THRESHOLD,
     ) -> None:
         """
         Initializes the attribute standardizer with user provided schema,
         loads the model.
 
-        :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS"
+        :param str repo_id: ID of the HuggingFace repository that hosts the trained model files.
+        :param str model_name: Name of the schema model to load, used as the
+            subdirectory and filename suffix within the repository.
         :param str custom_param: User provided config file for custom parameters,
             if they choose "CUSTOM" schema.
         :param int confidence: Confidence threshold for the predictions.
         """
-        self.schema = schema
+        self.repo_id = repo_id
+        self.model_name = model_name
         self.conf_threshold = confidence
         self.custom_param = custom_param
-
-        if self.schema == "CUSTOM" and self.custom_param:
-            self.custom_param = self._load_custom_param(self.custom_param)
         self.model, self.vectorizer, self.label_encoder = self._load_model()
 
-    def _load_custom_param(self, config_pth: str) -> Dict[str, Tuple]:
-        """
-        Loads the custom parameters from the config file provided by the user.
-
-        :param str config_pth: Path to the config file which has the custom parameters.
-        :return Dict[str, Tuple]: Custom Parameters dictionary.
-        """
-        with open(config_pth, "r", encoding="utf-8") as file:
-            return yaml.safe_load(file)
-
     def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
         """
         Get the model parameters as per the chosen schema.
 
         :return Tuple[int, int, int, int, int, float]: Tuple containing the model parameters.
""" - if self.schema == "ENCODE": - return ( - INPUT_SIZE_BOW_ENCODE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_ENCODE, - DROPOUT_PROB, - ) - if self.schema == "FAIRTRACKS": - return ( - INPUT_SIZE_BOW_FAIRTRACKS, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_FAIRTRACKS, - DROPOUT_PROB, - ) - if self.schema == "BEDBASE": - return ( - INPUT_SIZE_BOW_BEDBASE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_BEDBASE, - DROPOUT_PROB, - ) - if self.schema == "CUSTOM": - return ( - self.custom_param["model"]["input_size_bow"], - self.custom_param["model"]["input_size_embeddings"], - self.custom_param["model"]["input_size_embeddings"], - self.custom_param["model"]["hidden_size"], - self.custom_param["model"]["output_size"], - self.custom_param["model"]["dropout_prob"], - ) - - raise ValueError( - f"Schema not available: {self.schema}." - "Presently, four schemas are available: ENCODE , FAIRTRACKS, BEDBASE, CUSTOM" + config_filename = f"config_{self.model_name}.yaml" + config_pth = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, config_filename), + ) + with open(config_pth, "r") as file: + config = yaml.safe_load(file) + + input_size_bow = config["params"]["input_size_bow"] + embedding_size = config["params"]["embedding_size"] + hidden_size = config["params"]["hidden_size"] + output_size = config["params"]["output_size"] + dropout_prob = config["params"]["dropout_prob"] + + return ( + input_size_bow, + embedding_size, + embedding_size, + hidden_size, + output_size, + dropout_prob, ) def _load_model(self) -> Tuple[nn.Module, object, object]: @@ -140,65 +92,54 @@ def _load_model(self) -> Tuple[nn.Module, object, object]: :return object: The scikit learn vectorizer for bag of words encoding. :return object: Label encoder object for the labels (y). 
""" - try: - if self.schema == "ENCODE": - filename_vc = ENCODE_VECTORIZER_FILENAME - filename_lb = ENCODE_LABEL_ENCODER_FILENAME - elif self.schema == "FAIRTRACKS": - filename_vc = FAIRTRACKS_VECTORIZER_FILENAME - filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME - elif self.schema == "BEDBASE": - filename_vc = BEDBASE_VECTORIZER_FILENAME - filename_lb = BEDBASE_LABEL_ENCODER_FILENAME - elif self.schema == "CUSTOM": - vc_path = self.custom_param["paths"]["vectorizer_pth"] - lb_path = self.custom_param["paths"]["label_encoder_pth"] - state_dict = torch.load(self.custom_param["paths"]["model_pth"]) - else: - raise ValueError(f"Schema not available: {self.schema}") - - if self.schema != "CUSTOM": - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_vc, - ) - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_lb, - ) - model = load_from_huggingface(self.schema) - state_dict = torch.load(model) + model_filename = f"model_{self.model_name}.pth" + label_encoder_filename = f"label_encoder_{self.model_name}.pkl" + vectorizer_filename = f"vectorizer_{self.model_name}.pkl" - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) + model_pth = hf_hub_download( + repo_id=self.repo_id, filename=os.path.join(self.model_name, model_filename) + ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) + vc_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, vectorizer_filename), + ) - ( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) = self._get_parameters() - - model = BoWSTModel( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) - model.load_state_dict(state_dict) - model.eval() - return model, vectorizer, label_encoder + lb_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, label_encoder_filename), + ) - except Exception as e: - logger.error(f"Error loading the model: {str(e)}") - raise + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + + state_dict = torch.load(model_pth) + + ( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) = self._get_parameters() + + model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) + + model.load_state_dict(state_dict) + model.eval() + + return model, vectorizer, label_encoder def standardize( self, pep: Union[str, peppy.Project] diff --git a/bedms/const.py b/bedms/const.py index e200fda..c36f5f4 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -6,25 +6,6 @@ AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"] PEP_FILE_TYPES = ["yaml", "csv"] -REPO_ID = "databio/attribute-standardizer-model6" -MODEL_ENCODE = "model_encode.pth" -MODEL_FAIRTRACKS = "model_fairtracks.pth" -MODEL_BEDBASE = "model_bedbase.pth" -ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" -FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" -BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl" -ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" -FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" -BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" -HIDDEN_SIZE = 32 -DROPOUT_PROB = 0.113 
 CONFIDENCE_THRESHOLD = 0.70
-EMBEDDING_SIZE = 384
-INPUT_SIZE_BOW_ENCODE = 10459
-INPUT_SIZE_BOW_FAIRTRACKS = 13617
-OUTPUT_SIZE_FAIRTRACKS = 15
-OUTPUT_SIZE_ENCODE = 18
 NUM_CLUSTERS = 3
-INPUT_SIZE_BOW_BEDBASE = 13708
-OUTPUT_SIZE_BEDBASE = 12
diff --git a/bedms/train.py b/bedms/train.py
index d2816fd..9bc3438 100644
--- a/bedms/train.py
+++ b/bedms/train.py
@@ -29,7 +29,7 @@
 logger = logging.getLogger(PROJECT_NAME)
 
 
-class TrainStandardizer:
+class AttrStandardizerTrainer:
     """
     This is the training class responsible for managing the training process
     for the standardizer model.
diff --git a/bedms/utils.py b/bedms/utils.py
index fb63805..20e7128 100644
--- a/bedms/utils.py
+++ b/bedms/utils.py
@@ -17,11 +17,7 @@
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import LabelEncoder
 from .const import (
-    MODEL_BEDBASE,
-    MODEL_ENCODE,
-    MODEL_FAIRTRACKS,
     NUM_CLUSTERS,
-    REPO_ID,
     PEP_FILE_TYPES,
     PROJECT_NAME,
 )
@@ -50,22 +46,6 @@ def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame:
     return csv_file_df
 
 
-def load_from_huggingface(schema: str) -> Optional[Any]:
-    """
-    Load a model from HuggingFace based on the schema of choice.
-
-    :param str schema: Schema Type
-    :return Optional[Any]: Loaded model object
-    """
-    if schema == "ENCODE":
-        model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE)
-    elif schema == "FAIRTRACKS":
-        model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS)
-    elif schema == "BEDBASE":
-        model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE)
-    return model
-
-
 def data_preprocessing(
     df: pd.DataFrame,
 ) -> Tuple[List[List[str]], List[str], List[List[str]], int]:
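
Usage sketch for the new loading interface (a minimal example, not part of this diff: the repo ID below is the old REPO_ID removed from const.py, and the model name and PEP registry path are illustrative assumptions; any HuggingFace repo laid out as <model_name>/config_<model_name>.yaml, <model_name>/model_<model_name>.pth, <model_name>/vectorizer_<model_name>.pkl, and <model_name>/label_encoder_<model_name>.pkl should work):

    from bedms import AttrStandardizer

    # Point the standardizer at a HuggingFace repo and a model subdirectory;
    # both values below are assumptions, not defaults shipped in this diff.
    model = AttrStandardizer(
        repo_id="databio/attribute-standardizer-model6",
        model_name="encode",
    )

    # Standardize attributes for a PEP, given either a registry path string
    # or a peppy.Project object; the registry path here is illustrative.
    results = model.standardize(pep="geo/gse228634:default")
    print(results)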