diff --git a/bedms/__init__.py b/bedms/__init__.py
index 374c0be..d0d13a5 100644
--- a/bedms/__init__.py
+++ b/bedms/__init__.py
@@ -1 +1,5 @@
+"""
+This module initializes the 'bedms' package.
+"""
+
 from .attr_standardizer import AttrStandardizer
diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index fbcaf39..6fa3b2e 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -1,9 +1,13 @@
+"""
+This module provides the AttrStandardizer class for 'bedms'.
+"""
+
 import logging
 from typing import Dict, Tuple, Union
-
+import pickle
 import peppy
 import torch
-import torch.nn as nn
+from torch import nn
 import torch.nn.functional as torch_functional
 
 from .const import (
@@ -20,6 +24,13 @@
     OUTPUT_SIZE_FAIRTRACKS,
     PROJECT_NAME,
     SENTENCE_TRANSFORMER_MODEL,
+    REPO_ID,
+    ENCODE_VECTORIZER_FILENAME,
+    ENCODE_LABEL_ENCODER_FILENAME,
+    FAIRTRACKS_VECTORIZER_FILENAME,
+    FAIRTRACKS_LABEL_ENCODER_FILENAME,
+    BEDBASE_VECTORIZER_FILENAME,
+    BEDBASE_LABEL_ENCODER_FILENAME,
 )
 from .model import BoWSTModel
 from .utils import (
@@ -29,12 +40,17 @@
     get_any_pep,
     load_from_huggingface,
 )
+from huggingface_hub import hf_hub_download
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(PROJECT_NAME)
 
 
 class AttrStandardizer:
+    """
+    This class holds the models used for attribute standardization.
+    """
+
     def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None:
         """
         Initializes the attribute standardizer with user provided schema, loads the model.
@@ -43,7 +59,7 @@ def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None:
         :param int confidence: Confidence threshold for the predictions.
         """
         self.schema = schema
-        self.model = self._load_model()
+        self.model, self.vectorizer, self.label_encoder = self._load_model()
         self.conf_threshold = confidence
 
     def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
         """
@@ -61,7 +77,7 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
                 OUTPUT_SIZE_ENCODE,
                 DROPOUT_PROB,
             )
-        elif self.schema == "FAIRTRACKS":
+        if self.schema == "FAIRTRACKS":
             return (
                 INPUT_SIZE_BOW_FAIRTRACKS,
                 EMBEDDING_SIZE,
@@ -70,7 +86,7 @@
                 OUTPUT_SIZE_FAIRTRACKS,
                 DROPOUT_PROB,
             )
-        elif self.schema == "BEDBASE":
+        if self.schema == "BEDBASE":
            return (
                 INPUT_SIZE_BOW_BEDBASE,
                 EMBEDDING_SIZE,
@@ -79,18 +95,49 @@
                 OUTPUT_SIZE_BEDBASE,
                 DROPOUT_PROB,
             )
-        else:
-            raise ValueError(
-                f"Schema not available: {self.schema}. Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE"
-            )
+        raise ValueError(
+            f"Schema not available: {self.schema}. "
+            "Presently, three schemas are available: ENCODE, FAIRTRACKS, BEDBASE"
+        )
 
-    def _load_model(self) -> nn.Module:
+    def _load_model(self) -> Tuple[nn.Module, object, object]:
         """
-        Calls function to load the model from HuggingFace repository and sets to eval().
-
+        Loads the model from the HuggingFace repository,
+        loads the vectorizer and label encoder, and sets the model to eval().
         :return nn.Module: Loaded Neural Network Model.
+        :return object: The scikit-learn vectorizer for bag of words encoding.
+        :return object: Label encoder object for the labels (y).
         """
         try:
+            if self.schema == "ENCODE":
+                filename_vc = ENCODE_VECTORIZER_FILENAME
+                filename_lb = ENCODE_LABEL_ENCODER_FILENAME
+            elif self.schema == "FAIRTRACKS":
+                filename_vc = FAIRTRACKS_VECTORIZER_FILENAME
+                filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME
+            elif self.schema == "BEDBASE":
+                filename_vc = BEDBASE_VECTORIZER_FILENAME
+                filename_lb = BEDBASE_LABEL_ENCODER_FILENAME
+
+            vectorizer = None
+            label_encoder = None
+
+            vc_path = hf_hub_download(
+                repo_id=REPO_ID,
+                filename=filename_vc,
+            )
+
+            with open(vc_path, "rb") as f:
+                vectorizer = pickle.load(f)
+
+            lb_path = hf_hub_download(
+                repo_id=REPO_ID,
+                filename=filename_lb,
+            )
+
+            with open(lb_path, "rb") as f:
+                label_encoder = pickle.load(f)
+
             model = load_from_huggingface(self.schema)
             state_dict = torch.load(model)
@@ -113,7 +160,7 @@
             )
             model.load_state_dict(state_dict)
             model.eval()
-            return model
+            return model, vectorizer, label_encoder
 
         except Exception as e:
             logger.error(f"Error loading the model: {str(e)}")
@@ -123,7 +170,9 @@
     def standardize(
         self, pep: Union[str, peppy.Project]
     ) -> Dict[str, Dict[str, float]]:
         """
-        Fetches the user provided PEP from the PEPHub registry path, returns the predictions.
+        Fetches the user provided PEP
+        from the PEPHub registry path,
+        and returns the predictions.
 
         :param str pep: peppy.Project object or PEPHub registry path to PEP.
         :return Dict[str, Dict[str, float]]: Suggestions to the user.
@@ -139,20 +188,21 @@
         try:
             csv_file = fetch_from_pephub(pep)
 
-            X_values_st, X_headers_st, X_values_bow, num_rows = data_preprocessing(
+            x_values_st, x_headers_st, x_values_bow, num_rows = data_preprocessing(
                 csv_file
             )
             (
-                X_headers_embeddings_tensor,
-                X_values_embeddings_tensor,
-                X_values_bow_tensor,
+                x_headers_embeddings_tensor,
+                x_values_embeddings_tensor,
+                x_values_bow_tensor,
                 label_encoder,
             ) = data_encoding(
+                self.vectorizer,
+                self.label_encoder,
                 num_rows,
-                X_values_st,
-                X_headers_st,
-                X_values_bow,
-                self.schema,
+                x_values_st,
+                x_headers_st,
+                x_values_bow,
                 model_name=SENTENCE_TRANSFORMER_MODEL,
             )
@@ -160,9 +210,9 @@
             with torch.no_grad():
                 outputs = self.model(
-                    X_values_bow_tensor,
-                    X_values_embeddings_tensor,
-                    X_headers_embeddings_tensor,
+                    x_values_bow_tensor,
+                    x_values_embeddings_tensor,
+                    x_headers_embeddings_tensor,
                 )
                 probabilities = torch_functional.softmax(outputs, dim=1)
@@ -175,7 +225,7 @@
             ]
 
             suggestions = {}
-            for i, category in enumerate(X_headers_st):
+            for i, category in enumerate(x_headers_st):
                 category_suggestions = {}
                 if top_confidences[i][0] >= self.conf_threshold:
                     for j in range(3):
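For context, a minimal usage sketch of the public API after this change: `_load_model()` now returns a `(model, vectorizer, label_encoder)` tuple, but the entry point is unchanged. The registry path below is a hypothetical placeholder.

```python
from bedms import AttrStandardizer

# Instantiate for one of the supported schemas: ENCODE, FAIRTRACKS, or BEDBASE.
model = AttrStandardizer("ENCODE")

# "geo/gse228634:default" is a placeholder; any PEPHub registry path
# (or a peppy.Project object) should work here.
suggestions = model.standardize(pep="geo/gse228634:default")

# suggestions maps each input column header to up to three suggested
# standardized attributes whose confidence clears the threshold.
for header, candidates in suggestions.items():
    print(header, candidates)
```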
""" try: + if self.schema == "ENCODE": + filename_vc = ENCODE_VECTORIZER_FILENAME + filename_lb = ENCODE_LABEL_ENCODER_FILENAME + elif self.schema == "FAIRTRACKS": + filename_vc = FAIRTRACKS_VECTORIZER_FILENAME + filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME + elif self.schema == "BEDBASE": + filename_vc = BEDBASE_VECTORIZER_FILENAME + filename_lb = BEDBASE_LABEL_ENCODER_FILENAME + + vectorizer = None + label_encoder = None + + vc_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_vc, + ) + + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + + lb_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_lb, + ) + + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + model = load_from_huggingface(self.schema) state_dict = torch.load(model) @@ -113,7 +160,7 @@ def _load_model(self) -> nn.Module: ) model.load_state_dict(state_dict) model.eval() - return model + return model, vectorizer, label_encoder except Exception as e: logger.error(f"Error loading the model: {str(e)}") @@ -123,7 +170,9 @@ def standardize( self, pep: Union[str, peppy.Project] ) -> Dict[str, Dict[str, float]]: """ - Fetches the user provided PEP from the PEPHub registry path, returns the predictions. + Fetches the user provided PEP + from the PEPHub registry path, + returns the predictions. :param str pep: peppy.Project object or PEPHub registry path to PEP. :return Dict[str, Dict[str, float]]: Suggestions to the user. @@ -139,20 +188,21 @@ def standardize( try: csv_file = fetch_from_pephub(pep) - X_values_st, X_headers_st, X_values_bow, num_rows = data_preprocessing( + x_values_st, x_headers_st, x_values_bow, num_rows = data_preprocessing( csv_file ) ( - X_headers_embeddings_tensor, - X_values_embeddings_tensor, - X_values_bow_tensor, + x_headers_embeddings_tensor, + x_values_embeddings_tensor, + x_values_bow_tensor, label_encoder, ) = data_encoding( + self.vectorizer, + self.label_encoder, num_rows, - X_values_st, - X_headers_st, - X_values_bow, - self.schema, + x_values_st, + x_headers_st, + x_values_bow, model_name=SENTENCE_TRANSFORMER_MODEL, ) @@ -160,9 +210,9 @@ def standardize( with torch.no_grad(): outputs = self.model( - X_values_bow_tensor, - X_values_embeddings_tensor, - X_headers_embeddings_tensor, + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, ) probabilities = torch_functional.softmax(outputs, dim=1) @@ -175,7 +225,7 @@ def standardize( ] suggestions = {} - for i, category in enumerate(X_headers_st): + for i, category in enumerate(x_headers_st): category_suggestions = {} if top_confidences[i][0] >= self.conf_threshold: for j in range(3): diff --git a/bedms/const.py b/bedms/const.py index e325671..86916c6 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -1,7 +1,11 @@ +""" +This module contains constant values used in the 'bedms' package. +""" + PROJECT_NAME = "bedmess" AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"] - +PEP_FILE_TYPES = ["yaml", "csv"] REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" MODEL_FAIRTRACKS = "model_fairtracks.pth" diff --git a/bedms/model.py b/bedms/model.py index af212bc..52eed64 100644 --- a/bedms/model.py +++ b/bedms/model.py @@ -20,8 +20,10 @@ def __init__( Initializes the BoWSTModel. :param int input_size_values: Size of the input for the values (BoW). - :param int inout_size_values_embeddings: Size of the input for the values sentence transformer embeddings. 
diff --git a/bedms/utils.py b/bedms/utils.py
index 632c1d9..0dcb613 100644
--- a/bedms/utils.py
+++ b/bedms/utils.py
@@ -1,4 +1,7 @@
-import pickle
+"""
+This module has all the util functions for 'bedms'.
+"""
+
 import warnings
 from collections import Counter
 from typing import Any, List, Optional, Tuple, Union
@@ -11,21 +14,15 @@
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import KMeans
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.preprocessing import LabelEncoder
 
 from .const import (
-    BEDBASE_LABEL_ENCODER_FILENAME,
-    BEDBASE_VECTORIZER_FILENAME,
-    ENCODE_LABEL_ENCODER_FILENAME,
-    ENCODE_VECTORIZER_FILENAME,
-    FAIRTRACKS_LABEL_ENCODER_FILENAME,
-    FAIRTRACKS_VECTORIZER_FILENAME,
     MODEL_BEDBASE,
     MODEL_ENCODE,
     MODEL_FAIRTRACKS,
     NUM_CLUSTERS,
     REPO_ID,
+    PEP_FILE_TYPES,
 )
 
 # TODO : convert to single np array before converting to tensor
@@ -73,19 +70,21 @@
     :param pd.DataFrame df: The input DataFrame (user chosen PEP) to preprocess.
     :return Tuple[List[List[str]], List[str], List[List[str]]]:
-        - Nested list containing the comma separated values in each column for sentence transformer embeddings.
+        - Nested list containing the comma separated values
+          in each column for sentence transformer embeddings.
         - List containing the headers of the DataFrame.
-        - Nested list containing the comma separated values in each column for Bag of Words encoding.
+        - Nested list containing the comma separated values
+          in each column for Bag of Words encoding.
        - Number of rows in the metadata csv
     """
-    X_values_st = [df[column].astype(str).tolist() for column in df.columns]
-    X_headers_st = df.columns.tolist()
-    X_values_bow = [df[column].astype(str).tolist() for column in df.columns]
+    x_values_st = [df[column].astype(str).tolist() for column in df.columns]
+    x_headers_st = df.columns.tolist()
+    x_values_bow = [df[column].astype(str).tolist() for column in df.columns]
     num_rows = df.shape[0]
 
-    return X_values_st, X_headers_st, X_values_bow, num_rows
+    return x_values_st, x_headers_st, x_values_bow, num_rows
 
 
 def get_top_k_average(val_embedding: List[np.ndarray], k: int) -> np.ndarray:
     """
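A quick sketch of what `data_preprocessing` returns after the lowercase rename, using a made-up two-column table:

```python
import pandas as pd
from bedms.utils import data_preprocessing

# Made-up sample metadata; any DataFrame works here.
df = pd.DataFrame({
    "sample_name": ["s1", "s2"],
    "genome": ["hg38", "hg38"],
})

x_values_st, x_headers_st, x_values_bow, num_rows = data_preprocessing(df)
print(x_headers_st)  # ['sample_name', 'genome']
print(x_values_st)   # [['s1', 's2'], ['hg38', 'hg38']]
print(num_rows)      # 2
```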
@@ -151,31 +150,39 @@ def get_averaged(embeddings: List[np.ndarray]) -> np.ndarray:
 def data_encoding(
+    vectorizer: object,
+    label_encoder: object,
     num_rows: int,
-    X_values_st: List[List[str]],
-    X_headers_st: List[str],
-    X_values_bow: List[List[str]],
-    schema: str,
+    x_values_st: List[List[str]],
+    x_headers_st: List[str],
+    x_values_bow: List[List[str]],
     model_name: str,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Union[LabelEncoder, None]]:
     """
     Encode input data in accordance with the user-specified schemas.
 
+    :param object vectorizer: scikit-learn vectorizer for bag of words encoding.
+    :param object label_encoder: Label encoder object storing the labels (y).
     :param int num_rows: Number of rows in the sample metadata
-    :param list X_values_st: Nested list containing the comma separated values in each column for sentence transformer embeddings.
+    :param list x_values_st: Nested list containing the comma separated values
+        in each column for sentence transformer embeddings.
     :param list X_headers_st: List containing the headers of the DataFrame.
-    :param list X_values_bow: Nested list containing the comma separated values in each column for Bag of Words encoding.
+    :param list x_values_bow: Nested list containing the comma separated values
+        in each column for Bag of Words encoding.
     :param str schema: Schema type chosen by the user for standardization.
-    :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Union[LabelEncoder, None]]: Tuple containing torch tensors for encoded embeddings and Bag of Words representations, and label encoder object.
+    :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor,
+        Union[LabelEncoder, None]]: Tuple containing
+        torch tensors for encoded embeddings and Bag of Words representations,
+        and label encoder object.
     """
     # Sentence Transformer Model
     sentence_encoder = SentenceTransformer(model_name)
-    X_headers_embeddings = sentence_encoder.encode(
-        X_headers_st, show_progress_bar=False
+    x_headers_embeddings = sentence_encoder.encode(
+        x_headers_st, show_progress_bar=False
     )
     # generating embeddings for each element in sublist (column)
     embeddings = []
-    for column in X_values_st:
+    for column in x_values_st:
         val_embedding = sentence_encoder.encode(column, show_progress_bar=False)
         if num_rows >= 10:
             embedding = get_top_cluster_averaged(val_embedding)
@@ -183,92 +190,29 @@
             embedding = get_averaged(val_embedding)
         embeddings.append(embedding)
 
-    X_values_embeddings = embeddings
-    if schema == "ENCODE":
-        # Bag of Words Vectorizer
-        vectorizer = None
-        vc_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=ENCODE_VECTORIZER_FILENAME,
-        )
-        with open(vc_path, "rb") as f:
-            vectorizer = pickle.load(f)
-        transformed_columns = []
-        for column in X_values_bow:
-            column_text = " ".join(column)
-            transformed_column = vectorizer.transform([column_text])
-            transformed_columns.append(transformed_column.toarray()[0])
-        transformed_columns = np.array(transformed_columns)
-        # print(transformed_columns)
-        X_values_bow = transformed_columns
-        # Label Encoding
-        label_encoder =None
-        lb_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=ENCODE_LABEL_ENCODER_FILENAME,
-        )
-        with open(lb_path, "rb") as f:
-            label_encoder = pickle.load(f)
-
-    elif schema == "FAIRTRACKS":
-        vectorizer = None
-        vc_path = hf_hub_download(
-            repo_id=REPO_ID, filename=FAIRTRACKS_VECTORIZER_FILENAME
-        )
-        with open(vc_path, "rb") as f:
-            vectorizer = pickle.load(f)
-        transformed_columns = []
-        for column in X_values_bow:
-            column_text = " ".join(column)
-            transformed_column = vectorizer.transform([column_text])
-            transformed_columns.append(transformed_column.toarray()[0])
-        transformed_columns = np.array(transformed_columns)
-        # print(transformed_columns)
-        X_values_bow = transformed_columns
-        # Label Encoding
-        label_encoder = None
-        lb_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=FAIRTRACKS_LABEL_ENCODER_FILENAME,
-        )
-        with open(lb_path, "rb") as f:
-            label_encoder = pickle.load(f)
-
-    elif schema == "BEDBASE":
-        vectorizer = None
-        vc_path = hf_hub_download(repo_id=REPO_ID, filename=BEDBASE_VECTORIZER_FILENAME)
-        with open(vc_path, "rb") as f:
-            vectorizer = pickle.load(f)
-        transformed_columns = []
-        for column in X_values_bow:
-            column_text = " ".join(column)
-            transformed_column = vectorizer.transform([column_text])
-            transformed_columns.append(transformed_column.toarray()[0])
-        transformed_columns = np.array(transformed_columns)
-        # print(transformed_columns)
-        X_values_bow = transformed_columns
-        # Label Encoding
-        label_encoder = None
-        lb_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=BEDBASE_LABEL_ENCODER_FILENAME,
-        )
-        with open(lb_path, "rb") as f:
-            label_encoder = pickle.load(f)
-
-    X_headers_embeddings_tensor = torch.tensor(
-        X_headers_embeddings, dtype=torch.float32
+    x_values_embeddings = embeddings
+    transformed_columns = []
+    for column in x_values_bow:
+        column_text = " ".join(column)
+        transformed_column = vectorizer.transform([column_text])
+        transformed_columns.append(transformed_column.toarray()[0])
+    transformed_columns = np.array(transformed_columns)
+    # print(transformed_columns)
+    x_values_bow = transformed_columns
+
+    x_headers_embeddings_tensor = torch.tensor(
+        x_headers_embeddings, dtype=torch.float32
     )
-    X_values_embeddings_tensor = torch.tensor(X_values_embeddings, dtype=torch.float32)
-    X_values_bow_tensor = torch.tensor(X_values_bow, dtype=torch.float32)
-    X_values_embeddings_tensor = X_values_embeddings_tensor.squeeze(
+    x_values_embeddings_tensor = torch.tensor(x_values_embeddings, dtype=torch.float32)
+    x_values_bow_tensor = torch.tensor(x_values_bow, dtype=torch.float32)
+    x_values_embeddings_tensor = x_values_embeddings_tensor.squeeze(
         1
     )  # brings the shape to [num_cols, vocab]
     return (
-        X_headers_embeddings_tensor,
-        X_values_embeddings_tensor,
-        X_values_bow_tensor,
+        x_headers_embeddings_tensor,
+        x_values_embeddings_tensor,
+        x_values_bow_tensor,
         label_encoder,
     )
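The de-duplicated loop above now relies on the caller to pass a fitted vectorizer (loaded from the HuggingFace hub in `_load_model`). The sketch below fits a toy `CountVectorizer` locally just to illustrate the same transform pattern; in the package the vectorizer arrives pre-fitted and pickled.

```python
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# One sublist per column, as produced by data_preprocessing (made-up values).
x_values_bow = [["hg38", "hg38"], ["H3K27ac", "input"]]

# Toy stand-in for the pre-fitted vectorizer shipped on the hub.
vectorizer = CountVectorizer()
vectorizer.fit([" ".join(column) for column in x_values_bow])

transformed_columns = []
for column in x_values_bow:
    column_text = " ".join(column)  # the whole column becomes one document
    transformed_column = vectorizer.transform([column_text])
    transformed_columns.append(transformed_column.toarray()[0])
bow_matrix = np.array(transformed_columns)
print(bow_matrix.shape)  # -> (num_cols, vocab_size)
```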
@@ -281,11 +225,7 @@
     :return: peppy.Project object.
     """
-
-    PEP_FILE_TYPES = ["yaml", "csv"]
-
     res = list(filter(pep.endswith, PEP_FILE_TYPES)) != []
     if res:
         return peppy.Project(pep)
-    else:
-        return peppy.Project.from_pephub(pep)
+    return peppy.Project.from_pephub(pep)
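With `PEP_FILE_TYPES` promoted to a constant in `bedms/const.py`, `get_any_pep` keeps its two code paths: local `yaml`/`csv` config files load directly, and everything else is treated as a PEPHub registry path. Both arguments below are hypothetical placeholders.

```python
from bedms.utils import get_any_pep

# Hypothetical local config: ends with "yaml", so peppy.Project(pep) is used.
local_pep = get_any_pep("path/to/project_config.yaml")

# Hypothetical registry path: falls through to peppy.Project.from_pephub(pep).
hub_pep = get_any_pep("databio/example:default")
```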