From 2d4e29482f736c527eb9039a69e8ad2300cdc366 Mon Sep 17 00:00:00 2001 From: saanikat Date: Fri, 9 Aug 2024 15:58:12 -0400 Subject: [PATCH 01/27] Likely solved #8 and #10, added top 3 predictions, changed CONFIDENCE_THRESHOLD --- README.md | 11 +- attribute_standardizer/__init__.py | 4 +- .../attr_standardizer_class.py | 172 ++++++++++++++++++ .../attribute_standardizer.py | 4 +- attribute_standardizer/const.py | 10 +- attribute_standardizer/utils.py | 17 +- trial.py | 8 +- 7 files changed, 212 insertions(+), 14 deletions(-) create mode 100644 attribute_standardizer/attr_standardizer_class.py diff --git a/README.md b/README.md index 6335ebc..fb6c404 100644 --- a/README.md +++ b/README.md @@ -16,13 +16,18 @@ pip install . ## Usage -Using Python, this is how you can run `attribute_standardizer` : +Using Python, this is how you can run `attribute_standardizer` and print the results : ``` -from attribute_standardizer.attribute_standardizer import attr_standardizer +from attribute_standardizer.attr_standardizer_class import AttrStandardizer + +model = AttrStandardizer("ENCODE") + +results = model.standardize(pep ="geo/gse178283:default") + +print(results) -attr_standardizer(pep=/path/to/pep, schema="ENCODE") ``` You can use the format provided in the `trial.py` script in this repository as a reference. \ No newline at end of file diff --git a/attribute_standardizer/__init__.py b/attribute_standardizer/__init__.py index e5081d0..6e82403 100644 --- a/attribute_standardizer/__init__.py +++ b/attribute_standardizer/__init__.py @@ -1 +1,3 @@ -from .attribute_standardizer import attr_standardizer +# from .attribute_standardizer import attr_standardizer + +from .attr_standardizer_class import AttrStandardizer diff --git a/attribute_standardizer/attr_standardizer_class.py b/attribute_standardizer/attr_standardizer_class.py new file mode 100644 index 0000000..252fee7 --- /dev/null +++ b/attribute_standardizer/attr_standardizer_class.py @@ -0,0 +1,172 @@ +import pandas as pd +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import logging +from .const import ( + HIDDEN_SIZE, + DROPOUT_PROB, + CONFIDENCE_THRESHOLD, + EMBEDDING_SIZE, + SENTENCE_TRANSFORMER_MODEL, + INPUT_SIZE_BOW_FAIRTRACKS, + INPUT_SIZE_BOW_ENCODE, + OUTPUT_SIZE_ENCODE, + OUTPUT_SIZE_FAIRTRACKS, +) + +from .utils import ( + fetch_from_pephub, + load_from_huggingface, + data_preprocessing, + data_encoding, +) +from .model import BoWSTModel +from huggingface_hub import hf_hub_download +from typing import Dict, List, Tuple, Any, Union + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class AttrStandardizer: + def __init__(self, schema: str) -> None: + """ + Initializes the attribute standardizer with user provided schema, loads the model. + + :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + """ + self.schema = schema + self.model = self._load_model() + + def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: + """ + Gets the model parameters as per the chosen schema. + + :return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters. 
+ """ + if self.schema == "ENCODE": + return ( + INPUT_SIZE_BOW_ENCODE, + EMBEDDING_SIZE, + EMBEDDING_SIZE, + HIDDEN_SIZE, + OUTPUT_SIZE_ENCODE, + DROPOUT_PROB, + ) + elif self.schema == "FAIRTRACKS": + return ( + INPUT_SIZE_BOW_FAIRTRACKS, + EMBEDDING_SIZE, + EMBEDDING_SIZE, + HIDDEN_SIZE, + OUTPUT_SIZE_FAIRTRACKS, + DROPOUT_PROB, + ) + else: + raise ValueError(f"Schema not available: {self.schema}") + + def _load_model(self) -> nn.Module: + """ + Calls function to load the model from HuggingFace repository and sets to eval(). + + :return nn.Module: Loaded Neural Network Model. + """ + try: + model = load_from_huggingface(self.schema) + state_dict = torch.load(model) + + ( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) = self._get_parameters() + + model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) + model.load_state_dict(state_dict) + model.eval() + return model + + except Exception as e: + logger.error(f"Error loading the model: {str(e)}") + raise + + def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: + """ + Fetches the user provided PEP from the PEPHub registry path, returns the predictions. + + :param str pep: User provided path to the PEP. + :return Dict[str, Dict[str, float]]: Suggestions to the user. + """ + if not pep: + raise ValueError( + "PEP path is missing or empty. Please provide the PEPHub registry path to PEP" + ) + try: + csv_file = fetch_from_pephub(pep) + schema = self.schema + X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file) + ( + X_headers_embeddings_tensor, + X_values_embeddings_tensor, + X_values_bow_tensor, + label_encoder, + ) = data_encoding( + X_values_st, + X_headers_st, + X_values_bow, + schema, + model_name=SENTENCE_TRANSFORMER_MODEL, + ) + logger.info("Data Preprocessing completed.") + + with torch.no_grad(): + outputs = self.model( + X_values_bow_tensor, + X_values_embeddings_tensor, + X_headers_embeddings_tensor, + ) + probabilities = F.softmax(outputs, dim=1) + # confidence, predicted = torch.max(probabilities, 1) + + values, indices = torch.topk(probabilities, k=3, dim=1) + top_preds = indices.tolist() + top_confidences = values.tolist() + + decoded_predictions = [ + label_encoder.inverse_transform(indices) for indices in top_preds + ] + + suggestions = {} + for i, category in enumerate(X_headers_st): + category_suggestions = {} + if top_confidences[i][0] >= CONFIDENCE_THRESHOLD: + for j in range(3): + prediction = decoded_predictions[i][j] + probability = top_confidences[i][j] + if probability >= CONFIDENCE_THRESHOLD: + category_suggestions[prediction] = probability + else: + break + else: + category_suggestions["Not Predictable"] = 0.0 + + suggestions[category] = category_suggestions + + return suggestions + + except Exception as e: + logger.error( + f"Error occured during standardization in standardize function: {str(e)}" + ) diff --git a/attribute_standardizer/attribute_standardizer.py b/attribute_standardizer/attribute_standardizer.py index fcc82d4..baf306a 100644 --- a/attribute_standardizer/attribute_standardizer.py +++ b/attribute_standardizer/attribute_standardizer.py @@ -1,3 +1,5 @@ +# This script is not used anymore + import pandas as pd import numpy as np import torch @@ -145,4 +147,4 @@ def attr_standardizer(pep: str, schema: str) -> None: csv_file = fetch_from_pephub(pep) suggestions = standardize_attr_names(csv_file, schema) - 
logger.info(suggestions) + return suggestions diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py index 7ed657a..f74988b 100644 --- a/attribute_standardizer/const.py +++ b/attribute_standardizer/const.py @@ -1,6 +1,6 @@ REPO_ID = "databio/attribute-standardizer-model6" -FILENAME_ENCODE = "model_encode.pth" -FILENAME_FAIRTRACKS = "model_fairtracks.pth" +MODEL_ENCODE = "model_encode.pth" +MODEL_FAIRTRACKS = "model_fairtracks.pth" ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" @@ -8,5 +8,9 @@ SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" HIDDEN_SIZE = 256 DROPOUT_PROB = 0.203 -CONFIDENCE_THRESHOLD = 0.9 +CONFIDENCE_THRESHOLD = 0.51 EMBEDDING_SIZE = 384 +INPUT_SIZE_BOW_ENCODE = 24014 +INPUT_SIZE_BOW_FAIRTRACKS = 13617 +OUTPUT_SIZE_FAIRTRACKS = 15 +OUTPUT_SIZE_ENCODE = 18 diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index c16f025..152e0c5 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -11,14 +11,23 @@ from typing import Optional, Any, List, Tuple, Union from .const import ( REPO_ID, - FILENAME_ENCODE, - FILENAME_FAIRTRACKS, + MODEL_ENCODE, + MODEL_FAIRTRACKS, ENCODE_LABEL_ENCODER_FILENAME, FAIRTRACKS_LABEL_ENCODER_FILENAME, ENCODE_VECTORIZER_FILENAME, FAIRTRACKS_VECTORIZER_FILENAME, SENTENCE_TRANSFORMER_MODEL, ) +import warnings + + +# TODO : convert to single np array before converting to tensor +warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Creating a tensor from a list of numpy.ndarrays is extremely slow.", +) def fetch_from_pephub(pep: str) -> pd.DataFrame: @@ -43,9 +52,9 @@ def load_from_huggingface(schema: str) -> Optional[Any]: :return Optional[Any]: Loaded model object """ if schema == "ENCODE": - model = hf_hub_download(repo_id=REPO_ID, filename=FILENAME_ENCODE) + model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE) elif schema == "FAIRTRACKS": - model = hf_hub_download(repo_id=REPO_ID, filename=FILENAME_FAIRTRACKS) + model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS) return model diff --git a/trial.py b/trial.py index 160ae30..cfa1a8b 100644 --- a/trial.py +++ b/trial.py @@ -1,3 +1,7 @@ -from attribute_standardizer.attribute_standardizer import attr_standardizer +from attribute_standardizer.attr_standardizer_class import AttrStandardizer -attr_standardizer(pep="geo/gse178283:default", schema="ENCODE") +model = AttrStandardizer("ENCODE") + +results = model.standardize(pep ="geo/gse178283:default") + +print(results) From 8697c89b1000f5df88f7dae80c233139ff69ca54 Mon Sep 17 00:00:00 2001 From: saanikat Date: Thu, 15 Aug 2024 23:01:49 -0400 Subject: [PATCH 02/27] File type, hyperparameters changed --- attribute_standardizer/attr_standardizer_class.py | 2 +- attribute_standardizer/const.py | 4 ++-- attribute_standardizer/model.py | 12 ------------ 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/attribute_standardizer/attr_standardizer_class.py b/attribute_standardizer/attr_standardizer_class.py index 252fee7..aa97f88 100644 --- a/attribute_standardizer/attr_standardizer_class.py +++ b/attribute_standardizer/attr_standardizer_class.py @@ -65,7 +65,7 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: DROPOUT_PROB, ) else: - raise ValueError(f"Schema not available: {self.schema}") + raise ValueError(f"Schema not available: {self.schema}. 
Presently, two schemas are available: ENCODE , FAIRTRACKS") def _load_model(self) -> nn.Module: """ diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py index f74988b..e9743ca 100644 --- a/attribute_standardizer/const.py +++ b/attribute_standardizer/const.py @@ -7,10 +7,10 @@ FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" HIDDEN_SIZE = 256 -DROPOUT_PROB = 0.203 +DROPOUT_PROB = 0.241 CONFIDENCE_THRESHOLD = 0.51 EMBEDDING_SIZE = 384 -INPUT_SIZE_BOW_ENCODE = 24014 +INPUT_SIZE_BOW_ENCODE = 24030 INPUT_SIZE_BOW_FAIRTRACKS = 13617 OUTPUT_SIZE_FAIRTRACKS = 15 OUTPUT_SIZE_ENCODE = 18 diff --git a/attribute_standardizer/model.py b/attribute_standardizer/model.py index 23b9109..af212bc 100644 --- a/attribute_standardizer/model.py +++ b/attribute_standardizer/model.py @@ -29,18 +29,12 @@ def __init__( super(BoWSTModel, self).__init__() self.fc_values1 = nn.Linear(input_size_values, hidden_size) self.dropout_values1 = nn.Dropout(dropout_prob) - self.fc_values2 = nn.Linear(hidden_size, hidden_size) - self.dropout_values2 = nn.Dropout(dropout_prob) self.fc_values_embeddings1 = nn.Linear( input_size_values_embeddings, hidden_size ) self.dropout_values_embeddings1 = nn.Dropout(dropout_prob) - self.fc_values_embeddings2 = nn.Linear(hidden_size, hidden_size) - self.dropout_values_embeddings2 = nn.Dropout(dropout_prob) self.fc_headers1 = nn.Linear(input_size_headers, hidden_size) self.dropout_headers1 = nn.Dropout(dropout_prob) - self.fc_headers2 = nn.Linear(hidden_size, hidden_size) - self.dropout_headers2 = nn.Dropout(dropout_prob) self.fc_combined1 = nn.Linear(hidden_size * 3, hidden_size) self.dropout_combined1 = nn.Dropout(dropout_prob) self.fc_combined2 = nn.Linear(hidden_size, output_size) @@ -61,16 +55,10 @@ def forward( """ x_values = F.relu(self.fc_values1(x_values)) x_values = self.dropout_values1(x_values) - x_values = F.relu(self.fc_values2(x_values)) - x_values = self.dropout_values2(x_values) x_values_embeddings = F.relu(self.fc_values_embeddings1(x_values_embeddings)) x_values_embeddings = self.dropout_values_embeddings1(x_values_embeddings) - x_values_embeddings = F.relu(self.fc_values_embeddings2(x_values_embeddings)) - x_values_embeddings = self.dropout_values_embeddings2(x_values_embeddings) x_headers = F.relu(self.fc_headers1(x_headers)) x_headers = self.dropout_headers1(x_headers) - x_headers = F.relu(self.fc_headers2(x_headers)) - x_headers = self.dropout_headers2(x_headers) x_combined = torch.cat((x_values, x_values_embeddings, x_headers), dim=1) x_combined = F.relu(self.fc_combined1(x_combined)) From f9ae04d7f370a06644e44af40e14d1b4a55d1c10 Mon Sep 17 00:00:00 2001 From: saanikat <114712639+saanikat@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:04:54 -0400 Subject: [PATCH 03/27] Delete attribute_standardizer/attribute_standardizer.py --- .../attribute_standardizer.py | 150 ------------------ 1 file changed, 150 deletions(-) delete mode 100644 attribute_standardizer/attribute_standardizer.py diff --git a/attribute_standardizer/attribute_standardizer.py b/attribute_standardizer/attribute_standardizer.py deleted file mode 100644 index baf306a..0000000 --- a/attribute_standardizer/attribute_standardizer.py +++ /dev/null @@ -1,150 +0,0 @@ -# This script is not used anymore - -import pandas as pd -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import logging -from .const import ( - HIDDEN_SIZE, - DROPOUT_PROB, - CONFIDENCE_THRESHOLD, - 
EMBEDDING_SIZE, - SENTENCE_TRANSFORMER_MODEL, -) - -from .utils import ( - fetch_from_pephub, - load_from_huggingface, - data_preprocessing, - data_encoding, -) -from .model import BoWSTModel -from huggingface_hub import hf_hub_download -from typing import Dict, List, Tuple, Any, Union - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def standardize_attr_names(csv_file: str, schema: str) -> Dict[str, Dict[str, float]]: - """ - Standardize attribute names. - - :param str csv_file: Path to the CSV file containing metadata to be standardized. - :param str schema: Schema type. - :return Dict[str, Dict[str, float]]: Suggestions for standardized attribute names. - """ - try: - X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file) - ( - X_headers_embeddings_tensor, - X_values_embeddings_tensor, - X_values_bow_tensor, - label_encoder, - ) = data_encoding( - X_values_st, - X_headers_st, - X_values_bow, - schema, - model_name=SENTENCE_TRANSFORMER_MODEL, - ) - logger.info("Data Preprocessing completed.") - - model = load_from_huggingface(schema) - # print(model) - state_dict = torch.load(model) - - """Padding the input tensors.""" - - padded_data_values_tensor = torch.zeros( - X_values_bow_tensor.shape[0], state_dict["fc_values1.weight"].shape[1] - ) - padded_data_headers_tensor = torch.zeros( - X_headers_embeddings_tensor.shape[0], - state_dict["fc_headers1.weight"].shape[1], - ) - padded_data_values_embeddings_tensor = torch.zeros( - X_values_embeddings_tensor.shape[0], - state_dict["fc_values_embeddings1.weight"].shape[1], - ) - - padded_data_values_tensor[:, : X_values_bow_tensor.shape[1]] = ( - X_values_bow_tensor - ) - padded_data_headers_tensor[:, : X_headers_embeddings_tensor.shape[1]] = ( - X_headers_embeddings_tensor - ) - padded_data_values_embeddings_tensor[ - :, : X_values_embeddings_tensor.shape[1] - ] = X_values_embeddings_tensor - - input_size_values = padded_data_values_tensor.shape[1] - input_size_headers = EMBEDDING_SIZE - input_size_values_embeddings = EMBEDDING_SIZE - hidden_size = HIDDEN_SIZE - output_size = len(label_encoder.classes_) - dropout_prob = DROPOUT_PROB - model = BoWSTModel( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) - - model.load_state_dict(state_dict) - - model.eval() - - all_preds = [] - all_confidences = [] - with torch.no_grad(): - outputs = model( - padded_data_values_tensor, - padded_data_values_embeddings_tensor, - padded_data_headers_tensor, - ) - probabilities = F.softmax(outputs, dim=1) - confidence, predicted = torch.max(probabilities, 1) - all_preds.extend(predicted.tolist()) - all_confidences.extend(confidence.tolist()) - - decoded_predictions = label_encoder.inverse_transform(all_preds) - - suggestions = {} - for i, category in enumerate(X_headers_st): - if all_confidences[i] >= CONFIDENCE_THRESHOLD: - prediction = decoded_predictions[i] - probability = all_confidences[i] - else: - prediction = "Not Predictable" - probability = 0.0 - suggestions[category] = {prediction: probability} - - return suggestions - except Exception as e: - logger.error(f"Error occured in standardize_attr_names: {str(e)}") - return {} - - -def attr_standardizer(pep: str, schema: str) -> None: - """ - :param str pep: Path to the PEPhub registry containing the metadata csv file. - :param str schema: Schema Type chosen by the user. - """ - if not pep: - raise ValueError( - "pep argument is missing or empty. 
Please provide the PEPHub registry path to PEP" - ) - if not schema: - raise ValueError( - "schema argument is missing or empty. Please mention the schema of choice: ENCODE or FAIRTRACKS." - ) - csv_file = fetch_from_pephub(pep) - suggestions = standardize_attr_names(csv_file, schema) - - return suggestions From a43afb9908aa903cebf839121c503feecc412237 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 28 Aug 2024 11:50:41 -0400 Subject: [PATCH 04/27] retrained models, added bedbase --- .../attr_standardizer_class.py | 18 ++++- attribute_standardizer/const.py | 14 ++-- attribute_standardizer/utils.py | 69 ++++++++++++++++++- 3 files changed, 95 insertions(+), 6 deletions(-) diff --git a/attribute_standardizer/attr_standardizer_class.py b/attribute_standardizer/attr_standardizer_class.py index aa97f88..db36fc2 100644 --- a/attribute_standardizer/attr_standardizer_class.py +++ b/attribute_standardizer/attr_standardizer_class.py @@ -1,3 +1,6 @@ +# TODO take the pep object as input, add a function for that and then add the present fetch_from_pep as the wrapper +# TODO use the peppy constructor to take the Peppy.Project object - prj = peppy.Project(pep) + import pandas as pd import numpy as np import torch @@ -14,6 +17,8 @@ INPUT_SIZE_BOW_ENCODE, OUTPUT_SIZE_ENCODE, OUTPUT_SIZE_FAIRTRACKS, + INPUT_SIZE_BOW_BEDBASE, + OUTPUT_SIZE_BEDBASE, ) from .utils import ( @@ -64,8 +69,19 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_FAIRTRACKS, DROPOUT_PROB, ) + elif self.schema == "BEDBASE": + return ( + INPUT_SIZE_BOW_BEDBASE, + EMBEDDING_SIZE, + EMBEDDING_SIZE, + HIDDEN_SIZE, + OUTPUT_SIZE_BEDBASE, + DROPOUT_PROB, + ) else: - raise ValueError(f"Schema not available: {self.schema}. Presently, two schemas are available: ENCODE , FAIRTRACKS") + raise ValueError( + f"Schema not available: {self.schema}. 
Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" + ) def _load_model(self) -> nn.Module: """ diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py index e9743ca..460abfa 100644 --- a/attribute_standardizer/const.py +++ b/attribute_standardizer/const.py @@ -1,16 +1,22 @@ REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" MODEL_FAIRTRACKS = "model_fairtracks.pth" +MODEL_BEDBASE = "model_bedbase.pth" ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" +BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl" ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" +BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" -HIDDEN_SIZE = 256 -DROPOUT_PROB = 0.241 -CONFIDENCE_THRESHOLD = 0.51 +HIDDEN_SIZE = 32 +DROPOUT_PROB = 0.113 +CONFIDENCE_THRESHOLD = 0.70 EMBEDDING_SIZE = 384 -INPUT_SIZE_BOW_ENCODE = 24030 +INPUT_SIZE_BOW_ENCODE = 10459 INPUT_SIZE_BOW_FAIRTRACKS = 13617 OUTPUT_SIZE_FAIRTRACKS = 15 OUTPUT_SIZE_ENCODE = 18 +NUM_CLUSTERS = 3 +INPUT_SIZE_BOW_BEDBASE = 13708 +OUTPUT_SIZE_BEDBASE = 12 diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index 152e0c5..8922941 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -6,18 +6,24 @@ import pickle from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cluster import KMeans from collections import Counter from huggingface_hub import hf_hub_download +from sklearn.metrics import silhouette_score from typing import Optional, Any, List, Tuple, Union from .const import ( REPO_ID, MODEL_ENCODE, MODEL_FAIRTRACKS, + MODEL_BEDBASE, ENCODE_LABEL_ENCODER_FILENAME, FAIRTRACKS_LABEL_ENCODER_FILENAME, ENCODE_VECTORIZER_FILENAME, FAIRTRACKS_VECTORIZER_FILENAME, + BEDBASE_VECTORIZER_FILENAME, + BEDBASE_LABEL_ENCODER_FILENAME, SENTENCE_TRANSFORMER_MODEL, + NUM_CLUSTERS, ) import warnings @@ -30,6 +36,11 @@ ) +def fetch_pep(pep): + # input of python object of peppy.Project and output of csv_fle_df + raise NotImplementedError + + def fetch_from_pephub(pep: str) -> pd.DataFrame: """ Fetches metadata from PEPhub registry. @@ -55,6 +66,8 @@ def load_from_huggingface(schema: str) -> Optional[Any]: model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE) elif schema == "FAIRTRACKS": model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS) + elif schema == "BEDBASE": + model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE) return model @@ -98,6 +111,35 @@ def get_top_k_average(val_embedding: List[np.ndarray], k: int) -> np.ndarray: return column_embedding_mean.numpy() +def get_top_cluster_averaged(embeddings: List[np.ndarray]) -> np.ndarray: + """ + Calculates the average of the largest embedding cluster. + + :param list embeddings: List of embeddings, each embedding is a vector of values. + :return np.ndarray: The mean of the largest cluster as a NumPy array. 
+ """ + flattened_embeddings = [embedding.tolist() for embedding in embeddings] + kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(flattened_embeddings) + labels_kmeans = kmeans.labels_ + cluster_counts = Counter(labels_kmeans) + most_common_cluster = max(cluster_counts, key=cluster_counts.get) + most_common_indices = [ + idx for idx, label in enumerate(labels_kmeans) if label == most_common_cluster + ] + most_common_embeddings = [ + torch.tensor(embeddings[idx]) for idx in most_common_indices + ] + + if most_common_embeddings: + top_k_average = torch.mean( + torch.stack(most_common_embeddings), dim=0 + ).unsqueeze(0) + else: + top_k_average = torch.zeros_like(most_common_embeddings[0]).unsqueeze(0) + + return top_k_average.numpy() + + def data_encoding( X_values_st: List[List[str]], X_headers_st: List[str], @@ -123,7 +165,7 @@ def data_encoding( embeddings = [] for column in X_values_st: val_embedding = sentence_encoder.encode(column, show_progress_bar=False) - embedding = get_top_k_average(val_embedding, k=3) + embedding = get_top_cluster_averaged(val_embedding) embeddings.append(embedding) X_values_embeddings = embeddings if schema == "ENCODE": @@ -176,11 +218,36 @@ def data_encoding( with open(lb_path, "rb") as f: label_encoder = pickle.load(f) + elif schema == "BEDBASE": + vectorizer = CountVectorizer() + vc_path = hf_hub_download(repo_id=REPO_ID, filename=BEDBASE_VECTORIZER_FILENAME) + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + transformed_columns = [] + for column in X_values_bow: + column_text = " ".join(column) + transformed_column = vectorizer.transform([column_text]) + transformed_columns.append(transformed_column.toarray()[0]) + transformed_columns = np.array(transformed_columns) + # print(transformed_columns) + X_values_bow = transformed_columns + # Label Encoding + label_encoder = LabelEncoder() + lb_path = hf_hub_download( + repo_id=REPO_ID, + filename=BEDBASE_LABEL_ENCODER_FILENAME, + ) + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + X_headers_embeddings_tensor = torch.tensor( X_headers_embeddings, dtype=torch.float32 ) X_values_embeddings_tensor = torch.tensor(X_values_embeddings, dtype=torch.float32) X_values_bow_tensor = torch.tensor(X_values_bow, dtype=torch.float32) + X_values_embeddings_tensor = X_values_embeddings_tensor.squeeze( + 1 + ) # brings the shape to [num_cols, vocab] return ( X_headers_embeddings_tensor, From 265f7d2348379c22458c3adb72b143b93a7777e4 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 28 Aug 2024 11:54:18 -0400 Subject: [PATCH 05/27] updated readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb6c404..062f222 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # bedmess -bedmess is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS). +bedmess is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE). To install `attribute-standardizer` , you need to clone this repository first. 
Follow the steps given below to install: From 5a721157477d50ae631bb3545fe95c2c9147a7e1 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 28 Aug 2024 16:26:54 -0400 Subject: [PATCH 06/27] fixed #16 --- README.md | 3 +- attribute_standardizer/__init__.py | 4 +- ...dardizer_class.py => attr_standardizer.py} | 55 ++++++++++--------- attribute_standardizer/utils.py | 24 +++++++- requirements/requirements-all.txt | 4 +- scripts/model1.py | 13 +++-- trial.py | 4 +- 7 files changed, 65 insertions(+), 42 deletions(-) rename attribute_standardizer/{attr_standardizer_class.py => attr_standardizer.py} (84%) diff --git a/README.md b/README.md index 062f222..33800c5 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,10 @@ Using Python, this is how you can run `attribute_standardizer` and print the res ``` -from attribute_standardizer.attr_standardizer_class import AttrStandardizer +from attribute_standardizer import AttrStandardizer model = AttrStandardizer("ENCODE") +model = AttrStandardizer("FAIRTRACKS") results = model.standardize(pep ="geo/gse178283:default") diff --git a/attribute_standardizer/__init__.py b/attribute_standardizer/__init__.py index 6e82403..374c0be 100644 --- a/attribute_standardizer/__init__.py +++ b/attribute_standardizer/__init__.py @@ -1,3 +1 @@ -# from .attribute_standardizer import attr_standardizer - -from .attr_standardizer_class import AttrStandardizer +from .attr_standardizer import AttrStandardizer diff --git a/attribute_standardizer/attr_standardizer_class.py b/attribute_standardizer/attr_standardizer.py similarity index 84% rename from attribute_standardizer/attr_standardizer_class.py rename to attribute_standardizer/attr_standardizer.py index db36fc2..7f76024 100644 --- a/attribute_standardizer/attr_standardizer_class.py +++ b/attribute_standardizer/attr_standardizer.py @@ -1,12 +1,19 @@ -# TODO take the pep object as input, add a function for that and then add the present fetch_from_pep as the wrapper -# TODO use the peppy constructor to take the Peppy.Project object - prj = peppy.Project(pep) - -import pandas as pd -import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F +import torch.nn.functional as torch_functional import logging +import peppy + +from typing import Dict, Tuple, Union + +from .model import BoWSTModel +from .utils import ( + fetch_from_pephub, + load_from_huggingface, + data_preprocessing, + data_encoding, + get_any_pep, +) from .const import ( HIDDEN_SIZE, DROPOUT_PROB, @@ -21,33 +28,25 @@ OUTPUT_SIZE_BEDBASE, ) -from .utils import ( - fetch_from_pephub, - load_from_huggingface, - data_preprocessing, - data_encoding, -) -from .model import BoWSTModel -from huggingface_hub import hf_hub_download -from typing import Dict, List, Tuple, Any, Union - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class AttrStandardizer: - def __init__(self, schema: str) -> None: + def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param int confidence: Confidence threshold for the predictions. """ self.schema = schema self.model = self._load_model() + self.conf_threshold = confidence def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ - Gets the model parameters as per the chosen schema. + Get the model parameters as per the chosen schema. 
:return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters. """ @@ -118,16 +117,22 @@ def _load_model(self) -> nn.Module: logger.error(f"Error loading the model: {str(e)}") raise - def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: + def standardize( + self, pep: Union[str, peppy.Project] + ) -> Dict[str, Dict[str, float]]: """ Fetches the user provided PEP from the PEPHub registry path, returns the predictions. - :param str pep: User provided path to the PEP. + :param str pep: peppy.Project object or PEPHub registry path to PEP. :return Dict[str, Dict[str, float]]: Suggestions to the user. """ - if not pep: + if isinstance(pep, str): + pep = get_any_pep(pep) + elif isinstance(pep, peppy.Project): + pass + else: raise ValueError( - "PEP path is missing or empty. Please provide the PEPHub registry path to PEP" + f"PEP should be either a path to PEPHub registry or peppy.Project object." ) try: csv_file = fetch_from_pephub(pep) @@ -153,7 +158,7 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: X_values_embeddings_tensor, X_headers_embeddings_tensor, ) - probabilities = F.softmax(outputs, dim=1) + probabilities = torch_functional.softmax(outputs, dim=1) # confidence, predicted = torch.max(probabilities, 1) values, indices = torch.topk(probabilities, k=3, dim=1) @@ -167,11 +172,11 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: suggestions = {} for i, category in enumerate(X_headers_st): category_suggestions = {} - if top_confidences[i][0] >= CONFIDENCE_THRESHOLD: + if top_confidences[i][0] >= self.conf_threshold: for j in range(3): prediction = decoded_predictions[i][j] probability = top_confidences[i][j] - if probability >= CONFIDENCE_THRESHOLD: + if probability >= self.conf_threshold: category_suggestions[prediction] = probability else: break diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index 8922941..ef45bb0 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -9,8 +9,10 @@ from sklearn.cluster import KMeans from collections import Counter from huggingface_hub import hf_hub_download -from sklearn.metrics import silhouette_score from typing import Optional, Any, List, Tuple, Union +import warnings +import peppy + from .const import ( REPO_ID, MODEL_ENCODE, @@ -22,10 +24,8 @@ FAIRTRACKS_VECTORIZER_FILENAME, BEDBASE_VECTORIZER_FILENAME, BEDBASE_LABEL_ENCODER_FILENAME, - SENTENCE_TRANSFORMER_MODEL, NUM_CLUSTERS, ) -import warnings # TODO : convert to single np array before converting to tensor @@ -255,3 +255,21 @@ def data_encoding( X_values_bow_tensor, label_encoder, ) + + +def get_any_pep(pep: str) -> peppy.Project: + """ + Get the PEP file from the local system or from PEPhub. + + :param pep: Path to the PEP file or PEPhub registry path. + + :return: peppy.Project object. 
+ """ + + PEP_FILE_TYPES = ["yaml", "csv"] + + res = list(filter(pep.endswith, PEP_FILE_TYPES)) != [] + if res: + return peppy.Project(pep) + else: + return peppy.Project.from_pephub(pep) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 6642681..848e7e8 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -2,5 +2,5 @@ pandas numpy torch sentence-transformers -pephubclient - +pephubclient>=0.4.2 +peppy>=0.40.5 diff --git a/scripts/model1.py b/scripts/model1.py index bef41fb..0118add 100644 --- a/scripts/model1.py +++ b/scripts/model1.py @@ -29,7 +29,8 @@ class NN1(nn.Module): - """ Simple Neural Network with a single Hidden Layer.""" + """Simple Neural Network with a single Hidden Layer.""" + def __init__(self, input_size, hidden_size, output_size): """ Initializes the NN1 model. @@ -45,7 +46,7 @@ def __init__(self, input_size, hidden_size, output_size): def forward(self, x): """ - Defines the forward pass of the neural network. + Defines the forward pass of the neural network. :param torch.Tensor x: Input tensor. :return torch.Tensor: Output tensor after passing through the network. @@ -86,14 +87,14 @@ def data_split(df_values): df_values_temp, test_size=0.5, random_state=42 ) - #Snippet for testing on unseen data + # Snippet for testing on unseen data """ df_values_test = pd.read_csv( "/home/saanika/curation/scripts/bedmess_archive/data/encode_metadata_values_moderate.csv", sep=",", ) """ - #Comment out the above for training on seen data. + # Comment out the above for training on seen data. X_values_train = [ df_values_train[column].astype(str).tolist() @@ -135,9 +136,9 @@ def data_split(df_values): def encoding(X_values_train, X_values_test, X_values_val, y_train, y_test, y_val): """ - Encodes the values for the model. + Encodes the values for the model. - :param list X_values_train: Training features. + :param list X_values_train: Training features. :param list X_values_test: Testing features. :param list X_values_val: Validation features. :param list y_train: Training labels. 
diff --git a/trial.py b/trial.py index cfa1a8b..88a257e 100644 --- a/trial.py +++ b/trial.py @@ -1,7 +1,7 @@ -from attribute_standardizer.attr_standardizer_class import AttrStandardizer +from attribute_standardizer.attr_standardizer import AttrStandardizer model = AttrStandardizer("ENCODE") -results = model.standardize(pep ="geo/gse178283:default") +results = model.standardize(pep="geo/gse178283:default") print(results) From 82940db3f893ceeaf39bd7385e243a0680825821 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 28 Aug 2024 16:38:52 -0400 Subject: [PATCH 07/27] cleaning --- attribute_standardizer/attr_standardizer.py | 47 +++++++++++---------- attribute_standardizer/const.py | 2 + attribute_standardizer/utils.py | 44 +++++++++---------- 3 files changed, 45 insertions(+), 48 deletions(-) diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py index 7f76024..f29adc6 100644 --- a/attribute_standardizer/attr_standardizer.py +++ b/attribute_standardizer/attr_standardizer.py @@ -1,35 +1,36 @@ +import logging +from typing import Dict, Tuple, Union + +import peppy import torch import torch.nn as nn import torch.nn.functional as torch_functional -import logging -import peppy - -from typing import Dict, Tuple, Union -from .model import BoWSTModel -from .utils import ( - fetch_from_pephub, - load_from_huggingface, - data_preprocessing, - data_encoding, - get_any_pep, -) from .const import ( - HIDDEN_SIZE, - DROPOUT_PROB, CONFIDENCE_THRESHOLD, + DROPOUT_PROB, EMBEDDING_SIZE, - SENTENCE_TRANSFORMER_MODEL, - INPUT_SIZE_BOW_FAIRTRACKS, + HIDDEN_SIZE, + INPUT_SIZE_BOW_BEDBASE, INPUT_SIZE_BOW_ENCODE, + INPUT_SIZE_BOW_FAIRTRACKS, + OUTPUT_SIZE_BEDBASE, OUTPUT_SIZE_ENCODE, OUTPUT_SIZE_FAIRTRACKS, - INPUT_SIZE_BOW_BEDBASE, - OUTPUT_SIZE_BEDBASE, + SENTENCE_TRANSFORMER_MODEL, + PROJECT_NAME, +) +from .model import BoWSTModel +from .utils import ( + data_encoding, + data_preprocessing, + fetch_from_pephub, + get_any_pep, + load_from_huggingface, ) logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +logger = logging.getLogger(PROJECT_NAME) class AttrStandardizer: @@ -132,11 +133,11 @@ def standardize( pass else: raise ValueError( - f"PEP should be either a path to PEPHub registry or peppy.Project object." + "PEP should be either a path to PEPHub registry or peppy.Project object." 
) try: csv_file = fetch_from_pephub(pep) - schema = self.schema + X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file) ( X_headers_embeddings_tensor, @@ -147,9 +148,10 @@ def standardize( X_values_st, X_headers_st, X_values_bow, - schema, + self.schema, model_name=SENTENCE_TRANSFORMER_MODEL, ) + logger.info("Data Preprocessing completed.") with torch.no_grad(): @@ -159,7 +161,6 @@ def standardize( X_headers_embeddings_tensor, ) probabilities = torch_functional.softmax(outputs, dim=1) - # confidence, predicted = torch.max(probabilities, 1) values, indices = torch.topk(probabilities, k=3, dim=1) top_preds = indices.tolist() diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py index 460abfa..54e9b06 100644 --- a/attribute_standardizer/const.py +++ b/attribute_standardizer/const.py @@ -1,3 +1,5 @@ +PROJECT_NAME = "bedmess" + REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" MODEL_FAIRTRACKS = "model_fairtracks.pth" diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index ef45bb0..8e798ba 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -1,33 +1,33 @@ -import pandas as pd +import pickle +import warnings +from collections import Counter +from typing import Any, List, Optional, Tuple, Union + import numpy as np +import pandas as pd +import peppy import torch +from huggingface_hub import hf_hub_download from pephubclient import PEPHubClient from sentence_transformers import SentenceTransformer -import pickle -from sklearn.preprocessing import LabelEncoder -from sklearn.feature_extraction.text import CountVectorizer from sklearn.cluster import KMeans -from collections import Counter -from huggingface_hub import hf_hub_download -from typing import Optional, Any, List, Tuple, Union -import warnings -import peppy +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.preprocessing import LabelEncoder from .const import ( - REPO_ID, - MODEL_ENCODE, - MODEL_FAIRTRACKS, - MODEL_BEDBASE, + BEDBASE_LABEL_ENCODER_FILENAME, + BEDBASE_VECTORIZER_FILENAME, ENCODE_LABEL_ENCODER_FILENAME, - FAIRTRACKS_LABEL_ENCODER_FILENAME, ENCODE_VECTORIZER_FILENAME, + FAIRTRACKS_LABEL_ENCODER_FILENAME, FAIRTRACKS_VECTORIZER_FILENAME, - BEDBASE_VECTORIZER_FILENAME, - BEDBASE_LABEL_ENCODER_FILENAME, + MODEL_BEDBASE, + MODEL_ENCODE, + MODEL_FAIRTRACKS, NUM_CLUSTERS, + REPO_ID, ) - # TODO : convert to single np array before converting to tensor warnings.filterwarnings( "ignore", @@ -36,20 +36,14 @@ ) -def fetch_pep(pep): - # input of python object of peppy.Project and output of csv_fle_df - raise NotImplementedError - - -def fetch_from_pephub(pep: str) -> pd.DataFrame: +def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame: """ Fetches metadata from PEPhub registry. :param str pep: Path to the PEPhub registry containing the metadata csv file :return pd.DataFrame: path to the CSV file on the local system. 
""" - phc = PEPHubClient() - project = phc.load_project(pep) + sample_table = project.sample_table csv_file_df = pd.DataFrame(sample_table) return csv_file_df From f9bdffeccf33aef1cce26258eb0a8890be759ad4 Mon Sep 17 00:00:00 2001 From: saanikat Date: Fri, 30 Aug 2024 14:12:57 -0400 Subject: [PATCH 08/27] minor changes to trial.py --- trial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trial.py b/trial.py index cfa1a8b..393b932 100644 --- a/trial.py +++ b/trial.py @@ -1,7 +1,7 @@ from attribute_standardizer.attr_standardizer_class import AttrStandardizer -model = AttrStandardizer("ENCODE") +model = AttrStandardizer("BEDBASE") -results = model.standardize(pep ="geo/gse178283:default") +results = model.standardize(pep ="geo/gse228815:default") print(results) From ce167ce8418ca5f34e6585b4b4c1dfe61f2de91f Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 12 Sep 2024 14:39:46 -0400 Subject: [PATCH 09/27] Added github workflows --- .github/workflows/black.yml | 11 ++++++++++ .github/workflows/python-publish.yml | 30 ++++++++++++++++++++++++++++ attribute_standardizer/_version.py | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/black.yml create mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..052e2ec --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Lint + +on: [pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - uses: psf/black@stable diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..ddf9a38 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,30 @@ +# This workflows will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + name: upload release to PyPI + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + run: | + python setup.py sdist bdist_wheel + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/attribute_standardizer/_version.py b/attribute_standardizer/_version.py index 485f44a..3dc1f76 100644 --- a/attribute_standardizer/_version.py +++ b/attribute_standardizer/_version.py @@ -1 +1 @@ -__version__ = "0.1.1" +__version__ = "0.1.0" From e53427032df737c084732d220f0560f71f4d1e78 Mon Sep 17 00:00:00 2001 From: saanikat Date: Thu, 12 Sep 2024 16:26:31 -0400 Subject: [PATCH 10/27] adding averaging for less than 10 samples --- README.md | 9 +++++++ attribute_standardizer/attr_standardizer.py | 14 ++++++++++- attribute_standardizer/utils.py | 28 ++++++++++++++++++--- trial.py | 9 +++++-- 4 files changed, 54 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 33800c5..2289d46 100644 --- a/README.md +++ b/README.md @@ -31,4 +31,13 @@ print(results) ``` +To see the available schemas, you can run: +``` +schemas = 
model.show_available_schemas() + +print(schemas) +``` + +This will print the available schemas as a list. + You can use the format provided in the `trial.py` script in this repository as a reference. \ No newline at end of file diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py index f29adc6..148fb16 100644 --- a/attribute_standardizer/attr_standardizer.py +++ b/attribute_standardizer/attr_standardizer.py @@ -138,13 +138,16 @@ def standardize( try: csv_file = fetch_from_pephub(pep) - X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file) + X_values_st, X_headers_st, X_values_bow, num_rows = data_preprocessing( + csv_file + ) ( X_headers_embeddings_tensor, X_values_embeddings_tensor, X_values_bow_tensor, label_encoder, ) = data_encoding( + num_rows, X_values_st, X_headers_st, X_values_bow, @@ -192,3 +195,12 @@ def standardize( logger.error( f"Error occured during standardization in standardize function: {str(e)}" ) + @staticmethod + def show_available_schemas()-> list[str]: + """ + Stores a list of available schemas. + :return list: List of available schemas. + """ + schemas = ['ENCODE', 'FAIRTRACKS', 'BEDBASE'] + return schemas + diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index 8e798ba..aff492e 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -67,7 +67,7 @@ def load_from_huggingface(schema: str) -> Optional[Any]: def data_preprocessing( df: pd.DataFrame, -) -> Tuple[List[List[str]], List[str], List[List[str]]]: +) -> Tuple[List[List[str]], List[str], List[List[str]], int]: """ Preprocessing the DataFrame by extracting the column values and headers. @@ -76,13 +76,16 @@ def data_preprocessing( - Nested list containing the comma separated values in each column for sentence transformer embeddings. - List containing the headers of the DataFrame. - Nested list containing the comma separated values in each column for Bag of Words encoding. + - Number of rows in the metadata csv """ X_values_st = [df[column].astype(str).tolist() for column in df.columns] X_headers_st = df.columns.tolist() X_values_bow = [df[column].astype(str).tolist() for column in df.columns] - return X_values_st, X_headers_st, X_values_bow + num_rows = df.shape[0] + + return X_values_st, X_headers_st, X_values_bow, num_rows def get_top_k_average(val_embedding: List[np.ndarray], k: int) -> np.ndarray: @@ -134,7 +137,21 @@ def get_top_cluster_averaged(embeddings: List[np.ndarray]) -> np.ndarray: return top_k_average.numpy() +def get_averaged(embeddings: List[np.ndarray]) -> np.ndarray: + """ + Averages the embeddings. + :param list embeddings: List of embeddings, each embedding is a vector of values. + :return np.ndarray: The mean of all the embeddings as a NumPy array. + """ + flattened_embeddings = [embedding.tolist() for embedding in embeddings] + flattened_embeddings_array = np.array(flattened_embeddings) + averaged_embedding = np.mean(flattened_embeddings_array, axis=0) + + return averaged_embedding + + def data_encoding( + num_rows: int, X_values_st: List[List[str]], X_headers_st: List[str], X_values_bow: List[List[str]], @@ -144,6 +161,7 @@ def data_encoding( """ Encode input data in accordance with the user-specified schemas. + :param int num_rows: Number of rows in the sample metadata :param list X_values_st: Nested list containing the comma separated values in each column for sentence transformer embeddings. :param list X_headers_st: List containing the headers of the DataFrame. 
:param list X_values_bow: Nested list containing the comma separated values in each column for Bag of Words encoding. @@ -159,7 +177,11 @@ def data_encoding( embeddings = [] for column in X_values_st: val_embedding = sentence_encoder.encode(column, show_progress_bar=False) - embedding = get_top_cluster_averaged(val_embedding) + if num_rows >= 10: + embedding = get_top_cluster_averaged(val_embedding) + else: + embedding = get_averaged(val_embedding) + embeddings.append(embedding) X_values_embeddings = embeddings if schema == "ENCODE": diff --git a/trial.py b/trial.py index 88a257e..b704345 100644 --- a/trial.py +++ b/trial.py @@ -2,6 +2,11 @@ model = AttrStandardizer("ENCODE") -results = model.standardize(pep="geo/gse178283:default") +schemas = model.show_available_schemas() -print(results) +print(schemas) + +#results = model.standardize(pep="geo/gse178283:default") +results = model.standardize(pep="geo/gse228634:default") + +print(results) \ No newline at end of file From 36a34c4acdd44758c845b8018bada16eac4c3492 Mon Sep 17 00:00:00 2001 From: saanikat Date: Thu, 12 Sep 2024 16:28:06 -0400 Subject: [PATCH 11/27] adding averaging for less than 10 samples --- attribute_standardizer/attr_standardizer.py | 10 +++++----- trial.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py index 148fb16..b250acb 100644 --- a/attribute_standardizer/attr_standardizer.py +++ b/attribute_standardizer/attr_standardizer.py @@ -195,12 +195,12 @@ def standardize( logger.error( f"Error occured during standardization in standardize function: {str(e)}" ) + @staticmethod - def show_available_schemas()-> list[str]: + def show_available_schemas() -> list[str]: """ - Stores a list of available schemas. - :return list: List of available schemas. + Stores a list of available schemas. + :return list: List of available schemas. 
""" - schemas = ['ENCODE', 'FAIRTRACKS', 'BEDBASE'] + schemas = ["ENCODE", "FAIRTRACKS", "BEDBASE"] return schemas - diff --git a/trial.py b/trial.py index b704345..b2d3224 100644 --- a/trial.py +++ b/trial.py @@ -6,7 +6,7 @@ print(schemas) -#results = model.standardize(pep="geo/gse178283:default") -results = model.standardize(pep="geo/gse228634:default") +# results = model.standardize(pep="geo/gse178283:default") +results = model.standardize(pep="geo/gse228634:default") -print(results) \ No newline at end of file +print(results) From 991d55ba252cfac943df498923f3aae404ca619a Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 12 Sep 2024 22:09:53 -0400 Subject: [PATCH 12/27] deleted pycache --- __pycache__/bedmess.cpython-312.pyc | Bin 146 -> 0 bytes __pycache__/model6_multitable.cpython-312.pyc | Bin 25557 -> 0 bytes .../nn_model1_preprocess.cpython-312.pyc | Bin 4581 -> 0 bytes __pycache__/nn_model1_test.cpython-312.pyc | Bin 5631 -> 0 bytes __pycache__/nn_model1_train.cpython-312.pyc | Bin 6416 -> 0 bytes .../nn_model2_preprocess.cpython-312.pyc | Bin 6797 -> 0 bytes __pycache__/nn_model2_test.cpython-312.pyc | Bin 5657 -> 0 bytes __pycache__/nn_model2_train.cpython-312.pyc | Bin 7707 -> 0 bytes 8 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 __pycache__/bedmess.cpython-312.pyc delete mode 100644 __pycache__/model6_multitable.cpython-312.pyc delete mode 100644 __pycache__/nn_model1_preprocess.cpython-312.pyc delete mode 100644 __pycache__/nn_model1_test.cpython-312.pyc delete mode 100644 __pycache__/nn_model1_train.cpython-312.pyc delete mode 100644 __pycache__/nn_model2_preprocess.cpython-312.pyc delete mode 100644 __pycache__/nn_model2_test.cpython-312.pyc delete mode 100644 __pycache__/nn_model2_train.cpython-312.pyc diff --git a/__pycache__/bedmess.cpython-312.pyc b/__pycache__/bedmess.cpython-312.pyc deleted file mode 100644 index 67502ad19bfc0c19898cd57f9e4eae6d81f08e52..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 146 zcmX@j%ge<81jPxmX(0MBh(HIQS%4zb87dhx8U0o=6fpsLpFwJV8R}=`=ceixCnn}) zW+&<=mlh?KWaj7T7bh2G7L*k0C#9z3rWV6#y@JYL95%W6DWy57c15f}6BvQG7{vI< M%*e=C#0+Es0LCvOCIA2c diff --git a/__pycache__/model6_multitable.cpython-312.pyc b/__pycache__/model6_multitable.cpython-312.pyc deleted file mode 100644 index 23a89bb01eeee28a6074f2a52941f58042475dbc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25557 zcmd6PYjhjedEg8%Al@KI5Cq=@ACd@)BK3Yy@3$yXvh}tgSun^MQlRjl2cRS}fDNb0 zF_fxD)W%&=>!e~%dp7ji-LO6FR_W$!oNc;QwtG$o1|`5_`b6nwckBJJD@$$?d%I`% z`|e;Kq~J)&Y0v49xO3;;?|%3Fec%1=ckf@@?G_52<-d1f=x+~G)PKbf`Ee9IPiD&~ z>K4UPEFGr&G9o??8BzXq@v!?lq*e;o-k zh3g{?{)R}SzcJF}Z;CYgnMI!{O z#>l>rVcDm?-z|Fi`Y%Z{sKUxVHK!=e(liAnT0ZiM9IL)nI4F^R)c3o3$?xxF zE3Q)h&1@yOTeu$9b(NCa&A*jh&Q@Ke{o7bK#BPV!YEA48`1WYNcfxm#=6e@g3#oRq zb>Qw{>%ra2ZD1QNc2LwL&F9!g|30?KzaMI|ncJt7`~dmhuYC8x_d)m_;tp_q@9Lyd zE3y3$vwY-Vsr8b5T)zwh{nO9VY;zsuKLWmH)(gHDz~^OK!1p3x*vcJ|q0d3P_>cOJ z`Hz$E7Z!w{@SpUbBH=GC2tVzA$$y61IP#}*?&_Oul_AMk+QA*=j&aAi6WmGe6nC0? zi97SIS%N+XRKkDOf6o81|GYoof5jj45BLZDte^7_`G@@%z65`pil1%whoE#jxwG6k z?q%*g7kbwq;r5S^@BsG;7vu&s;TK8xAjfhXH>3#-3_$aPdSSJ{k@sAk5Yu9N@xxqk}P)SnVtiqhu!{LDRWg1#l^y!0RgQA&_4F(3s_)8q% zWroAC0mQFnWg88$fjIZ-7?v2LM)}xKD9nA*39SI%hA{MgcmeAuQYGq1JAk|i9G)ZG z-LK0~L%>C@nt;dafc`NV>I&^Q9H*f6mR+&a$4?xFRtparv4M*Zj1~%M^OJsXf0w>R zCFB??A?HISso$|u3Z15@go0tjq<|})B%Ner%9*4BMVdfqQ`r8XHT9<*ejUVpOQriz8QyZUseviwtlq8;TACwz`E#7Q~fIviesf zSuI*2mkS&cO~CDxaVsI3=+#d}uNG~PdN39lfJ(Eg#0ojN8l(kS?K6o?oC^<$6`|

diff --git a/__pycache__/nn_model1_preprocess.cpython-312.pyc b/__pycache__/nn_model1_preprocess.cpython-312.pyc
deleted file mode 100644
index a10eb2488ca22a9887973c4dc10953a55208ab52..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 4581
[base85-encoded bytecode omitted]

diff --git a/__pycache__/nn_model1_test.cpython-312.pyc b/__pycache__/nn_model1_test.cpython-312.pyc
deleted file mode 100644
index 9baaf790b0937a69907b128c8c1fed35f806907e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 5631
[base85-encoded bytecode omitted]

diff --git a/__pycache__/nn_model1_train.cpython-312.pyc b/__pycache__/nn_model1_train.cpython-312.pyc
deleted file mode 100644
index bdf115b68965edc4e90ccb6dcc715a63f632523f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 6416
[base85-encoded bytecode omitted]

diff --git a/__pycache__/nn_model2_preprocess.cpython-312.pyc b/__pycache__/nn_model2_preprocess.cpython-312.pyc
deleted file mode 100644
index 648c49295a6ed7b8c136d15599cf4cdbc8081544..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 6797
[base85-encoded bytecode omitted]

diff --git a/__pycache__/nn_model2_test.cpython-312.pyc b/__pycache__/nn_model2_test.cpython-312.pyc
deleted file mode 100644
index 3eb8178d1eaa1f4da602f4cb97ad0e756629fab8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 5657
[base85-encoded bytecode omitted]

diff --git a/__pycache__/nn_model2_train.cpython-312.pyc b/__pycache__/nn_model2_train.cpython-312.pyc
deleted file mode 100644
index 470fc12ab67b97e42bc3a7f9945e3d8ba6ea0acc..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 7707
[base85-encoded bytecode omitted]

From 7843c6e0ba0f83a31fb105d0bee6930f3a94e32f Mon Sep 17 00:00:00 2001
From: saanikat
Date: Fri, 13 Sep 2024 11:54:00 -0400
Subject: [PATCH 13/27] renaming from bedmess to bedms

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2289d46..f8b62d4 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
-# bedmess
+# BEDMS
 
-bedmess is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE).
+BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE).
 
 To install `attribute-standardizer` , you need to clone this repository first. Follow the steps given below to install:
 
 ```
-git clone https://github.com/databio/bedmess.git
+git clone https://github.com/databio/bedms.git
 
-cd bedmess
+cd bedms
 
 pip install .
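
For illustration (an editorial sketch, not one of the 27 patches): at this point in the series the project has been renamed to BEDMS, but the importable package is still `attribute_standardizer`; the module itself is only renamed to `bedms` in PATCH 16 below. A minimal smoke test of this intermediate state might look like the following sketch. The schema names come from the README above, and the PEPHub registry path and the return shape of `standardize()` come from the patches that follow.

```python
# Sketch only -- assumes the tree as of PATCH 13: repository renamed to
# `bedms`, package still importable as `attribute_standardizer`.
from attribute_standardizer import AttrStandardizer

# Supported schemas per the README: ENCODE, FAIRTRACKS, BEDBASE.
model = AttrStandardizer("ENCODE")

# `pep` is a PEPHub registry path (the example project used by the tests
# added later in this series).
results = model.standardize(pep="geo/gse228634:default")

# standardize() returns Dict[str, Dict[str, float]]:
# input attribute -> {suggested standard attribute: confidence score}.
for attribute, suggestions in results.items():
    print(attribute, suggestions)
```
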
From 3d2253836f4dea0039832f6d94d7e4079780a4c7 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Fri, 13 Sep 2024 11:56:43 -0400
Subject: [PATCH 14/27] changing function name to get_available_schemas

---
 README.md                                   | 2 +-
 attribute_standardizer/attr_standardizer.py | 2 +-
 trial.py                                    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index f8b62d4..14cd1fe 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ print(results)
 
 To see the available schemas, you can run:
 ```
-schemas = model.show_available_schemas()
+schemas = model.get_available_schemas()
 print(schemas)
 ```
 
diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py
index b250acb..13bf949 100644
--- a/attribute_standardizer/attr_standardizer.py
+++ b/attribute_standardizer/attr_standardizer.py
@@ -197,7 +197,7 @@ def standardize(
         )
 
     @staticmethod
-    def show_available_schemas() -> list[str]:
+    def get_available_schemas() -> list[str]:
         """
         Stores a list of available schemas.
         :return list: List of available schemas.
diff --git a/trial.py b/trial.py
index b2d3224..1df22e1 100644
--- a/trial.py
+++ b/trial.py
@@ -2,7 +2,7 @@
 
 model = AttrStandardizer("ENCODE")
 
-schemas = model.show_available_schemas()
+schemas = model.get_available_schemas()
 
 print(schemas)
 

From 6a5c9a200f4a80ff6c275ead238d0e6e0b906805 Mon Sep 17 00:00:00 2001
From: Khoroshevskyi
Date: Fri, 13 Sep 2024 13:22:11 -0400
Subject: [PATCH 15/27] Added installation test

---
 .github/workflows/run-pytest.yml | 33 ++++++++++++++++++++++++++++++++
 README.md                        | 2 +-
 setup.py                         | 2 +-
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/run-pytest.yml

diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
new file mode 100644
index 0000000..257d8aa
--- /dev/null
+++ b/.github/workflows/run-pytest.yml
@@ -0,0 +1,33 @@
+## We can't run the tests yet, so let's just install all dependencies and the package
+name: Installation test
+
+on:
+  push:
+    branches: [dev]
+  pull_request:
+    branches: [master, dev]
+
+jobs:
+  pytest:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.12"]
+        os: [ubuntu-latest]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dev dependencies
+        run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi
+
+      - name: Install package
+        run: python -m pip install .
+
+#      - name: Run pytest tests
+#        run: pytest tests -x -vv
diff --git a/README.md b/README.md
index 14cd1fe..defcf09 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Using Python, this is how you can run `attribute_standardizer` and print the res
 
 
 ```
-from attribute_standardizer import AttrStandardizer
+from bedms import AttrStandardizer
 
 model = AttrStandardizer("ENCODE")
 model = AttrStandardizer("FAIRTRACKS")
diff --git a/setup.py b/setup.py
index e6a1b2e..182d464 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 from setuptools import find_packages, setup
 
-PACKAGE_NAME = "attribute_standardizer"
+PACKAGE_NAME = "bedms"
 
 # Ordinary dependencies
 DEPENDENCIES = []

From 160fd4c5fe5e3c77940a152407d545789330c92e Mon Sep 17 00:00:00 2001
From: Khoroshevskyi
Date: Fri, 13 Sep 2024 13:35:14 -0400
Subject: [PATCH 16/27] updated requirements tests and names

---
 .github/workflows/run-pytest.yml                       | 4 ++--
 {attribute_standardizer => bedms}/__init__.py          | 0
 {attribute_standardizer => bedms}/_version.py          | 0
 {attribute_standardizer => bedms}/attr_standardizer.py | 0
 {attribute_standardizer => bedms}/const.py             | 0
 {attribute_standardizer => bedms}/model.py             | 0
 {attribute_standardizer => bedms}/utils.py             | 0
 requirements/requirements-dev.txt                      | 2 +-
 tests/__init__.py                                      | 0
 trial.py                                               | 2 +-
 10 files changed, 4 insertions(+), 4 deletions(-)
 rename {attribute_standardizer => bedms}/__init__.py (100%)
 rename {attribute_standardizer => bedms}/_version.py (100%)
 rename {attribute_standardizer => bedms}/attr_standardizer.py (100%)
 rename {attribute_standardizer => bedms}/const.py (100%)
 rename {attribute_standardizer => bedms}/model.py (100%)
 rename {attribute_standardizer => bedms}/utils.py (100%)
 create mode 100644 tests/__init__.py

diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
index 257d8aa..2422470 100644
--- a/.github/workflows/run-pytest.yml
+++ b/.github/workflows/run-pytest.yml
@@ -29,5 +29,5 @@ jobs:
 
       - name: Install package
         run: python -m pip install . 
 
-#      - name: Run pytest tests
-#        run: pytest tests -x -vv
+      - name: Run pytest tests
+        run: pytest tests -x -vv
diff --git a/attribute_standardizer/__init__.py b/bedms/__init__.py
similarity index 100%
rename from attribute_standardizer/__init__.py
rename to bedms/__init__.py
diff --git a/attribute_standardizer/_version.py b/bedms/_version.py
similarity index 100%
rename from attribute_standardizer/_version.py
rename to bedms/_version.py
diff --git a/attribute_standardizer/attr_standardizer.py b/bedms/attr_standardizer.py
similarity index 100%
rename from attribute_standardizer/attr_standardizer.py
rename to bedms/attr_standardizer.py
diff --git a/attribute_standardizer/const.py b/bedms/const.py
similarity index 100%
rename from attribute_standardizer/const.py
rename to bedms/const.py
diff --git a/attribute_standardizer/model.py b/bedms/model.py
similarity index 100%
rename from attribute_standardizer/model.py
rename to bedms/model.py
diff --git a/attribute_standardizer/utils.py b/bedms/utils.py
similarity index 100%
rename from attribute_standardizer/utils.py
rename to bedms/utils.py
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index c3daf95..073296d 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,3 +1,3 @@
 black
 isort
-pytest
\ No newline at end of file
+pytest
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/trial.py b/trial.py
index 1df22e1..9c2fec8 100644
--- a/trial.py
+++ b/trial.py
@@ -1,4 +1,4 @@
-from attribute_standardizer.attr_standardizer import AttrStandardizer
+from bedms import AttrStandardizer
 
 model = AttrStandardizer("ENCODE")
 

From 4a99d82865935fe06ec9ed619629b2002a7e1ace Mon Sep 17 00:00:00 2001
From: Khoroshevskyi
Date: Fri, 13 Sep 2024 13:35:23 -0400
Subject: [PATCH 17/27] updated requirements tests and names

---
 tests/test_bedms.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100755 tests/test_bedms.py

diff --git a/tests/test_bedms.py b/tests/test_bedms.py
new file mode 100755
index 0000000..74382ac
--- /dev/null
+++ b/tests/test_bedms.py
@@ -0,0 +1,16 @@
+import pytest
+from bedms import AttrStandardizer
+
+
+class TestBEDMES:
+    def test_bedmes(self):
+
+        model = AttrStandardizer("ENCODE")
+
+        schemas = model.get_available_schemas()
+
+        assert schemas
+        # results = model.standardize(pep="geo/gse178283:default")
+        results = model.standardize(pep="geo/gse228634:default")
+
+        assert results

From 7f484a665022b8da84e15a068423980d2c098879 Mon Sep 17 00:00:00 2001
From: Khoroshevskyi
Date: Fri, 13 Sep 2024 13:42:40 -0400
Subject: [PATCH 18/27] restructured code

---
 bedms/attr_standardizer.py        | 8 +++++---
 bedms/const.py                    | 2 ++
 bedms/utils.py                    | 2 +-
 requirements/requirements-all.txt | 2 +-
 tests/test_bedms.py               | 7 -------
 trial.py                          | 12 ------------
 6 files changed, 9 insertions(+), 24 deletions(-)
 delete mode 100644 trial.py

diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index 13bf949..fbcaf39 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -7,6 +7,7 @@
 import torch.nn.functional as torch_functional
 
 from .const import (
+    AVAILABLE_SCHEMAS,
     CONFIDENCE_THRESHOLD,
     DROPOUT_PROB,
     EMBEDDING_SIZE,
@@ -17,8 +18,8 @@
     OUTPUT_SIZE_BEDBASE,
     OUTPUT_SIZE_ENCODE,
     OUTPUT_SIZE_FAIRTRACKS,
-    SENTENCE_TRANSFORMER_MODEL,
     PROJECT_NAME,
+    SENTENCE_TRANSFORMER_MODEL,
 )
 from .model import BoWSTModel
 from .utils import (
@@ -200,7 +201,8 @@ def standardize(
     def get_available_schemas() -> list[str]:
         """
         Stores a list of available schemas.
+
         :return list: List of available schemas.
         """
-        schemas = ["ENCODE", "FAIRTRACKS", "BEDBASE"]
-        return schemas
+
+        return AVAILABLE_SCHEMAS
diff --git a/bedms/const.py b/bedms/const.py
index 54e9b06..e325671 100644
--- a/bedms/const.py
+++ b/bedms/const.py
@@ -1,5 +1,7 @@
 PROJECT_NAME = "bedmess"
 
+AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"]
+
 REPO_ID = "databio/attribute-standardizer-model6"
 MODEL_ENCODE = "model_encode.pth"
 MODEL_FAIRTRACKS = "model_fairtracks.pth"
diff --git a/bedms/utils.py b/bedms/utils.py
index aff492e..67dbd2e 100644
--- a/bedms/utils.py
+++ b/bedms/utils.py
@@ -8,7 +8,7 @@
 import peppy
 import torch
 from huggingface_hub import hf_hub_download
-from pephubclient import PEPHubClient
+
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import KMeans
 from sklearn.feature_extraction.text import CountVectorizer
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 848e7e8..3f373a4 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -3,4 +3,4 @@ numpy
 torch
 sentence-transformers
 pephubclient>=0.4.2
-peppy>=0.40.5
+peppy>=0.40.6
diff --git a/tests/test_bedms.py b/tests/test_bedms.py
index 74382ac..a47dfb1 100755
--- a/tests/test_bedms.py
+++ b/tests/test_bedms.py
@@ -1,16 +1,9 @@
-import pytest
 from bedms import AttrStandardizer
 
 
 class TestBEDMES:
     def test_bedmes(self):
-
         model = AttrStandardizer("ENCODE")
-
-        schemas = model.get_available_schemas()
-
-        assert schemas
-        # results = model.standardize(pep="geo/gse178283:default")
         results = model.standardize(pep="geo/gse228634:default")
 
         assert results
diff --git a/trial.py b/trial.py
deleted file mode 100644
index 9c2fec8..0000000
--- a/trial.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from bedms import AttrStandardizer
-
-model = AttrStandardizer("ENCODE")
-
-schemas = model.get_available_schemas()
-
-print(schemas)
-
-# results = model.standardize(pep="geo/gse178283:default")
-results = model.standardize(pep="geo/gse228634:default")
-
-print(results)

From 5519e53fef63311d60958bd0540b6ff4e212acc5 Mon Sep 17 00:00:00 2001
From: Khoroshevskyi
Date: Fri, 13 Sep 2024 13:48:04 -0400
Subject: [PATCH 19/27] updated readme

---
 README.md | 38 ++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index defcf09..676ecbb 100644
--- a/README.md
+++ b/README.md
@@ -3,41 +3,35 @@
 
 BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE).
 
+## Installation
 
-To install `attribute-standardizer` , you need to clone this repository first. Follow the steps given below to install:
-
+To install `bbclient` use this command:
 ```
-git clone https://github.com/databio/bedms.git
-
-cd bedms
-
-pip install . 
-
+pip install bedms
+```
+or install the latest version from the GitHub repository:
+```
+pip install git+https://github.com/databio/bedms.git
 ```
 
 ## Usage
 
-Using Python, this is how you can run `attribute_standardizer` and print the results :
-
-
-```
+```python
 from bedms import AttrStandardizer
 
 model = AttrStandardizer("ENCODE")
-model = AttrStandardizer("FAIRTRACKS")
-
-results = model.standardize(pep ="geo/gse178283:default")
-
-print(results)
+results = model.standardize(pep="geo/gse228634:default")
 
+assert results
 ```
 
-To see the available schemas, you can run:
-```
-schemas = model.get_available_schemas()
-print(schemas)
+To see the available schemas, you can run:
 ```
+from bedms.const import AVAILABLE_SCHEMAS
+print(AVAILABLE_SCHEMAS)
 
-This will print the available schemas as a list.
+# >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']
 
-You can use the format provided in the `trial.py` script in this repository as a reference.
\ No newline at end of file
+```
+AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata.

From 22d91cb65e983fff1164e101ae1a9e44bb767cc0 Mon Sep 17 00:00:00 2001
From: Khoroshevskyi
Date: Fri, 13 Sep 2024 14:51:35 -0400
Subject: [PATCH 20/27] Fixed readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 676ecbb..0f95dca 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigeno
 
 ## Installation
 
-To install `bbclient` use this command:
+To install `bedms` use this command:
 ```
 pip install bedms
 ```

From b13f0a3f927e52b5ccb9ec6f36064f19a1f5af9e Mon Sep 17 00:00:00 2001
From: saanikat
Date: Fri, 13 Sep 2024 16:19:50 -0400
Subject: [PATCH 21/27] minor changes

---
 attribute_standardizer/attr_standardizer.py | 1 -
 attribute_standardizer/utils.py             | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py
index b250acb..6825392 100644
--- a/attribute_standardizer/attr_standardizer.py
+++ b/attribute_standardizer/attr_standardizer.py
@@ -86,7 +86,6 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
     def _load_model(self) -> nn.Module:
         """
         Calls function to load the model from HuggingFace repository and sets to eval().
-
         :return nn.Module: Loaded Neural Network Model.
""" try: diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index aff492e..7e24487 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -186,7 +186,7 @@ def data_encoding( X_values_embeddings = embeddings if schema == "ENCODE": # Bag of Words Vectorizer - vectorizer = CountVectorizer() + vectorizer = None vc_path = hf_hub_download( repo_id=REPO_ID, filename=ENCODE_VECTORIZER_FILENAME, @@ -202,7 +202,7 @@ def data_encoding( # print(transformed_columns) X_values_bow = transformed_columns # Label Encoding - label_encoder = LabelEncoder() + label_encoder = None lb_path = hf_hub_download( repo_id=REPO_ID, filename=ENCODE_LABEL_ENCODER_FILENAME, From 70ab6c04d2310536bd06cbb99649d1c58e2fd2d1 Mon Sep 17 00:00:00 2001 From: saanikat Date: Fri, 13 Sep 2024 16:34:37 -0400 Subject: [PATCH 22/27] instantiation issue solved --- bedms/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bedms/utils.py b/bedms/utils.py index 67dbd2e..632c1d9 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -186,7 +186,7 @@ def data_encoding( X_values_embeddings = embeddings if schema == "ENCODE": # Bag of Words Vectorizer - vectorizer = CountVectorizer() + vectorizer = None vc_path = hf_hub_download( repo_id=REPO_ID, filename=ENCODE_VECTORIZER_FILENAME, @@ -202,7 +202,7 @@ def data_encoding( # print(transformed_columns) X_values_bow = transformed_columns # Label Encoding - label_encoder = LabelEncoder() + label_encoder =None lb_path = hf_hub_download( repo_id=REPO_ID, filename=ENCODE_LABEL_ENCODER_FILENAME, @@ -211,7 +211,7 @@ def data_encoding( label_encoder = pickle.load(f) elif schema == "FAIRTRACKS": - vectorizer = CountVectorizer() + vectorizer = None vc_path = hf_hub_download( repo_id=REPO_ID, filename=FAIRTRACKS_VECTORIZER_FILENAME ) @@ -226,7 +226,7 @@ def data_encoding( # print(transformed_columns) X_values_bow = transformed_columns # Label Encoding - label_encoder = LabelEncoder() + label_encoder = None lb_path = hf_hub_download( repo_id=REPO_ID, filename=FAIRTRACKS_LABEL_ENCODER_FILENAME, @@ -235,7 +235,7 @@ def data_encoding( label_encoder = pickle.load(f) elif schema == "BEDBASE": - vectorizer = CountVectorizer() + vectorizer = None vc_path = hf_hub_download(repo_id=REPO_ID, filename=BEDBASE_VECTORIZER_FILENAME) with open(vc_path, "rb") as f: vectorizer = pickle.load(f) @@ -248,7 +248,7 @@ def data_encoding( # print(transformed_columns) X_values_bow = transformed_columns # Label Encoding - label_encoder = LabelEncoder() + label_encoder = None lb_path = hf_hub_download( repo_id=REPO_ID, filename=BEDBASE_LABEL_ENCODER_FILENAME, From b71112f8aa6fc92f79afb615c5f98b3636adeac9 Mon Sep 17 00:00:00 2001 From: saanikat Date: Fri, 13 Sep 2024 16:19:50 -0400 Subject: [PATCH 23/27] minor changes --- bedms/attr_standardizer.py | 1 - bedms/utils.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index fbcaf39..0ddc6e0 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -87,7 +87,6 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: def _load_model(self) -> nn.Module: """ Calls function to load the model from HuggingFace repository and sets to eval(). - :return nn.Module: Loaded Neural Network Model. 
""" try: diff --git a/bedms/utils.py b/bedms/utils.py index 632c1d9..0b1415c 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -202,7 +202,11 @@ def data_encoding( # print(transformed_columns) X_values_bow = transformed_columns # Label Encoding +<<<<<<< HEAD:bedms/utils.py label_encoder =None +======= + label_encoder = None +>>>>>>> b13f0a3 (minor changes):attribute_standardizer/utils.py lb_path = hf_hub_download( repo_id=REPO_ID, filename=ENCODE_LABEL_ENCODER_FILENAME, From 50bc889b7196379ea63cd7e9439b3f327e8cdd79 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 16 Sep 2024 11:19:41 -0400 Subject: [PATCH 24/27] loading vectorizer and label_enc in _load_model --- bedms/__init__.py | 3 + bedms/attr_standardizer.py | 98 ++++++++++++++++------ bedms/const.py | 6 +- bedms/model.py | 6 +- bedms/utils.py | 163 +++++++++++-------------------------- 5 files changed, 135 insertions(+), 141 deletions(-) diff --git a/bedms/__init__.py b/bedms/__init__.py index 374c0be..cdbc2d6 100644 --- a/bedms/__init__.py +++ b/bedms/__init__.py @@ -1 +1,4 @@ +""" +This module initializes 'bedms' package. +""" from .attr_standardizer import AttrStandardizer diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 0ddc6e0..92df10f 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -1,9 +1,12 @@ +""" +This module has the class AttrStandardizer for 'bedms'. +""" import logging from typing import Dict, Tuple, Union - +import pickle import peppy import torch -import torch.nn as nn +from torch import nn import torch.nn.functional as torch_functional from .const import ( @@ -20,6 +23,13 @@ OUTPUT_SIZE_FAIRTRACKS, PROJECT_NAME, SENTENCE_TRANSFORMER_MODEL, + REPO_ID, + ENCODE_VECTORIZER_FILENAME, + ENCODE_LABEL_ENCODER_FILENAME, + FAIRTRACKS_VECTORIZER_FILENAME, + FAIRTRACKS_LABEL_ENCODER_FILENAME, + BEDBASE_VECTORIZER_FILENAME, + BEDBASE_LABEL_ENCODER_FILENAME, ) from .model import BoWSTModel from .utils import ( @@ -28,6 +38,7 @@ fetch_from_pephub, get_any_pep, load_from_huggingface, + hf_hub_download, ) logging.basicConfig(level=logging.INFO) @@ -35,6 +46,9 @@ class AttrStandardizer: + """ + This is the AttrStandardizer class which holds the models for Attribute Standardization. + """ def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. @@ -43,7 +57,7 @@ def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: :param int confidence: Confidence threshold for the predictions. """ self.schema = schema - self.model = self._load_model() + self.model, self.vectorizer, self.label_encoder = self._load_model() self.conf_threshold = confidence def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: @@ -61,7 +75,7 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_ENCODE, DROPOUT_PROB, ) - elif self.schema == "FAIRTRACKS": + if self.schema == "FAIRTRACKS": return ( INPUT_SIZE_BOW_FAIRTRACKS, EMBEDDING_SIZE, @@ -70,7 +84,7 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_FAIRTRACKS, DROPOUT_PROB, ) - elif self.schema == "BEDBASE": + if self.schema == "BEDBASE": return ( INPUT_SIZE_BOW_BEDBASE, EMBEDDING_SIZE, @@ -79,17 +93,50 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_BEDBASE, DROPOUT_PROB, ) - else: - raise ValueError( - f"Schema not available: {self.schema}. 
Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" + + raise ValueError( + f"Schema not available: {self.schema}." + "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" ) - def _load_model(self) -> nn.Module: + def _load_model(self) -> tuple[nn.Module, object, object]: """ - Calls function to load the model from HuggingFace repository and sets to eval(). + Calls function to load the model from HuggingFace repository + load vectorizer and label encoder and sets to eval(). :return nn.Module: Loaded Neural Network Model. + :return object: The scikit learn vectorizer for bag of words encoding. + :return object: Label encoder object for the labels (y). """ try: + if self.schema == "ENCODE": + filename_vc = ENCODE_VECTORIZER_FILENAME + filename_lb = ENCODE_LABEL_ENCODER_FILENAME + elif self.schema == "FAIRTRACKS": + filename_vc = FAIRTRACKS_VECTORIZER_FILENAME + filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME + elif self.schema == "BEDBASE": + filename_vc = BEDBASE_VECTORIZER_FILENAME + filename_lb = BEDBASE_LABEL_ENCODER_FILENAME + + vectorizer = None + label_encoder = None + + vc_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_vc, + ) + + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + + lb_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_lb, + ) + + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + model = load_from_huggingface(self.schema) state_dict = torch.load(model) @@ -112,7 +159,7 @@ def _load_model(self) -> nn.Module: ) model.load_state_dict(state_dict) model.eval() - return model + return model, vectorizer, label_encoder except Exception as e: logger.error(f"Error loading the model: {str(e)}") @@ -122,7 +169,9 @@ def standardize( self, pep: Union[str, peppy.Project] ) -> Dict[str, Dict[str, float]]: """ - Fetches the user provided PEP from the PEPHub registry path, returns the predictions. + Fetches the user provided PEP + from the PEPHub registry path, + returns the predictions. :param str pep: peppy.Project object or PEPHub registry path to PEP. :return Dict[str, Dict[str, float]]: Suggestions to the user. @@ -138,20 +187,21 @@ def standardize( try: csv_file = fetch_from_pephub(pep) - X_values_st, X_headers_st, X_values_bow, num_rows = data_preprocessing( + x_values_st, x_headers_st, x_values_bow, num_rows = data_preprocessing( csv_file ) ( - X_headers_embeddings_tensor, - X_values_embeddings_tensor, - X_values_bow_tensor, + x_headers_embeddings_tensor, + x_values_embeddings_tensor, + x_values_bow_tensor, label_encoder, ) = data_encoding( + self.vectorizer, + self.label_encoder, num_rows, - X_values_st, - X_headers_st, - X_values_bow, - self.schema, + x_values_st, + x_headers_st, + x_values_bow, model_name=SENTENCE_TRANSFORMER_MODEL, ) @@ -159,9 +209,9 @@ def standardize( with torch.no_grad(): outputs = self.model( - X_values_bow_tensor, - X_values_embeddings_tensor, - X_headers_embeddings_tensor, + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, ) probabilities = torch_functional.softmax(outputs, dim=1) @@ -174,7 +224,7 @@ def standardize( ] suggestions = {} - for i, category in enumerate(X_headers_st): + for i, category in enumerate(x_headers_st): category_suggestions = {} if top_confidences[i][0] >= self.conf_threshold: for j in range(3): diff --git a/bedms/const.py b/bedms/const.py index e325671..86916c6 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -1,7 +1,11 @@ +""" +This module contains constant values used in the 'bedms' package. 
+""" + PROJECT_NAME = "bedmess" AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"] - +PEP_FILE_TYPES = ["yaml", "csv"] REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" MODEL_FAIRTRACKS = "model_fairtracks.pth" diff --git a/bedms/model.py b/bedms/model.py index af212bc..94bd9da 100644 --- a/bedms/model.py +++ b/bedms/model.py @@ -20,8 +20,10 @@ def __init__( Initializes the BoWSTModel. :param int input_size_values: Size of the input for the values (BoW). - :param int inout_size_values_embeddings: Size of the input for the values sentence transformer embeddings. - :param int input_size_headers: Size of the input for the headers with sentence transformer embeddings. + :param int inout_size_values_embeddings: Size of the input + for the values sentence transformer embeddings. + :param int input_size_headers: Size of the input + for the headers with sentence transformer embeddings. :param int hidden_size: Size of the hidden layer. :param int output_size: Size of the output layer. :param float dropout_prob: Dropout probability for regularization. diff --git a/bedms/utils.py b/bedms/utils.py index 0b1415c..f27fd82 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,4 +1,6 @@ -import pickle +""" +This module has all util functions for 'bedms' +""" import warnings from collections import Counter from typing import Any, List, Optional, Tuple, Union @@ -11,21 +13,15 @@ from sentence_transformers import SentenceTransformer from sklearn.cluster import KMeans -from sklearn.feature_extraction.text import CountVectorizer from sklearn.preprocessing import LabelEncoder from .const import ( - BEDBASE_LABEL_ENCODER_FILENAME, - BEDBASE_VECTORIZER_FILENAME, - ENCODE_LABEL_ENCODER_FILENAME, - ENCODE_VECTORIZER_FILENAME, - FAIRTRACKS_LABEL_ENCODER_FILENAME, - FAIRTRACKS_VECTORIZER_FILENAME, MODEL_BEDBASE, MODEL_ENCODE, MODEL_FAIRTRACKS, NUM_CLUSTERS, REPO_ID, + PEP_FILE_TYPES ) # TODO : convert to single np array before converting to tensor @@ -73,19 +69,21 @@ def data_preprocessing( :param pd.DataFrame df: The input DataFrame (user chosen PEP) to preprocess. :return Tuple[List[List[str]], List[str], List[List[str]]]: - - Nested list containing the comma separated values in each column for sentence transformer embeddings. + - Nested list containing the comma separated values + in each column for sentence transformer embeddings. - List containing the headers of the DataFrame. - - Nested list containing the comma separated values in each column for Bag of Words encoding. + - Nested list containing the comma separated values + in each column for Bag of Words encoding. 
- Number of rows in the metadata csv """ - X_values_st = [df[column].astype(str).tolist() for column in df.columns] - X_headers_st = df.columns.tolist() - X_values_bow = [df[column].astype(str).tolist() for column in df.columns] + x_values_st = [df[column].astype(str).tolist() for column in df.columns] + x_headers_st = df.columns.tolist() + x_values_bow = [df[column].astype(str).tolist() for column in df.columns] num_rows = df.shape[0] - return X_values_st, X_headers_st, X_values_bow, num_rows + return x_values_st, x_headers_st, x_values_bow, num_rows def get_top_k_average(val_embedding: List[np.ndarray], k: int) -> np.ndarray: @@ -151,31 +149,39 @@ def get_averaged(embeddings: List[np.ndarray]) -> np.ndarray: def data_encoding( + vectorizer: object, + label_encoder: object, num_rows: int, - X_values_st: List[List[str]], - X_headers_st: List[str], - X_values_bow: List[List[str]], - schema: str, + x_values_st: List[List[str]], + x_headers_st: List[str], + x_values_bow: List[List[str]], model_name: str, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Union[LabelEncoder, None]]: """ Encode input data in accordance with the user-specified schemas. + :param object vectorizer: scikit-learn vectorizer for bag of words encoding. + :param object label_encoder" Label encoder object storing labels (y) :param int num_rows: Number of rows in the sample metadata - :param list X_values_st: Nested list containing the comma separated values in each column for sentence transformer embeddings. + :param list X_values_st: Nested list containing the comma separated values + in each column for sentence transformer embeddings. :param list X_headers_st: List containing the headers of the DataFrame. - :param list X_values_bow: Nested list containing the comma separated values in each column for Bag of Words encoding. + :param list X_values_bow: Nested list containing the comma separated values + in each column for Bag of Words encoding. :param str schema: Schema type chosen by the user for standardization. - :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Union[LabelEncoder, None]]: Tuple containing torch tensors for encoded embeddings and Bag of Words representations, and label encoder object. + :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, + Union[LabelEncoder, None]]: Tuple containing + torch tensors for encoded embeddings and Bag of Words representations, + and label encoder object. 
""" # Sentence Transformer Model sentence_encoder = SentenceTransformer(model_name) - X_headers_embeddings = sentence_encoder.encode( - X_headers_st, show_progress_bar=False + x_headers_embeddings = sentence_encoder.encode( + x_headers_st, show_progress_bar=False ) # generating embeddings for each element in sublist (column) embeddings = [] - for column in X_values_st: + for column in x_values_st: val_embedding = sentence_encoder.encode(column, show_progress_bar=False) if num_rows >= 10: embedding = get_top_cluster_averaged(val_embedding) @@ -183,96 +189,29 @@ def data_encoding( embedding = get_averaged(val_embedding) embeddings.append(embedding) - X_values_embeddings = embeddings - if schema == "ENCODE": - # Bag of Words Vectorizer - vectorizer = None - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=ENCODE_VECTORIZER_FILENAME, - ) - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) - transformed_columns = [] - for column in X_values_bow: - column_text = " ".join(column) - transformed_column = vectorizer.transform([column_text]) - transformed_columns.append(transformed_column.toarray()[0]) - transformed_columns = np.array(transformed_columns) - # print(transformed_columns) - X_values_bow = transformed_columns - # Label Encoding -<<<<<<< HEAD:bedms/utils.py - label_encoder =None -======= - label_encoder = None ->>>>>>> b13f0a3 (minor changes):attribute_standardizer/utils.py - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=ENCODE_LABEL_ENCODER_FILENAME, - ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) - - elif schema == "FAIRTRACKS": - vectorizer = None - vc_path = hf_hub_download( - repo_id=REPO_ID, filename=FAIRTRACKS_VECTORIZER_FILENAME - ) - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) - transformed_columns = [] - for column in X_values_bow: - column_text = " ".join(column) - transformed_column = vectorizer.transform([column_text]) - transformed_columns.append(transformed_column.toarray()[0]) - transformed_columns = np.array(transformed_columns) - # print(transformed_columns) - X_values_bow = transformed_columns - # Label Encoding - label_encoder = None - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=FAIRTRACKS_LABEL_ENCODER_FILENAME, - ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) - - elif schema == "BEDBASE": - vectorizer = None - vc_path = hf_hub_download(repo_id=REPO_ID, filename=BEDBASE_VECTORIZER_FILENAME) - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) - transformed_columns = [] - for column in X_values_bow: - column_text = " ".join(column) - transformed_column = vectorizer.transform([column_text]) - transformed_columns.append(transformed_column.toarray()[0]) - transformed_columns = np.array(transformed_columns) - # print(transformed_columns) - X_values_bow = transformed_columns - # Label Encoding - label_encoder = None - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=BEDBASE_LABEL_ENCODER_FILENAME, - ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) - - X_headers_embeddings_tensor = torch.tensor( - X_headers_embeddings, dtype=torch.float32 + x_values_embeddings = embeddings + transformed_columns = [] + for column in x_values_bow: + column_text = " ".join(column) + transformed_column = vectorizer.transform([column_text]) + transformed_columns.append(transformed_column.toarray()[0]) + transformed_columns = np.array(transformed_columns) + # print(transformed_columns) + x_values_bow = transformed_columns + + x_headers_embeddings_tensor = 
torch.tensor( + x_headers_embeddings, dtype=torch.float32 ) - X_values_embeddings_tensor = torch.tensor(X_values_embeddings, dtype=torch.float32) - X_values_bow_tensor = torch.tensor(X_values_bow, dtype=torch.float32) - X_values_embeddings_tensor = X_values_embeddings_tensor.squeeze( + x_values_embeddings_tensor = torch.tensor(x_values_embeddings, dtype=torch.float32) + x_values_bow_tensor = torch.tensor(x_values_bow, dtype=torch.float32) + x_values_embeddings_tensor = x_values_embeddings_tensor.squeeze( 1 ) # brings the shape to [num_cols, vocab] return ( - X_headers_embeddings_tensor, - X_values_embeddings_tensor, - X_values_bow_tensor, + x_headers_embeddings_tensor, + x_values_embeddings_tensor, + x_values_bow_tensor, label_encoder, ) @@ -285,11 +224,7 @@ def get_any_pep(pep: str) -> peppy.Project: :return: peppy.Project object. """ - - PEP_FILE_TYPES = ["yaml", "csv"] - res = list(filter(pep.endswith, PEP_FILE_TYPES)) != [] if res: return peppy.Project(pep) - else: - return peppy.Project.from_pephub(pep) + return peppy.Project.from_pephub(pep) From 55d9a1473efc0bd4809470cfc28a3f6a50f55290 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 16 Sep 2024 14:27:04 -0400 Subject: [PATCH 25/27] conflict changes --- bedms/attr_standardizer.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index a0ff644..c61f4cb 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -1,6 +1,7 @@ """ This module has the class AttrStandardizer for 'bedms'. """ + import logging from typing import Dict, Tuple, Union import pickle @@ -38,8 +39,8 @@ fetch_from_pephub, get_any_pep, load_from_huggingface, - hf_hub_download, ) +from huggingface_hub import hf_hub_download logging.basicConfig(level=logging.INFO) logger = logging.getLogger(PROJECT_NAME) @@ -49,6 +50,7 @@ class AttrStandardizer: """ This is the AttrStandardizer class which holds the models for Attribute Standardization. """ + def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. @@ -93,20 +95,15 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_BEDBASE, DROPOUT_PROB, ) - raise ValueError( - f"Schema not available: {self.schema}." - "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" - ) + f"Schema not available: {self.schema}." + "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" + ) def _load_model(self) -> tuple[nn.Module, object, object]: """ -<<<<<<< HEAD:bedms/attr_standardizer.py Calls function to load the model from HuggingFace repository load vectorizer and label encoder and sets to eval(). -======= - Calls function to load the model from HuggingFace repository and sets to eval(). ->>>>>>> b13f0a3f927e52b5ccb9ec6f36064f19a1f5af9e:attribute_standardizer/attr_standardizer.py :return nn.Module: Loaded Neural Network Model. :return object: The scikit learn vectorizer for bag of words encoding. :return object: Label encoder object for the labels (y). @@ -173,8 +170,8 @@ def standardize( self, pep: Union[str, peppy.Project] ) -> Dict[str, Dict[str, float]]: """ - Fetches the user provided PEP - from the PEPHub registry path, + Fetches the user provided PEP + from the PEPHub registry path, returns the predictions. :param str pep: peppy.Project object or PEPHub registry path to PEP. 
From 8d8b1a6977c4303c6f5e8e273d026718cb39aa64 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 16 Sep 2024 14:31:23 -0400 Subject: [PATCH 26/27] minor changes --- bedms/__init__.py | 1 + bedms/attr_standardizer.py | 2 +- bedms/model.py | 4 ++-- bedms/utils.py | 17 +++++++++-------- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/bedms/__init__.py b/bedms/__init__.py index cdbc2d6..d0d13a5 100644 --- a/bedms/__init__.py +++ b/bedms/__init__.py @@ -1,4 +1,5 @@ """ This module initializes 'bedms' package. """ + from .attr_standardizer import AttrStandardizer diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index c61f4cb..6fa3b2e 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -100,7 +100,7 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" ) - def _load_model(self) -> tuple[nn.Module, object, object]: + def _load_model(self) -> Tuple[nn.Module, object, object]: """ Calls function to load the model from HuggingFace repository load vectorizer and label encoder and sets to eval(). diff --git a/bedms/model.py b/bedms/model.py index 94bd9da..52eed64 100644 --- a/bedms/model.py +++ b/bedms/model.py @@ -20,9 +20,9 @@ def __init__( Initializes the BoWSTModel. :param int input_size_values: Size of the input for the values (BoW). - :param int inout_size_values_embeddings: Size of the input + :param int inout_size_values_embeddings: Size of the input for the values sentence transformer embeddings. - :param int input_size_headers: Size of the input + :param int input_size_headers: Size of the input for the headers with sentence transformer embeddings. :param int hidden_size: Size of the hidden layer. :param int output_size: Size of the output layer. diff --git a/bedms/utils.py b/bedms/utils.py index f27fd82..0dcb613 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,6 +1,7 @@ """ This module has all util functions for 'bedms' """ + import warnings from collections import Counter from typing import Any, List, Optional, Tuple, Union @@ -21,7 +22,7 @@ MODEL_FAIRTRACKS, NUM_CLUSTERS, REPO_ID, - PEP_FILE_TYPES + PEP_FILE_TYPES, ) # TODO : convert to single np array before converting to tensor @@ -69,10 +70,10 @@ def data_preprocessing( :param pd.DataFrame df: The input DataFrame (user chosen PEP) to preprocess. :return Tuple[List[List[str]], List[str], List[List[str]]]: - - Nested list containing the comma separated values + - Nested list containing the comma separated values in each column for sentence transformer embeddings. - List containing the headers of the DataFrame. - - Nested list containing the comma separated values + - Nested list containing the comma separated values in each column for Bag of Words encoding. - Number of rows in the metadata csv """ @@ -163,15 +164,15 @@ def data_encoding( :param object vectorizer: scikit-learn vectorizer for bag of words encoding. :param object label_encoder" Label encoder object storing labels (y) :param int num_rows: Number of rows in the sample metadata - :param list X_values_st: Nested list containing the comma separated values + :param list X_values_st: Nested list containing the comma separated values in each column for sentence transformer embeddings. :param list X_headers_st: List containing the headers of the DataFrame. 
- :param list X_values_bow: Nested list containing the comma separated values + :param list X_values_bow: Nested list containing the comma separated values in each column for Bag of Words encoding. :param str schema: Schema type chosen by the user for standardization. - :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Union[LabelEncoder, None]]: Tuple containing - torch tensors for encoded embeddings and Bag of Words representations, + :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, + Union[LabelEncoder, None]]: Tuple containing + torch tensors for encoded embeddings and Bag of Words representations, and label encoder object. """ # Sentence Transformer Model From d429077464360e48e137722e0382d474d6df76a6 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Sep 2024 15:15:30 -0400 Subject: [PATCH 27/27] Added changelog --- docs/changelog.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/changelog.md diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..747b21b --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,8 @@ +# Changelog + +This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. + + +## [0.1.0] - 2024-09-16 +### Added +- initial project release \ No newline at end of file
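
For illustration (an editorial sketch, not an additional patch): this is roughly how the package reads end-to-end once the series lands, assembled from the README, docstrings, and tests above. The `confidence` keyword is the `__init__` parameter shown earlier in the series (its default, CONFIDENCE_THRESHOLD, lives in const.py); the 0.70 value here is arbitrary. The loop shape follows the `Dict[str, Dict[str, float]]` return type documented on `standardize()`.

```python
# Sketch only -- reflects the API at the end of the series
# (v0.1.0 per the changelog above).
from bedms import AttrStandardizer
from bedms.const import AVAILABLE_SCHEMAS

print(AVAILABLE_SCHEMAS)  # ['ENCODE', 'FAIRTRACKS', 'BEDBASE']

# `confidence` overrides the default CONFIDENCE_THRESHOLD; 0.70 is an
# arbitrary illustrative value, compared against the softmax probabilities.
model = AttrStandardizer("ENCODE", confidence=0.70)

# A PEPHub registry path; per get_any_pep() in utils.py, a local PEP ending
# in .yaml or .csv should also be accepted where a path is expected.
results = model.standardize(pep="geo/gse228634:default")

# standardize() returns Dict[str, Dict[str, float]]:
# input attribute -> {suggested standard attribute: confidence score}.
for attribute, suggestions in results.items():
    for suggested, score in suggestions.items():
        print(f"{attribute} -> {suggested} ({score:.2f})")
```
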