From 4e62012447ef4b5027aaeed09e0a1ccd85757824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 2 Aug 2021 16:21:55 +0200 Subject: [PATCH 01/61] feat(mlflow): artifact logging * Model artifact for inference or deployment * Eval artifact logging all the metrics * Label artifact logging labels in order of the predictions --- .../oguz/huggingface-multihead/train.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index b99aa14..0c3b5e6 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -9,7 +9,7 @@ from sklearn.metrics import accuracy_score, precision_recall_fscore_support from transformers import AutoModel, AutoTokenizer, TrainingArguments -from constants import PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D +from constants import SECTORS, PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D from data import MultiHeadDataFrame from model import MultiHeadTransformer from trainer import MultiHeadTrainer @@ -61,8 +61,8 @@ parser.add_argument("--experiment_name", type=str) # SageMaker parameters - data, model, and output directories - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) + parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) @@ -263,10 +263,32 @@ def _process(text): eval_result = trainer.evaluate(eval_dataset=test_dataset) # write eval result to file which can be accessed later in s3 ouput - with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: + eval_file = os.path.join(args.output_data_dir, "eval_results.txt") + with open(eval_file, "w") as writer: print("***** Eval results *****") for key, value in sorted(eval_result.items()): writer.write(f"{key} = {value}\n") + mlflow.log_artifact(eval_file) + + # get labels + if args.target == "sectors": + labels = SECTORS + elif groups is not None: + if args.iterative: + labels = [[gn + ":"] + gs for gs, gn in zip(groups, group_names)] + labels = [label for ls in labels for label in ls] + else: + labels = [label for gs in groups for label in gs] + else: + labels = None + + # write groups to a file which can be accessed later in s3 ouput + label_file = os.path.join(args.output_data_dir, "labels.txt") + with open(label_file, "w") as writer: + print("***** Labels *****") + for label in labels: + writer.write(f"{label}\n") + mlflow.log_artifact(label_file) # write eval result to MLFlow for key, value in sorted(eval_result.items()): @@ -274,11 +296,15 @@ def _process(text): # log experiment params to MLFlow mlflow.log_params(vars(args)) - mlflow.log_params({"groups": groups, "group_names": group_names}) # set tags mlflow.set_tags({"split": args.split, "iterative": args.iterative}) + # log model + mlflow.pytorch.log_model( + trainer.model, artifact_path="model", registered_model_name="multi-head-transformer" + ) + # finish mlflow run mlflow.end_run() except Exception as e: 
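A minimal standalone sketch of the artifact-logging pattern this patch introduces, reduced to a local MLflow file store (the metric values, label list, and toy model are placeholder assumptions, and registered_model_name is omitted because it requires a registry-backed tracking store):

    import os
    import mlflow
    import mlflow.pytorch
    import torch

    output_dir = os.environ.get("SM_OUTPUT_DATA_DIR", ".")
    eval_result = {"eval_loss": 0.21, "eval_micro_f1": 0.74}   # placeholder metrics
    labels = ["Agriculture", "Education", "Health"]            # placeholder label order

    with mlflow.start_run():
        # 1) evaluation artifact: one "key = value" line per metric, also logged as metrics
        eval_file = os.path.join(output_dir, "eval_results.txt")
        with open(eval_file, "w") as writer:
            for key, value in sorted(eval_result.items()):
                writer.write(f"{key} = {value}\n")
                mlflow.log_metric(key, value)
        mlflow.log_artifact(eval_file)

        # 2) label artifact: kept in prediction order so indices map back to names
        label_file = os.path.join(output_dir, "labels.txt")
        with open(label_file, "w") as writer:
            for label in labels:
                writer.write(f"{label}\n")
        mlflow.log_artifact(label_file)

        # 3) model artifact: a toy module stands in for the trained transformer
        mlflow.pytorch.log_model(torch.nn.Linear(4, 2), artifact_path="model")

Writing labels.txt in prediction order is what lets a downstream consumer map prediction indices back to label names, which the pyfunc wrapper introduced in later patches relies on.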
From 9bf8cd6f7b5740d9a92221bcebef76f4150f8d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 3 Aug 2021 12:12:30 +0200 Subject: [PATCH 02/61] feat(data): add maximum tokenizer length as an option * Forces use of padding = "max_length" * `add_special_tokens` and `return_token_type_ids` are set to True --- .../oguz/huggingface-multihead/data.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index dd3124e..60d679f 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -29,6 +29,7 @@ class MultiHeadDataFrame(Dataset): least one target in `sector` or in `pillar2d` fields. flatten: flatten group targets to 1D for convenience online: online or offline tokenization + tokenizer_max_len: maximum output length for the tokenizer """ def __init__( @@ -42,6 +43,7 @@ def __init__( filter: Optional[Union[str, List[str]]] = None, flatten: bool = True, online: bool = False, + tokenizer_max_len: int = 200, ): self.group_names = group_names self.flatten = flatten @@ -69,13 +71,27 @@ def __init__( dataframe = dataframe[pos] self.logger.info(f"Filtered data points with non-empty (or) {','.join(filter)} values") + # prepare tokenizer options + self.tokenizer_options = { + "truncation": True, + "padding": "max_length", + "add_special_tokens": True, + "return_token_type_ids": True, + "max_length": min(tokenizer_max_len, tokenizer.model_max_length), + } + if tokenizer.model_max_length < tokenizer_max_len: + self.logger.info( + f"Using maximum model length: {tokenizer.model_max_length} instead" + f"of given length: {tokenizer_max_len}" + ) + if self.online: # save data as exceprt self.data = dataframe[source].tolist() else: # tokenize and save source data - self.logger.info(f"Applying offline tokenization") - self.data = tokenizer(dataframe[source].tolist(), truncation=True, padding=True) + self.logger.info("Applying offline tokenization") + self.data = tokenizer(dataframe[source].tolist(), **self.tokenizer_options) # prepare target encoding all_targets = np.hstack(dataframe[target].to_numpy()) @@ -160,7 +176,7 @@ def __len__(self): def __getitem__(self, idx): if self.online: - data = self.tokenizer(self.data[idx : idx + 1], truncation=True, padding=True) + data = self.tokenizer(self.data[idx : idx + 1], **self.tokenizer_options) item = {key: torch.tensor(val[0]) for key, val in data.items()} else: item = {key: torch.tensor(val[idx]) for key, val in self.data.items()} From 97730a66c19960d1191c2d94fd5380095e55ed6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 3 Aug 2021 15:38:59 +0200 Subject: [PATCH 03/61] feat(data): add inference option --- .../oguz/huggingface-multihead/data.py | 84 +++++++++++-------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 60d679f..83caf89 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -29,6 +29,7 @@ class MultiHeadDataFrame(Dataset): least one target in `sector` or in `pillar2d` fields. 
flatten: flatten group targets to 1D for convenience online: online or offline tokenization + inference: if True, does not process target or groups tokenizer_max_len: maximum output length for the tokenizer """ @@ -43,12 +44,14 @@ def __init__( filter: Optional[Union[str, List[str]]] = None, flatten: bool = True, online: bool = False, + inference: bool = False, tokenizer_max_len: int = 200, ): self.group_names = group_names self.flatten = flatten self.tokenizer = tokenizer self.online = online + self.inference = inference self.logger = logging.getLogger() # read dataframe manually if given as path @@ -56,9 +59,6 @@ def __init__( self.logger.info(f"Loading dataframe: {dataframe}") dataframe = pd.read_pickle(dataframe) - # apply literal eval to have lists in target - dataframe[target] = dataframe[target].apply(literal_eval) - # cast filter to array if isinstance(filter, str): filter = [filter] @@ -86,6 +86,9 @@ def __init__( ) if self.online: + # ensure that we are in training + assert not self.inference, "Online tokenization is only supported in training-time" + # save data as exceprt self.data = dataframe[source].tolist() else: @@ -93,34 +96,40 @@ def __init__( self.logger.info("Applying offline tokenization") self.data = tokenizer(dataframe[source].tolist(), **self.tokenizer_options) - # prepare target encoding - all_targets = np.hstack(dataframe[target].to_numpy()) - uniq_targets = np.unique(all_targets) + if not self.inference: + # apply literal eval to have lists in target + dataframe[target] = dataframe[target].apply(literal_eval) - # cluster into groups - if groups: - self.group_encoding = {t: idx for idx, group in enumerate(groups) for t in group} - self.group_decoding = {idx: group for idx, group in enumerate(groups)} + # prepare target encoding + all_targets = np.hstack(dataframe[target].to_numpy()) + uniq_targets = np.unique(all_targets) - self.target_encoding = [{t: idx for idx, t in enumerate(group)} for group in groups] - self.target_decoding = [revdict(encoding) for encoding in self.target_encoding] - self.target_classes = [len(encoding.keys()) for encoding in self.target_encoding] - else: - self.group_encoding = {t: 0 for t in uniq_targets} - self.group_decoding = {0: uniq_targets} + if groups: + # process given groups + self.group_encoding = {t: idx for idx, group in enumerate(groups) for t in group} + self.group_decoding = {idx: group for idx, group in enumerate(groups)} + + self.target_encoding = [{t: idx for idx, t in enumerate(group)} for group in groups] + self.target_decoding = [revdict(encoding) for encoding in self.target_encoding] + self.target_classes = [len(encoding.keys()) for encoding in self.target_encoding] + else: + # single group encoding - decoding + self.group_encoding = {t: 0 for t in uniq_targets} + self.group_decoding = {0: uniq_targets} - self.target_encoding = {t: idx for idx, t in enumerate(uniq_targets)} - self.target_encoding = revdict(self.target_encoding) - self.target_classes = [len(self.target_encoding.keys())] + self.target_encoding = {t: idx for idx, t in enumerate(uniq_targets)} + self.target_encoding = revdict(self.target_encoding) + self.target_classes = [len(self.target_encoding.keys())] - self.logger.info(f"Automatically set target encodings: {self.target_encoding}") - self.logger.info(f"Target size: [{self.target_classes}]") + self.logger.info(f"Using target encodings: {self.target_encoding}") + self.logger.info(f"Target size: [{self.target_classes}]") - # prepare targets - self.target = [self.onehot_encode(ts) for ts in 
dataframe[target].tolist()] + # prepare targets + self.target = [self.onehot_encode(ts) for ts in dataframe[target].tolist()] - if groups: - self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] + # prepare group targets + if groups: + self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] def group_encode(self, targets: List[str]) -> np.ndarray: """Encodes given targets to group representation""" @@ -181,17 +190,18 @@ def __getitem__(self, idx): else: item = {key: torch.tensor(val[idx]) for key, val in self.data.items()} - if self.flatten: - item["labels"] = torch.tensor(self.target[idx]) - else: - item.update( - { - f"labels_{self.group_names[i]}": torch.tensor(self.target[idx][i]) - for i in range(len(self.target_classes)) - } - ) - - if self.group is not None: - item["groups"] = torch.tensor(self.group[idx]) + if self.inference: + if self.flatten: + item["labels"] = torch.tensor(self.target[idx]) + else: + item.update( + { + f"labels_{self.group_names[i]}": torch.tensor(self.target[idx][i]) + for i in range(len(self.target_classes)) + } + ) + + if self.group is not None: + item["groups"] = torch.tensor(self.group[idx]) return item From 1a54e3e0893ed71c07df841a276d3b5df9a52b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 3 Aug 2021 21:29:12 +0200 Subject: [PATCH 04/61] feat(modeling): use a dynamically provided threshold for groups --- scripts/training/oguz/huggingface-multihead/model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index 8d767c9..fd6397d 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -65,7 +65,7 @@ def __init__( ) ) - def forward(self, inputs, gt_groups=None): + def forward(self, inputs, gt_groups=None, group_threshold=0.5): # get hidden representation backbone_outputs = self.backbone(**inputs) if self.pooling: @@ -79,8 +79,12 @@ def forward(self, inputs, gt_groups=None): out_groups = self.heads[0](hidden) # get sample groups - # TODO: dynamic threshold? - groups = gt_groups if self.training and self.use_gt_training else out_groups > 0.5 + # TODO: dynamic threshold (per group?) 
+ groups = ( + gt_groups + if self.training and self.use_gt_training + else out_groups > group_threshold + ) # execute each classification task out_targets = [] From b251fa8e1afcd2ebbffddf77caeba7732be9eb0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 3 Aug 2021 21:31:57 +0200 Subject: [PATCH 05/61] feat(mlflow): add mlflow prediction wrapper * Use context to load inference hyperparameters --- .../oguz/huggingface-multihead/mlflow.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 scripts/training/oguz/huggingface-multihead/mlflow.py diff --git a/scripts/training/oguz/huggingface-multihead/mlflow.py b/scripts/training/oguz/huggingface-multihead/mlflow.py new file mode 100644 index 0000000..732a407 --- /dev/null +++ b/scripts/training/oguz/huggingface-multihead/mlflow.py @@ -0,0 +1,100 @@ +import sys +import json + +sys.path.append(".") + +import pandas as pd +import numpy as np +import torch +import mlflow +from torch.utils.data import DataLoader + +from data import MultiHeadDataFrame + + +def extract_predictions(logits, threshold): + logits = logits > threshold + + preds = [] + for i in range(logits.shape[0]): + preds.append(logits[i, np.nonzero(logits[i, :])].tolist()) + return preds + + +class MLFlowWrapper(mlflow.pyfunc.PythonModel): + def __init__(self, tokenizer, model): + self.tokenizer = tokenizer + self.model = model.eval() + super().__init__() + + def load_context(self, context): + # process inference params + with open(context["infer_params"], "r") as f: + self.infer_params = json.load(f) + + # sanity checks for dataset params + dataset_params = self.infer_params["dataset"] + assert "filter" not in dataset_params, "Can't use a filter in an inference dataset!" + + if "inference" in dataset_params: + assert dataset_params["inference"], "Can only use an inference dataset!" 
+ dataset_params.pop("inference") + + def predict(self, context, model_input): + # get dataset and data loader + dataset = MultiHeadDataFrame( + model_input, + tokenizer=self.tokenizer, + filter=None, + inference=True, + **self.infer_params["dataset"], + ) + dataloader = DataLoader(dataset, **self.infer_params["dataloader"]) + + # containers for logits + logits_targets = [] + if self.model.iterative: + logits_groups = [] + + # forward pass + with torch.no_grad(): + for batch in dataloader: + if self.model.iterative: + batch_groups, batch_targets = self.model.forward( + batch, group_threshold=self.infer_params["threshold"]["group"] + ) + logits_groups.append(batch_groups.detach().numpy()) + else: + batch_targets = self.model.forward(batch) + logits_targets.append(batch_targets.detach().numpy()) + + logits_targets = np.concatenate(logits_targets, axis=0) + preds_targets = extract_predictions( + logits_targets, self.infer_params["threshold"]["target"] + ) + output = { + "logits": [ + ",".join(f"{score:.3f}") + for i in range(logits_targets.shape[0]) + for score in logits_targets[i, :].tolist() + ], + "predictions": [",".join(preds) for preds in preds_targets], + } + + if self.model.iterative: + logits_groups = np.concatenate(logits_groups, axis=0) + preds_groups = extract_predictions( + logits_groups, self.infer_params["threshold"]["group"] + ) + output.extend( + { + "logits_group": [ + ",".join(f"{score:.3f}") + for i in range(logits_groups.shape[0]) + for score in logits_groups[i, :].tolist() + ], + "predictions_group": [",".join(preds) for preds in preds_groups], + } + ) + + return pd.DataFrame.from_dict(output) From 47adf42ca520a83c148ded6fb1e516c686a41619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 3 Aug 2021 21:32:42 +0200 Subject: [PATCH 06/61] feat(mlflow): add artifact for inference params Notes: * Fix argparse batch_size arguments * Add sanity check for iterative models * Log labels and groups separately * Log inference params altogether inside a json file --- .../huggingface-multihead/requirements.txt | 2 +- .../oguz/huggingface-multihead/train.py | 75 ++++++++++++++----- .../oguz/huggingface-multihead/utils.py | 15 ++++ 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/requirements.txt b/scripts/training/oguz/huggingface-multihead/requirements.txt index 91206f3..6d4beb3 100644 --- a/scripts/training/oguz/huggingface-multihead/requirements.txt +++ b/scripts/training/oguz/huggingface-multihead/requirements.txt @@ -1,2 +1,2 @@ transformers==4.6.1 -mlflow==1.12.1 \ No newline at end of file +mlflow==1.18.0 diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 0c3b5e6..24f13b2 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -2,6 +2,7 @@ import logging import os import sys +import json import mlflow import pandas as pd @@ -12,16 +13,17 @@ from constants import SECTORS, PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D from data import MultiHeadDataFrame from model import MultiHeadTransformer +from mlflow import MLFlowWrapper from trainer import MultiHeadTrainer -from utils import str2bool, str2list +from utils import str2bool, str2list, get_conda_env_specs if __name__ == "__main__": parser = argparse.ArgumentParser() # hyperparameters sent by the client parser.add_argument("--epochs", type=int, default=3) - 
parser.add_argument("--train-batch-size", type=int, default=32) - parser.add_argument("--eval-batch-size", type=int, default=64) + parser.add_argument("--train_batch_size", type=int, default=32) + parser.add_argument("--eval_batch_size", type=int, default=64) parser.add_argument("--warmup_steps", type=int, default=500) parser.add_argument("--learning_rate", type=str, default=5e-5) parser.add_argument("--dropout", type=float, default=0.3) @@ -98,6 +100,10 @@ groups = None group_names = None + # sanity check for iterative option + if args.iterative: + assert groups is not None, "Provide groups for the 'iterative' option" + # build classifier model from backbone model = MultiHeadTransformer( backbone, @@ -270,39 +276,72 @@ def _process(text): writer.write(f"{key} = {value}\n") mlflow.log_artifact(eval_file) + # write eval result to mlflow + for key, value in sorted(eval_result.items()): + mlflow.log_metric(key, value) + # get labels - if args.target == "sectors": + if groups is not None: + labels = [label for gs in groups for label in gs] + elif args.target == "sectors": labels = SECTORS - elif groups is not None: - if args.iterative: - labels = [[gn + ":"] + gs for gs, gn in zip(groups, group_names)] - labels = [label for ls in labels for label in ls] - else: - labels = [label for gs in groups for label in gs] else: labels = None - # write groups to a file which can be accessed later in s3 ouput + # output labels artifact label_file = os.path.join(args.output_data_dir, "labels.txt") with open(label_file, "w") as writer: - print("***** Labels *****") for label in labels: writer.write(f"{label}\n") mlflow.log_artifact(label_file) - # write eval result to MLFlow - for key, value in sorted(eval_result.items()): - mlflow.log_metric(key, value) + if args.iterative: + # get gorups + labels = [label for label in group_names] + + # output groups artifact + group_file = os.path.join(args.output_data_dir, "groups.txt") + with open(group_file, "w") as writer: + for label in labels: + writer.write(f"{label}\n") + mlflow.log_artifact(group_file) # log experiment params to MLFlow mlflow.log_params(vars(args)) - # set tags + # set experiment tags mlflow.set_tags({"split": args.split, "iterative": args.iterative}) - # log model + # output inference artifact + infer_file = os.path.join(args.output_data_dir, "infer_params.json") + with open(infer_file, "w") as writer: + json.dump( + { + "dataset": { + "source": "excerpt", + "target": args.target, + "flatten": True, + }, + "dataloader": { + "batch_size": args.eval_batch_size, + "shuffle": False, + "num_workers": 0, + }, + "threshold": {"group": 0.5, "target": 0.5}, + }, + writer, + ) + mlflow.log_artifact(infer_file) + + # log model with an inference wrapper + mlflow_wrapper = MLFlowWrapper(tokenizer, trainer.model) mlflow.pytorch.log_model( - trainer.model, artifact_path="model", registered_model_name="multi-head-transformer" + mlflow_wrapper, + artifact_path="model", + registered_model_name="multi-head-transformer", + artifacts={"infer_params": infer_file}, + conda_env=get_conda_env_specs(), + code_path=[__file__, "data.py", "model.py"], ) # finish mlflow run diff --git a/scripts/training/oguz/huggingface-multihead/utils.py b/scripts/training/oguz/huggingface-multihead/utils.py index 9effce9..d120b56 100644 --- a/scripts/training/oguz/huggingface-multihead/utils.py +++ b/scripts/training/oguz/huggingface-multihead/utils.py @@ -1,9 +1,12 @@ from collections import OrderedDict from typing import Dict + import argparse +from pathlib import Path import torch 
import torch.nn.functional as F +import mlflow def revdict(d: Dict): @@ -36,6 +39,18 @@ def str2list(v, sep=","): raise argparse.ArgumentTypeError("String value expected.") +def get_conda_env_specs(): + requirement_file = str(Path(__file__).parent / "requirements.txt") + with open(requirement_file, "r") as f: + requirements = f.readlines() + requirements = [x.replace("\n", "") for x in requirements] + + default_env = mlflow.pytorch.get_default_conda_env() + pip_dependencies = default_env["dependencies"][2]["pip"] + pip_dependencies.extend(requirements) + return default_env + + def build_mlp( depth: int, in_features: int, From cceab6a0c16038d753e177af34a6bd4cf7fe46d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 3 Aug 2021 22:29:42 +0200 Subject: [PATCH 07/61] fix(mlflow): use sigmoid on raw network scores --- scripts/training/oguz/huggingface-multihead/mlflow.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/mlflow.py b/scripts/training/oguz/huggingface-multihead/mlflow.py index 732a407..c2b5d70 100644 --- a/scripts/training/oguz/huggingface-multihead/mlflow.py +++ b/scripts/training/oguz/huggingface-multihead/mlflow.py @@ -63,9 +63,11 @@ def predict(self, context, model_input): batch_groups, batch_targets = self.model.forward( batch, group_threshold=self.infer_params["threshold"]["group"] ) + batch_groups = torch.sigmoid(batch_groups) + batch_targets = torch.sigmoid(batch_targets) logits_groups.append(batch_groups.detach().numpy()) else: - batch_targets = self.model.forward(batch) + batch_targets = torch.sigmoid(self.model.forward(batch)) logits_targets.append(batch_targets.detach().numpy()) logits_targets = np.concatenate(logits_targets, axis=0) @@ -82,7 +84,7 @@ def predict(self, context, model_input): } if self.model.iterative: - logits_groups = np.concatenate(logits_groups, axis=0) + logits_groups = np.sigmoidnp.concatenate(logits_groups, axis=0) preds_groups = extract_predictions( logits_groups, self.infer_params["threshold"]["group"] ) From 83f31ef7ee33869ec19e9d080e7d4bb89d76c1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 09:23:11 +0200 Subject: [PATCH 08/61] fix(mlflow): rename mlflow.py to infer.py * Fix MLFlow import related issues --- .../oguz/huggingface-multihead/{mlflow.py => infer.py} | 4 ++-- scripts/training/oguz/huggingface-multihead/train.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename scripts/training/oguz/huggingface-multihead/{mlflow.py => infer.py} (97%) diff --git a/scripts/training/oguz/huggingface-multihead/mlflow.py b/scripts/training/oguz/huggingface-multihead/infer.py similarity index 97% rename from scripts/training/oguz/huggingface-multihead/mlflow.py rename to scripts/training/oguz/huggingface-multihead/infer.py index c2b5d70..a31b9a6 100644 --- a/scripts/training/oguz/huggingface-multihead/mlflow.py +++ b/scripts/training/oguz/huggingface-multihead/infer.py @@ -6,7 +6,7 @@ import pandas as pd import numpy as np import torch -import mlflow +from mlflow.pyfunc import PythonModel from torch.utils.data import DataLoader from data import MultiHeadDataFrame @@ -21,7 +21,7 @@ def extract_predictions(logits, threshold): return preds -class MLFlowWrapper(mlflow.pyfunc.PythonModel): +class MLFlowWrapper(PythonModel): def __init__(self, tokenizer, model): self.tokenizer = tokenizer self.model = model.eval() diff --git a/scripts/training/oguz/huggingface-multihead/train.py 
b/scripts/training/oguz/huggingface-multihead/train.py index 24f13b2..bb3553a 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -13,7 +13,7 @@ from constants import SECTORS, PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D from data import MultiHeadDataFrame from model import MultiHeadTransformer -from mlflow import MLFlowWrapper +from infer import MLFlowWrapper from trainer import MultiHeadTrainer from utils import str2bool, str2list, get_conda_env_specs From 81c0c325cc2be97b362a3ab86caa888d5f86f8ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 10:05:00 +0200 Subject: [PATCH 09/61] fix(data): prepare targets and groups --- scripts/training/oguz/huggingface-multihead/data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 83caf89..50a9fb8 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -124,12 +124,12 @@ def __init__( self.logger.info(f"Using target encodings: {self.target_encoding}") self.logger.info(f"Target size: [{self.target_classes}]") - # prepare targets - self.target = [self.onehot_encode(ts) for ts in dataframe[target].tolist()] + # prepare targets + self.target = [self.onehot_encode(ts) for ts in dataframe[target].tolist()] - # prepare group targets - if groups: - self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] + # prepare group targets + if groups: + self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] def group_encode(self, targets: List[str]) -> np.ndarray: """Encodes given targets to group representation""" From 4bf767c251f649b1ab591d8878b303eabbe78101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 10:16:29 +0200 Subject: [PATCH 10/61] fix(data): provide targets in non-inference mode --- scripts/training/oguz/huggingface-multihead/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 50a9fb8..137dbca 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -190,7 +190,7 @@ def __getitem__(self, idx): else: item = {key: torch.tensor(val[idx]) for key, val in self.data.items()} - if self.inference: + if not self.inference: if self.flatten: item["labels"] = torch.tensor(self.target[idx]) else: From 29af38cc14611ddf6bbacda168ba47d2c84e297e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 10:22:29 +0200 Subject: [PATCH 11/61] docs(mlflow): add additional considerations to MLFlow deployment --- docs/source/modeling/tracking.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/modeling/tracking.rst b/docs/source/modeling/tracking.rst index 2b89f0b..1ee21fd 100644 --- a/docs/source/modeling/tracking.rst +++ b/docs/source/modeling/tracking.rst @@ -34,3 +34,8 @@ You can find an example of deployment in the repo. - The key of the deployment is creating a class that inherits from `mlflow.pyfunc.PythonModel` with a `predict()` function. - That class is pickled and logged as artifact of the training. At inference time it will be used to make predictions. 
+Additionally, consider the following for more configurable deployment: + +- *Dynamic inference parameters*: Store inference hyperparameters (e.g., batch size or thresholds) as a separate artifact in MLFlow. Use `artifacts` options in `log_model` and then retrieve the file using the `context` object provided by the MLFlow in `load_context` or `predict`. +- *Multiple outputs*: `predict` function can return a Pandas DataFrame object. Employ it if the model has multiple targets or for providing logits scores for dynamic threshold adjusting on the client-side. +- *Serving labels*: Log a separate artifact in MLFlow for the client-side to map predictions back to human-readable labels. From 9f339ac8f3b58ad851b643e33a6df52958737a41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 10:35:13 +0200 Subject: [PATCH 12/61] fix(data): add token types only if it is supported --- scripts/training/oguz/huggingface-multihead/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 137dbca..1f45848 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -76,7 +76,7 @@ def __init__( "truncation": True, "padding": "max_length", "add_special_tokens": True, - "return_token_type_ids": True, + "return_token_type_ids": "token_type_ids" in tokenizer.model_input_names, "max_length": min(tokenizer_max_len, tokenizer.model_max_length), } if tokenizer.model_max_length < tokenizer_max_len: From dff1e5f87d6ea74070bdd4bc0cbe10c60029aed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 11:24:10 +0200 Subject: [PATCH 13/61] feat(mlflow): add deployment option --- .../oguz/huggingface-multihead/train.py | 22 ++++++++++--------- .../{infer.py => wrapper.py} | 4 ++-- 2 files changed, 14 insertions(+), 12 deletions(-) rename scripts/training/oguz/huggingface-multihead/{infer.py => wrapper.py} (97%) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index bb3553a..b3224e2 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -13,7 +13,7 @@ from constants import SECTORS, PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D from data import MultiHeadDataFrame from model import MultiHeadTransformer -from infer import MLFlowWrapper +from wrapper import MLFlowWrapper from trainer import MultiHeadTrainer from utils import str2bool, str2list, get_conda_env_specs @@ -61,6 +61,7 @@ # MLFlow related parameters parser.add_argument("--tracking_uri", type=str) parser.add_argument("--experiment_name", type=str) + parser.add_argument("--deploy", type=str2bool, default=True) # SageMaker parameters - data, model, and output directories parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) @@ -334,15 +335,16 @@ def _process(text): mlflow.log_artifact(infer_file) # log model with an inference wrapper - mlflow_wrapper = MLFlowWrapper(tokenizer, trainer.model) - mlflow.pytorch.log_model( - mlflow_wrapper, - artifact_path="model", - registered_model_name="multi-head-transformer", - artifacts={"infer_params": infer_file}, - conda_env=get_conda_env_specs(), - code_path=[__file__, "data.py", "model.py"], - ) + if args.deploy: + mlflow_wrapper = MLFlowWrapper(tokenizer, trainer.model) + 
mlflow.pyfunc.log_model( + mlflow_wrapper, + artifact_path="model", + registered_model_name="multi-head-transformer", + artifacts={"infer_params": infer_file}, + conda_env=get_conda_env_specs(), + code_path=[__file__, "data.py", "model.py"], + ) # finish mlflow run mlflow.end_run() diff --git a/scripts/training/oguz/huggingface-multihead/infer.py b/scripts/training/oguz/huggingface-multihead/wrapper.py similarity index 97% rename from scripts/training/oguz/huggingface-multihead/infer.py rename to scripts/training/oguz/huggingface-multihead/wrapper.py index a31b9a6..c2b5d70 100644 --- a/scripts/training/oguz/huggingface-multihead/infer.py +++ b/scripts/training/oguz/huggingface-multihead/wrapper.py @@ -6,7 +6,7 @@ import pandas as pd import numpy as np import torch -from mlflow.pyfunc import PythonModel +import mlflow from torch.utils.data import DataLoader from data import MultiHeadDataFrame @@ -21,7 +21,7 @@ def extract_predictions(logits, threshold): return preds -class MLFlowWrapper(PythonModel): +class MLFlowWrapper(mlflow.pyfunc.PythonModel): def __init__(self, tokenizer, model): self.tokenizer = tokenizer self.model = model.eval() From fa86267440396a6a0858af87b982e74305cbdf44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 11:25:05 +0200 Subject: [PATCH 14/61] chore(): delete TODO file --- scripts/training/oguz/huggingface-multihead/TODO | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 scripts/training/oguz/huggingface-multihead/TODO diff --git a/scripts/training/oguz/huggingface-multihead/TODO b/scripts/training/oguz/huggingface-multihead/TODO deleted file mode 100644 index c497d20..0000000 --- a/scripts/training/oguz/huggingface-multihead/TODO +++ /dev/null @@ -1,8 +0,0 @@ -1. Training logic -2. MLFlow hyperparameters logging -2. MLFlow artifact logging -3. Tensorboard training + evaluation logging artifacts -4. HuggingFace: skip_memory_metrics -5. Check Selim's notebook for tricks on optimizer, scheduler, etc. -6. AUC-ROC -7. Not logging parameters I have set? 
\ No newline at end of file From 44fe9d8a6c08d3c315ee07cb3de715f1cd787124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 12:26:03 +0200 Subject: [PATCH 15/61] refactor(mlflow): use 'predictions' instead of 'logits' --- .../oguz/huggingface-multihead/wrapper.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/wrapper.py b/scripts/training/oguz/huggingface-multihead/wrapper.py index c2b5d70..7590133 100644 --- a/scripts/training/oguz/huggingface-multihead/wrapper.py +++ b/scripts/training/oguz/huggingface-multihead/wrapper.py @@ -52,9 +52,9 @@ def predict(self, context, model_input): dataloader = DataLoader(dataset, **self.infer_params["dataloader"]) # containers for logits - logits_targets = [] + probs_targets = [] if self.model.iterative: - logits_groups = [] + probs_groups = [] # forward pass with torch.no_grad(): @@ -65,35 +65,33 @@ def predict(self, context, model_input): ) batch_groups = torch.sigmoid(batch_groups) batch_targets = torch.sigmoid(batch_targets) - logits_groups.append(batch_groups.detach().numpy()) + probs_groups.append(batch_groups.detach().numpy()) else: batch_targets = torch.sigmoid(self.model.forward(batch)) - logits_targets.append(batch_targets.detach().numpy()) + probs_targets.append(batch_targets.detach().numpy()) - logits_targets = np.concatenate(logits_targets, axis=0) - preds_targets = extract_predictions( - logits_targets, self.infer_params["threshold"]["target"] - ) + probs_targets = np.concatenate(probs_targets, axis=0) + preds_targets = extract_predictions(probs_targets, self.infer_params["threshold"]["target"]) output = { - "logits": [ + "probabilities_target": [ ",".join(f"{score:.3f}") - for i in range(logits_targets.shape[0]) - for score in logits_targets[i, :].tolist() + for i in range(probs_targets.shape[0]) + for score in probs_targets[i, :].tolist() ], - "predictions": [",".join(preds) for preds in preds_targets], + "predictions_target": [",".join(preds) for preds in preds_targets], } if self.model.iterative: - logits_groups = np.sigmoidnp.concatenate(logits_groups, axis=0) + probs_groups = np.concatenate(probs_groups, axis=0) preds_groups = extract_predictions( - logits_groups, self.infer_params["threshold"]["group"] + probs_groups, self.infer_params["threshold"]["group"] ) output.extend( { - "logits_group": [ + "probabilities_group": [ ",".join(f"{score:.3f}") - for i in range(logits_groups.shape[0]) - for score in logits_groups[i, :].tolist() + for i in range(probs_groups.shape[0]) + for score in probs_groups[i, :].tolist() ], "predictions_group": [",".join(preds) for preds in preds_groups], } From 71a2c6315cb1627e5a0bef30f8cb148d31689ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 14:08:32 +0200 Subject: [PATCH 16/61] fix(mlflow): use artifact uri in logging model Also fix `python_model` parameter of log_model --- scripts/training/oguz/huggingface-multihead/train.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index b3224e2..b1aec78 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -333,17 +333,18 @@ def _process(text): writer, ) mlflow.log_artifact(infer_file) + infer_file_uri = mlflow.get_artifact_uri(infer_file) # log model with an inference wrapper if 
args.deploy: mlflow_wrapper = MLFlowWrapper(tokenizer, trainer.model) mlflow.pyfunc.log_model( - mlflow_wrapper, + python_model=mlflow_wrapper, artifact_path="model", registered_model_name="multi-head-transformer", - artifacts={"infer_params": infer_file}, + artifacts={"infer_params": infer_file_uri}, conda_env=get_conda_env_specs(), - code_path=[__file__, "data.py", "model.py"], + code_path=[__file__, "data.py", "model.py", "wrapper.py"], ) # finish mlflow run From 11d23ec18f48ed95816c0423e98c446b9b58db30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 14:55:51 +0200 Subject: [PATCH 17/61] fix(mlflow): use proper artifact and code paths --- .../training/oguz/huggingface-multihead/train.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index b1aec78..9a80afb 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -270,9 +270,9 @@ def _process(text): eval_result = trainer.evaluate(eval_dataset=test_dataset) # write eval result to file which can be accessed later in s3 ouput + logging.info("Logging eval results") eval_file = os.path.join(args.output_data_dir, "eval_results.txt") with open(eval_file, "w") as writer: - print("***** Eval results *****") for key, value in sorted(eval_result.items()): writer.write(f"{key} = {value}\n") mlflow.log_artifact(eval_file) @@ -282,6 +282,7 @@ def _process(text): mlflow.log_metric(key, value) # get labels + logging.info("Logging model labels") if groups is not None: labels = [label for gs in groups for label in gs] elif args.target == "sectors": @@ -298,6 +299,7 @@ def _process(text): if args.iterative: # get gorups + logging.info("Logging model groups") labels = [label for label in group_names] # output groups artifact @@ -314,6 +316,7 @@ def _process(text): mlflow.set_tags({"split": args.split, "iterative": args.iterative}) # output inference artifact + logging.info("Logging inference params") infer_file = os.path.join(args.output_data_dir, "infer_params.json") with open(infer_file, "w") as writer: json.dump( @@ -333,10 +336,15 @@ def _process(text): writer, ) mlflow.log_artifact(infer_file) - infer_file_uri = mlflow.get_artifact_uri(infer_file) + infer_file_uri = mlflow.get_artifact_uri("infer_params.json") # log model with an inference wrapper + if args.deploy: + # log model with an inference wrapper + logging.info("Logging deployment model") + data_file = os.path.join(os.path.dirname(__file__), "data.py") + mlflow_wrapper = MLFlowWrapper(tokenizer, trainer.model) mlflow.pyfunc.log_model( python_model=mlflow_wrapper, @@ -344,7 +352,7 @@ def _process(text): registered_model_name="multi-head-transformer", artifacts={"infer_params": infer_file_uri}, conda_env=get_conda_env_specs(), - code_path=[__file__, "data.py", "model.py", "wrapper.py"], + code_path=[__file__, data_file], ) # finish mlflow run From 287a9be3d33bc2f2a4150eea356c4bfbed7bba98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 22:16:55 +0200 Subject: [PATCH 18/61] refactor(mlflow): log label artifacts with the model --- .../{run.py => sagemaker_train.py} | 4 ++-- scripts/training/oguz/huggingface-multihead/train.py | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) rename scripts/training/oguz/huggingface-multihead/{run.py => sagemaker_train.py} (97%) diff --git 
a/scripts/training/oguz/huggingface-multihead/run.py b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py similarity index 97% rename from scripts/training/oguz/huggingface-multihead/run.py rename to scripts/training/oguz/huggingface-multihead/sagemaker_train.py index 2baf198..43753f2 100644 --- a/scripts/training/oguz/huggingface-multihead/run.py +++ b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py @@ -26,7 +26,7 @@ # create SageMaker session sess = sagemaker.Session(default_bucket=DEV_BUCKET.name) -job_name = f"{args.task}-test-{formatted_time()}" +job_name = f"{args.task}-train-{formatted_time()}" # load dataset dataset_version = "0.5" if args.task == "1D" else "0.4.4" @@ -55,7 +55,7 @@ # hyperparameters for the run hyperparameters = { - "epochs": 10, + "epochs": 1, "model_name": "distilbert-base-uncased", "tracking_uri": MLFLOW_SERVER, "experiment_name": f"{args.task}-multihead-transformers", diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 9a80afb..2b9ecef 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -295,7 +295,7 @@ def _process(text): with open(label_file, "w") as writer: for label in labels: writer.write(f"{label}\n") - mlflow.log_artifact(label_file) + artifacts = {"labels": label_file} if args.iterative: # get gorups @@ -307,7 +307,7 @@ def _process(text): with open(group_file, "w") as writer: for label in labels: writer.write(f"{label}\n") - mlflow.log_artifact(group_file) + artifacts.update({"groups": group_file}) # log experiment params to MLFlow mlflow.log_params(vars(args)) @@ -335,10 +335,7 @@ def _process(text): }, writer, ) - mlflow.log_artifact(infer_file) - infer_file_uri = mlflow.get_artifact_uri("infer_params.json") - - # log model with an inference wrapper + artifacts.update({"infer_params": infer_file}) if args.deploy: # log model with an inference wrapper @@ -350,7 +347,7 @@ def _process(text): python_model=mlflow_wrapper, artifact_path="model", registered_model_name="multi-head-transformer", - artifacts={"infer_params": infer_file_uri}, + artifacts=artifacts, conda_env=get_conda_env_specs(), code_path=[__file__, data_file], ) From 8187edd487b6a8609b9951dfde73a40881f4d0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 22:17:55 +0200 Subject: [PATCH 19/61] feat(data): preprocessing for offline testing format --- .../models/oguz/transformer_v0.5_1D.ipynb | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/notebooks/models/oguz/transformer_v0.5_1D.ipynb b/notebooks/models/oguz/transformer_v0.5_1D.ipynb index ea76360..82601ca 100644 --- a/notebooks/models/oguz/transformer_v0.5_1D.ipynb +++ b/notebooks/models/oguz/transformer_v0.5_1D.ipynb @@ -430,6 +430,53 @@ "outputs": [], "metadata": {} }, + { + "cell_type": "code", + "execution_count": null, + "source": [], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Inference Preprocessing (Offline Testing Environment)" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import pandas as pd\n", + "\n", + "DATA_PATH = 'leads.csv'\n", + "data = pd.read_csv(DATA_PATH)\n", + "data.head()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "from ast import literal_eval\n", + "\n", + "lengths = {}\n", + "\n", + "for field in 
['extracted_text_as_paragraphs', 'extracted_text_as_sentences']:\n", + " arr = data[field].apply(literal_eval).tolist()\n", + " lengths[field] = [len(ds) for ds in arr]\n", + " \n", + " infer_df = pd.DataFrame.from_dict({\n", + " 'target': [d for ds in arr for d in ds]\n", + " })\n", + " infer_df.to_csv(f'infer_{field}.csv', header=True, index=True)" + ], + "outputs": [], + "metadata": {} + }, { "cell_type": "code", "execution_count": null, From 816d6dc2b13761c713c0b4773faab850d7a5eb31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 4 Aug 2021 22:34:48 +0200 Subject: [PATCH 20/61] feat(sagemaker): add inference job using MLFlow 'log_model' --- .../oguz/huggingface-multihead/infer.py | 51 +++++++++++++ .../huggingface-multihead/sagemaker_infer.py | 74 +++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 scripts/training/oguz/huggingface-multihead/infer.py create mode 100644 scripts/training/oguz/huggingface-multihead/sagemaker_infer.py diff --git a/scripts/training/oguz/huggingface-multihead/infer.py b/scripts/training/oguz/huggingface-multihead/infer.py new file mode 100644 index 0000000..a3bd855 --- /dev/null +++ b/scripts/training/oguz/huggingface-multihead/infer.py @@ -0,0 +1,51 @@ +import argparse +import logging +import os +import sys + +import mlflow +import pandas as pd + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + # hyperparameters sent by the client + parser.add_argument("--eval_batch_size", type=int, default=64) + parser.add_argument( + "--target", + type=str, + default="target", + help="Prediction target", + ) + parser.add_argument("--model_uri", type=str, required=True) + + # SageMaker parameters - data, model, and output directories + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) + parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) + args, _ = parser.parse_known_args() + + # set up logging + logger = logging.getLogger(__name__) + logging.basicConfig( + level=logging.getLevelName("INFO"), + handlers=[logging.StreamHandler(sys.stdout)], + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + logger.info(f"Args: {args}") + + # load datasets + infer_df = pd.read_pickle(f"{args.test_dir}/infer_df.pickle") + logger.info(f" loaded infer_dataset length is: {infer_df.shape}") + + # get model + loaded_model = mlflow.pyfunc.load_model(args.model_uri) + logging.info(loaded_model.infer_params) + + loaded_model.infer_params["dataset"]["target"] = args.target + loaded_model.infer_params["dataloader"]["batch_size"] = args.eval_batch_size + pred_df = loaded_model.predict(infer_df) + + # save predictions + pred_df.to_csv(f"{args.output_data_dir}/preds.csv", header=True, index=True) diff --git a/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py b/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py new file mode 100644 index 0000000..4f3f326 --- /dev/null +++ b/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py @@ -0,0 +1,74 @@ +import os +import sys +import argparse + +# import main folder for imports +sys.path.append(os.path.abspath(os.getcwd())) + +import pandas as pd + +import sagemaker +from sagemaker.pytorch import PyTorch + +from deep.constants import DEV_BUCKET, SAGEMAKER_ROLE +from deep.utils import formatted_time + +# 
get args +parser = argparse.ArgumentParser() +parser.add_argument( + "--task", + type=str, + default="1D", + choices=["1D", "2D"], +) +parser.add_argument("--debug", action="store_true", default=False) +parser.add_argument("--dataset", type=str, default=None) +parser.add_argument("--target", type=str, default="target") +parser.add_argument("--model_uri", type=str, default=None) +args, _ = parser.parse_known_args() + +# create SageMaker session +sess = sagemaker.Session(default_bucket=DEV_BUCKET.name) + +# job and experiment names +job_name = f"{args.task}-infer-{formatted_time()}" + +# load dataset +infer_df = pd.read_csv(args.dataset) +if args.debug: + infer_df = infer_df.sample(n=1000) + +# upload dataset to s3 +input_path = DEV_BUCKET / "inference" / "input_data" / job_name # Do not change this +infer_path = str(input_path / "infer_df.pickle") + +infer_df.to_pickle( + infer_path, protocol=4 +) # protocol 4 is necessary, since SageMaker uses python 3.6 + +# hyperparameters for inference +hyperparameters = { + "model_uri": args.model_uri, + "target": args.target, +} + +# create SageMaker estimator +estimator = PyTorch( + entry_point="infer.py", + source_dir=str("scripts/training/oguz/huggingface-multihead"), + output_path=str(DEV_BUCKET / "predictions/"), + code_location=str(input_path), + instance_type="ml.p3.2xlarge", + instance_count=1, + role=SAGEMAKER_ROLE, + framework_version="1.8", + py_version="py36", + hyperparameters=hyperparameters, + job_name=job_name, +) + +# set arguments +fit_arguments = {"train": str(input_path), "test": str(input_path)} + +# transform the estimator +estimator.fit(fit_arguments, job_name=job_name) From 85123b09d12ce94b0a7496c4110bf71d1207565a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 12:09:35 +0200 Subject: [PATCH 21/61] feat(runner): reload best model in training --- scripts/training/oguz/huggingface-multihead/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 2b9ecef..fcb5e1c 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -242,6 +242,10 @@ def _process(text): learning_rate=float(args.learning_rate), skip_memory_metrics=False, label_names=["labels", "groups"] if args.iterative else ["labels"], + metric_for_best_model="eval_subpillar_micro_f1", + greater_is_better=True, + load_best_model_at_end=True, + save_total_limit=1, ) # create trainer instance From 3372819dd6b1df8209fc970d73db43447d4aeaa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 12:39:46 +0200 Subject: [PATCH 22/61] feat(mlflow): add tokenizer options as arguments to mlflow --- scripts/training/oguz/huggingface-multihead/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index fcb5e1c..dd3fe51 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -316,6 +316,9 @@ def _process(text): # log experiment params to MLFlow mlflow.log_params(vars(args)) + # log tokenizer parameters + mlflow.log_params(train_dataset.tokenizer_options) + # set experiment tags mlflow.set_tags({"split": args.split, "iterative": args.iterative}) From f4d6c1ebaabe6c213fce44b583cdad29635c8d70 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 12:41:01 +0200 Subject: [PATCH 23/61] fix(mlflow): access artifacts in the pyfunc context object * Also read labels and groups --- .../training/oguz/huggingface-multihead/wrapper.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/wrapper.py b/scripts/training/oguz/huggingface-multihead/wrapper.py index 7590133..974bf02 100644 --- a/scripts/training/oguz/huggingface-multihead/wrapper.py +++ b/scripts/training/oguz/huggingface-multihead/wrapper.py @@ -28,8 +28,17 @@ def __init__(self, tokenizer, model): super().__init__() def load_context(self, context): + # process labels + with open(context.artifacts["labels"], "r") as f: + self.labels = [line.strip() for line in f.readlines()] + + # process groups + if self.model.iterative: + with open(context.artifacts["groups"], "r") as f: + self.groups = [line.strip() for line in f.readlines()] + # process inference params - with open(context["infer_params"], "r") as f: + with open(context.artifacts["infer_params"], "r") as f: self.infer_params = json.load(f) # sanity checks for dataset params From bf39d2ec790b1bcbb88a055c49b567f0130141cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 12:45:00 +0200 Subject: [PATCH 24/61] refactor(data): update filtering info message --- scripts/training/oguz/huggingface-multihead/data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 1f45848..05d99cf 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -69,7 +69,10 @@ def __init__( for f in filter: pos |= np.array([len(item) > 0 for item in dataframe[f].tolist()], dtype=np.bool) dataframe = dataframe[pos] - self.logger.info(f"Filtered data points with non-empty (or) {','.join(filter)} values") + self.logger.info( + f"Filtered data points with non-empty {','.join(filter)} values" + "(using 'or' if multiple fields)" + ) # prepare tokenizer options self.tokenizer_options = { From 4f6a05024a2c39c3c37612f2e8244eaa06c24c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 15:45:14 +0200 Subject: [PATCH 25/61] refactor(data): make 'tokenizer_max_len' optional * Do not manually setting `add_special_tokens` and `return_token_type_ids` --- .../oguz/huggingface-multihead/data.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 05d99cf..8bb5b50 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -45,7 +45,7 @@ def __init__( flatten: bool = True, online: bool = False, inference: bool = False, - tokenizer_max_len: int = 200, + tokenizer_max_len: Optional[int] = None, ): self.group_names = group_names self.flatten = flatten @@ -77,23 +77,24 @@ def __init__( # prepare tokenizer options self.tokenizer_options = { "truncation": True, - "padding": "max_length", - "add_special_tokens": True, - "return_token_type_ids": "token_type_ids" in tokenizer.model_input_names, - "max_length": min(tokenizer_max_len, tokenizer.model_max_length), + "padding": True, } - if tokenizer.model_max_length < tokenizer_max_len: - self.logger.info( - 
f"Using maximum model length: {tokenizer.model_max_length} instead" - f"of given length: {tokenizer_max_len}" + if tokenizer_max_len: + self.tokenizer_options.update( + { + "padding": "max_length", + "max_length": min(tokenizer_max_len, tokenizer.model_max_length), + } ) + if tokenizer.model_max_length < tokenizer_max_len: + self.logger.info( + f"Using maximum model length: {tokenizer.model_max_length} instead" + f"of given length: {tokenizer_max_len}" + ) if self.online: # ensure that we are in training assert not self.inference, "Online tokenization is only supported in training-time" - - # save data as exceprt - self.data = dataframe[source].tolist() else: # tokenize and save source data self.logger.info("Applying offline tokenization") From 7044ca9326d8f3295d16451ff9aa77bf9ea0a5cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 15:48:29 +0200 Subject: [PATCH 26/61] fix(data): use source length to return dataset length --- scripts/training/oguz/huggingface-multihead/data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 8bb5b50..db7e213 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -92,13 +92,17 @@ def __init__( f"of given length: {tokenizer_max_len}" ) + # save data as exceprt + self.data = dataframe[source].tolist() + self.data_len = len(self.data) + if self.online: # ensure that we are in training assert not self.inference, "Online tokenization is only supported in training-time" else: # tokenize and save source data self.logger.info("Applying offline tokenization") - self.data = tokenizer(dataframe[source].tolist(), **self.tokenizer_options) + self.data = tokenizer(self.data, **self.tokenizer_options) if not self.inference: # apply literal eval to have lists in target @@ -185,7 +189,7 @@ def onehot_decode(self, onehot: Union[np.ndarray, List[np.ndarray]]) -> List[str ] def __len__(self): - return len(self.target) + return self.data_len def __getitem__(self, idx): if self.online: From fae90ac21337dfdf3d06914e3f4929fbf3708383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 15:49:39 +0200 Subject: [PATCH 27/61] fix(data): drop empty rows in preprocessing --- notebooks/models/oguz/transformer_v0.5_1D.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/notebooks/models/oguz/transformer_v0.5_1D.ipynb b/notebooks/models/oguz/transformer_v0.5_1D.ipynb index 82601ca..3f151a9 100644 --- a/notebooks/models/oguz/transformer_v0.5_1D.ipynb +++ b/notebooks/models/oguz/transformer_v0.5_1D.ipynb @@ -470,8 +470,9 @@ " lengths[field] = [len(ds) for ds in arr]\n", " \n", " infer_df = pd.DataFrame.from_dict({\n", - " 'target': [d for ds in arr for d in ds]\n", + " 'excerpt': [d for ds in arr for d in ds]\n", " })\n", + " infer_df = infer_df[~(infer_df['excerpt'].str.len() == 0)]\n", " infer_df.to_csv(f'infer_{field}.csv', header=True, index=True)" ], "outputs": [], From 832492a2ee1f66fa413bf2541223fd3f2d2d34e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 17:55:12 +0200 Subject: [PATCH 28/61] fix(mlflow): cast predictions and probabilities to string --- .../oguz/huggingface-multihead/wrapper.py | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git 
a/scripts/training/oguz/huggingface-multihead/wrapper.py b/scripts/training/oguz/huggingface-multihead/wrapper.py index 974bf02..8fbc46d 100644 --- a/scripts/training/oguz/huggingface-multihead/wrapper.py +++ b/scripts/training/oguz/huggingface-multihead/wrapper.py @@ -12,16 +12,33 @@ from data import MultiHeadDataFrame -def extract_predictions(logits, threshold): - logits = logits > threshold +def extract_predictions(probs, threshold): + """Extracts predictions from probabilities and threshold""" + probs = probs > threshold preds = [] - for i in range(logits.shape[0]): - preds.append(logits[i, np.nonzero(logits[i, :])].tolist()) + for i in range(probs.shape[0]): + preds.append(np.nonzero(probs[i, :])[0].tolist()) return preds +def list2str(items, is_int=False): + """Converts an array of lists of integers into an array of strings""" + + arr = [] + for item in items: + if len(item) == 0: + arr.append("") + elif is_int: + arr.append(",".join([str(i) for i in item])) + else: + arr.append(",".join([f"{i:.3f}" for i in item])) + return arr + + class MLFlowWrapper(mlflow.pyfunc.PythonModel): + """MLFlow Wrapper class for inference""" + def __init__(self, tokenizer, model): self.tokenizer = tokenizer self.model = model.eval() @@ -68,26 +85,25 @@ def predict(self, context, model_input): # forward pass with torch.no_grad(): for batch in dataloader: + for k, v in batch.items(): + batch[k] = v.to("cuda") + if self.model.iterative: batch_groups, batch_targets = self.model.forward( batch, group_threshold=self.infer_params["threshold"]["group"] ) batch_groups = torch.sigmoid(batch_groups) batch_targets = torch.sigmoid(batch_targets) - probs_groups.append(batch_groups.detach().numpy()) + probs_groups.append(batch_groups.detach().cpu().numpy()) else: batch_targets = torch.sigmoid(self.model.forward(batch)) - probs_targets.append(batch_targets.detach().numpy()) + probs_targets.append(batch_targets.detach().cpu().numpy()) probs_targets = np.concatenate(probs_targets, axis=0) preds_targets = extract_predictions(probs_targets, self.infer_params["threshold"]["target"]) output = { - "probabilities_target": [ - ",".join(f"{score:.3f}") - for i in range(probs_targets.shape[0]) - for score in probs_targets[i, :].tolist() - ], - "predictions_target": [",".join(preds) for preds in preds_targets], + "probabilities_target": list2str(probs_targets.tolist()), + "predictions_target": list2str(preds_targets, is_int=True), } if self.model.iterative: @@ -95,14 +111,10 @@ def predict(self, context, model_input): preds_groups = extract_predictions( probs_groups, self.infer_params["threshold"]["group"] ) - output.extend( + output.update( { - "probabilities_group": [ - ",".join(f"{score:.3f}") - for i in range(probs_groups.shape[0]) - for score in probs_groups[i, :].tolist() - ], - "predictions_group": [",".join(preds) for preds in preds_groups], + "probabilities_group": list2str(probs_groups.tolist()), + "predictions_group": list2str(preds_groups, is_int=True), } ) From e1e7d1b86120ca646ca32126ba81278aab8cffd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 17:56:32 +0200 Subject: [PATCH 29/61] fix(sagemaker): use fixed 'source' field by renaming prior to job execution --- .../oguz/huggingface-multihead/infer.py | 22 +++++++++---------- .../huggingface-multihead/sagemaker_infer.py | 8 +++---- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/infer.py b/scripts/training/oguz/huggingface-multihead/infer.py index 
a3bd855..7ae8ff3 100644 --- a/scripts/training/oguz/huggingface-multihead/infer.py +++ b/scripts/training/oguz/huggingface-multihead/infer.py @@ -10,13 +10,7 @@ parser = argparse.ArgumentParser() # hyperparameters sent by the client - parser.add_argument("--eval_batch_size", type=int, default=64) - parser.add_argument( - "--target", - type=str, - default="target", - help="Prediction target", - ) + # parser.add_argument("--eval_batch_size", type=int, default=64) parser.add_argument("--model_uri", type=str, required=True) # SageMaker parameters - data, model, and output directories @@ -38,14 +32,18 @@ # load datasets infer_df = pd.read_pickle(f"{args.test_dir}/infer_df.pickle") logger.info(f" loaded infer_dataset length is: {infer_df.shape}") + logger.info(infer_df.head()) # get model - loaded_model = mlflow.pyfunc.load_model(args.model_uri) - logging.info(loaded_model.infer_params) + pyfunc_wrapper = mlflow.pyfunc.load_model(args.model_uri) + + # set eval batch size + # python_model = pyfunc_wrapper._model_impl.python_model + # logging.info(python_model.infer_params) + # python_model.infer_params["dataloader"]["batch_size"] = args.eval_batch_size - loaded_model.infer_params["dataset"]["target"] = args.target - loaded_model.infer_params["dataloader"]["batch_size"] = args.eval_batch_size - pred_df = loaded_model.predict(infer_df) + # predict + pred_df = pyfunc_wrapper.predict(infer_df) # save predictions pred_df.to_csv(f"{args.output_data_dir}/preds.csv", header=True, index=True) diff --git a/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py b/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py index 4f3f326..7755679 100644 --- a/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py +++ b/scripts/training/oguz/huggingface-multihead/sagemaker_infer.py @@ -21,10 +21,10 @@ default="1D", choices=["1D", "2D"], ) +parser.add_argument("--dataset", type=str, default=None, required=True) +parser.add_argument("--model_uri", type=str, default=None, required=True) +parser.add_argument("--source", type=str, default="excerpt") parser.add_argument("--debug", action="store_true", default=False) -parser.add_argument("--dataset", type=str, default=None) -parser.add_argument("--target", type=str, default="target") -parser.add_argument("--model_uri", type=str, default=None) args, _ = parser.parse_known_args() # create SageMaker session @@ -35,6 +35,7 @@ # load dataset infer_df = pd.read_csv(args.dataset) +infer_df.rename(columns={args.source: "excerpt"}, inplace=True) if args.debug: infer_df = infer_df.sample(n=1000) @@ -49,7 +50,6 @@ # hyperparameters for inference hyperparameters = { "model_uri": args.model_uri, - "target": args.target, } # create SageMaker estimator From c215e38c4985560b7f7382802e5252bf50960a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 5 Aug 2021 19:01:48 +0200 Subject: [PATCH 30/61] chore(params): fix 'epoch' hyperparameter --- scripts/training/oguz/huggingface-multihead/sagemaker_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/sagemaker_train.py b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py index 43753f2..8e24f5e 100644 --- a/scripts/training/oguz/huggingface-multihead/sagemaker_train.py +++ b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py @@ -55,7 +55,7 @@ # hyperparameters for the run hyperparameters = { - "epochs": 1, + "epochs": 10, "model_name": "distilbert-base-uncased", "tracking_uri": MLFLOW_SERVER, 
"experiment_name": f"{args.task}-multihead-transformers", From 7f58e71ac28fb9d76a75c8ad2ddb23764abd8a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 6 Aug 2021 10:29:18 +0200 Subject: [PATCH 31/61] fix(data): apply literal_eval for filtering --- scripts/training/oguz/huggingface-multihead/data.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index db7e213..2980eac 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -67,7 +67,13 @@ def __init__( if filter is not None: pos = np.zeros(len(dataframe), dtype=np.bool) for f in filter: + # apply literal eval to have lists + dataframe[f] = dataframe[f].apply(literal_eval) + + # get positive fields pos |= np.array([len(item) > 0 for item in dataframe[f].tolist()], dtype=np.bool) + + # filter negative rows dataframe = dataframe[pos] self.logger.info( f"Filtered data points with non-empty {','.join(filter)} values" @@ -105,8 +111,9 @@ def __init__( self.data = tokenizer(self.data, **self.tokenizer_options) if not self.inference: - # apply literal eval to have lists in target - dataframe[target] = dataframe[target].apply(literal_eval) + if filter is None or target not in filter: + # apply literal eval to have lists in target + dataframe[target] = dataframe[target].apply(literal_eval) # prepare target encoding all_targets = np.hstack(dataframe[target].to_numpy()) From 0490d450ab3a2610d5c3bba2b186e03a7ab3a135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 6 Aug 2021 15:29:45 +0200 Subject: [PATCH 32/61] feat(modeling): add focal loss-star --- .../oguz/huggingface-multihead/loss.py | 90 +++++++++++++++++++ .../oguz/huggingface-multihead/train.py | 6 +- .../oguz/huggingface-multihead/trainer.py | 19 +++- .../oguz/huggingface-multihead/utils.py | 47 ---------- 4 files changed, 109 insertions(+), 53 deletions(-) create mode 100644 scripts/training/oguz/huggingface-multihead/loss.py diff --git a/scripts/training/oguz/huggingface-multihead/loss.py b/scripts/training/oguz/huggingface-multihead/loss.py new file mode 100644 index 0000000..0a37b25 --- /dev/null +++ b/scripts/training/oguz/huggingface-multihead/loss.py @@ -0,0 +1,90 @@ +import torch +import torch.nn.functional as F + + +def sigmoid_focal_loss( + inputs: torch.Tensor, + targets: torch.Tensor, + alpha: float = 0.25, + gamma: float = 2, + reduction: str = "mean", +): + """ + Original implementation from + https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples or -1 for ignore. Default = 0.25 + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + Returns: + Loss tensor with the reduction option applied. 
+ """ + p = torch.sigmoid(inputs) + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = p * targets + (1 - p) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + + return loss + + +def sigmoid_focal_loss_star( + inputs: torch.Tensor, + targets: torch.Tensor, + alpha: float = -1, + gamma: float = 1, + reduction: str = "none", +) -> torch.Tensor: + """ + Original implementation from + https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py + FL* described in RetinaNet paper Appendix: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Gamma parameter described in FL*. Default = 1 (no weighting). + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + Returns: + Loss tensor with the reduction option applied. + """ + shifted_inputs = gamma * (inputs * (2 * targets - 1)) + loss = -(F.logsigmoid(shifted_inputs)) / gamma + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss *= alpha_t + + if reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + + return loss diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index dd3fe51..5ee1466 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -34,8 +34,8 @@ "--loss", type=str, default="ce", - choices=["ce", "focal"], - help="Loss function: 'ce', 'focal'", + choices=["ce", "focal", "focal_star"], + help="Loss function: 'ce', 'focal', 'focal_star'", ) parser.add_argument( "--target", @@ -255,7 +255,7 @@ def _process(text): compute_metrics=compute_metrics, train_dataset=train_dataset, eval_dataset=test_dataset, - focal_loss=(args.loss == "focal"), + loss_fn=args.loss, ) # set env variable for MLFlow artifact logging diff --git a/scripts/training/oguz/huggingface-multihead/trainer.py b/scripts/training/oguz/huggingface-multihead/trainer.py index 553766c..e7f9847 100644 --- a/scripts/training/oguz/huggingface-multihead/trainer.py +++ b/scripts/training/oguz/huggingface-multihead/trainer.py @@ -13,10 +13,16 @@ TrainingArguments, ) -from utils import sigmoid_focal_loss +from loss import sigmoid_focal_loss, sigmoid_focal_loss_star class MultiHeadTrainer(Trainer): + """HuggingFace Trainer compatible with MultiHeadTransformer models. 
+ + Args: + loss_fn: 'ce', 'focal', 'focal_star' + """ + def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, @@ -29,7 +35,7 @@ def __init__( compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - focal_loss: bool = False, + loss_fn: str = "ce", ): super().__init__( model=model, @@ -43,7 +49,14 @@ def __init__( callbacks=callbacks, optimizers=optimizers, ) - self.loss_fn = sigmoid_focal_loss if focal_loss else torch.nn.BCEWithLogitsLoss() + if loss_fn == "ce": + self.loss_fn = torch.nn.BCEWithLogitsLoss() + elif loss_fn == "focal": + self.loss_fn = sigmoid_focal_loss + elif loss_fn == "focal_star": + self.loss_fn = sigmoid_focal_loss_star + else: + raise "Unknown loss function" def compute_loss(self, model, inputs, return_outputs=False): labels = inputs.pop("labels") diff --git a/scripts/training/oguz/huggingface-multihead/utils.py b/scripts/training/oguz/huggingface-multihead/utils.py index d120b56..6249bac 100644 --- a/scripts/training/oguz/huggingface-multihead/utils.py +++ b/scripts/training/oguz/huggingface-multihead/utils.py @@ -5,7 +5,6 @@ from pathlib import Path import torch -import torch.nn.functional as F import mlflow @@ -91,49 +90,3 @@ def build_mlp( # return network return torch.nn.Sequential(OrderedDict(layers)) - - -def sigmoid_focal_loss( - inputs: torch.Tensor, - targets: torch.Tensor, - alpha: float = 0.25, - gamma: float = 2, - reduction: str = "mean", -): - """ - Original implementation from - https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs - (0 for the negative class and 1 for the positive class). - alpha: (optional) Weighting factor in range (0,1) to balance - positive vs negative examples or -1 for ignore. Default = 0.25 - gamma: Exponent of the modulating factor (1 - p_t) to - balance easy vs hard examples. - reduction: 'none' | 'mean' | 'sum' - 'none': No reduction will be applied to the output. - 'mean': The output will be averaged. - 'sum': The output will be summed. - Returns: - Loss tensor with the reduction option applied. 
- """ - p = torch.sigmoid(inputs) - ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - p_t = p * targets + (1 - p) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - if reduction == "mean": - loss = loss.mean() - elif reduction == "sum": - loss = loss.sum() - - return loss From fc03a59401494203c6809723cd73b7afebf47dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 6 Aug 2021 17:19:56 +0200 Subject: [PATCH 33/61] fix(mlflow): upload full code path --- scripts/training/oguz/huggingface-multihead/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 5ee1466..251318e 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -347,7 +347,7 @@ def _process(text): if args.deploy: # log model with an inference wrapper logging.info("Logging deployment model") - data_file = os.path.join(os.path.dirname(__file__), "data.py") + code_path = os.path.abspath(os.path.dirname(__file__)) mlflow_wrapper = MLFlowWrapper(tokenizer, trainer.model) mlflow.pyfunc.log_model( @@ -356,7 +356,7 @@ def _process(text): registered_model_name="multi-head-transformer", artifacts=artifacts, conda_env=get_conda_env_specs(), - code_path=[__file__, data_file], + code_path=os.listdir(code_path), ) # finish mlflow run From cf99e320a9c44aa0a7f154f9539da9a8fb7c2b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 6 Aug 2021 17:20:11 +0200 Subject: [PATCH 34/61] fix(mlflow): add deployment dependencies --- scripts/training/oguz/huggingface-multihead/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/training/oguz/huggingface-multihead/requirements.txt b/scripts/training/oguz/huggingface-multihead/requirements.txt index 6d4beb3..f650a41 100644 --- a/scripts/training/oguz/huggingface-multihead/requirements.txt +++ b/scripts/training/oguz/huggingface-multihead/requirements.txt @@ -1,2 +1,5 @@ transformers==4.6.1 mlflow==1.18.0 +sagemaker==2.49.1 +s3fs==2021.07.0 +smdebug==1.0.11 From a29e13dd539577d69c7d47f8892856f8bb5579a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 15:48:57 +0200 Subject: [PATCH 35/61] feat(data): compute target statistics --- .../oguz/huggingface-multihead/data.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 2980eac..5b3eff2 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -1,6 +1,6 @@ import logging from ast import literal_eval -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd @@ -47,6 +47,7 @@ def __init__( inference: bool = False, tokenizer_max_len: Optional[int] = None, ): + self.groups = groups self.group_names = group_names self.flatten = flatten self.tokenizer = tokenizer @@ -115,10 +116,6 @@ def __init__( # apply literal eval to have lists in target dataframe[target] = dataframe[target].apply(literal_eval) - # prepare target encoding - all_targets = np.hstack(dataframe[target].to_numpy()) - uniq_targets = 
np.unique(all_targets) - if groups: # process given groups self.group_encoding = {t: idx for idx, group in enumerate(groups) for t in group} @@ -128,9 +125,15 @@ def __init__( self.target_decoding = [revdict(encoding) for encoding in self.target_encoding] self.target_classes = [len(encoding.keys()) for encoding in self.target_encoding] else: + # prepare target encoding + all_targets = np.hstack(dataframe[target].to_numpy()) + uniq_targets = np.unique(all_targets) + # single group encoding - decoding self.group_encoding = {t: 0 for t in uniq_targets} self.group_decoding = {0: uniq_targets} + self.groups = [uniq_targets.tolist()] + self.group_names = ["ALL"] self.target_encoding = {t: idx for idx, t in enumerate(uniq_targets)} self.target_encoding = revdict(self.target_encoding) @@ -146,6 +149,29 @@ def __init__( if groups: self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] + def compute_stats(self) -> Dict[str, int]: + """Computes occurences of each target and group""" + + counts = {} + classes = [target for group in self.groups for target in group] + if self.flatten: + sums = np.sum(np.stack(self.target, axis=-1), axis=-1) + counts.update({c: s for c, s in zip(classes, sums.tolist())}) + else: + for i, _ in enumerate(self.group_names): + targets = [target[i] for target in self.target] + sums = np.sum(np.stack(targets, axis=-1), axis=-1) + counts.update({c: s for c, s in zip(self.groups[i], sums)}) + + for i, group_name in enumerate(self.group_names): + counts.update( + {group_name: np.sum(np.array([counts[group] for group in self.groups[i]]))} + ) + counts.update( + {"ALL": np.sum(np.array([counts[group_name] for group_name in self.group_names]))} + ) + return counts + def group_encode(self, targets: List[str]) -> np.ndarray: """Encodes given targets to group representation""" From 76f2ba2ce3ea1f764e7bae0e6bf0cb6ac0af3c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 15:49:31 +0200 Subject: [PATCH 36/61] feat(data): support pickle, csv and excel dataframes --- .../training/oguz/huggingface-multihead/data.py | 4 ++-- .../training/oguz/huggingface-multihead/utils.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 5b3eff2..460bc17 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -7,7 +7,7 @@ import torch from torch.utils.data import Dataset from transformers import PreTrainedTokenizer -from utils import revdict +from utils import revdict, read_dataframe class MultiHeadDataFrame(Dataset): @@ -58,7 +58,7 @@ def __init__( # read dataframe manually if given as path if isinstance(dataframe, str): self.logger.info(f"Loading dataframe: {dataframe}") - dataframe = pd.read_pickle(dataframe) + dataframe = read_dataframe(dataframe) # cast filter to array if isinstance(filter, str): diff --git a/scripts/training/oguz/huggingface-multihead/utils.py b/scripts/training/oguz/huggingface-multihead/utils.py index 6249bac..8861f31 100644 --- a/scripts/training/oguz/huggingface-multihead/utils.py +++ b/scripts/training/oguz/huggingface-multihead/utils.py @@ -4,6 +4,8 @@ import argparse from pathlib import Path +import pandas as pd + import torch import mlflow @@ -50,6 +52,18 @@ def get_conda_env_specs(): return default_env +def read_dataframe(path: str, **kwargs): + """Reads a Pandas DataFrame respecting the file 
extension""" + + if path.endswith(".pickle"): + return pd.read_pickle(path, **kwargs) + if path.endswith(".csv"): + return pd.read_csv(path, **kwargs) + if path.endswith(".xlsx"): + return pd.read_excel(path, **kwargs) + raise "Unknown data format" + + def build_mlp( depth: int, in_features: int, From d5b35f232d04de486014274a88d9c8a00f5c7b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 16:33:14 +0200 Subject: [PATCH 37/61] feat(modeling): add inverse loss weighting Two types of weighting are applied together: * Across-class balancing: each sigmoidal unit loss is multiplied by inverse frequency * In-class balancing: positive loss is weighted by the in-class frequency w.r.t. negatives --- .../oguz/huggingface-multihead/train.py | 19 +++++++++++++++++-- .../oguz/huggingface-multihead/trainer.py | 14 +++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 251318e..03ac801 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -37,6 +37,9 @@ choices=["ce", "focal", "focal_star"], help="Loss function: 'ce', 'focal', 'focal_star'", ) + parser.add_argument( + "--weighting", type=str, default=None, choices=["inverse", "inverse_square"] + ) parser.add_argument( "--target", type=str, @@ -98,8 +101,7 @@ groups = SUBPILLARS_2D group_names = PILLARS_2D else: - groups = None - group_names = None + raise NotImplementedError # sanity check for iterative option if args.iterative: @@ -248,6 +250,17 @@ def _process(text): save_total_limit=1, ) + # calculate weighting coefficients + loss_weights, loss_pos_weights = None, None + if args.weighting == "square": + classes = [target for group in groups for target in group] + stats = train_dataset.compute_stats() + + loss_weights = [(stats["ALL"] / stats[c]) for c in classes] + loss_pos_weights = [weight - 1 for weight in loss_weights] + if args.weighting == "inverse_square": + raise NotImplementedError + # create trainer instance trainer = MultiHeadTrainer( model=model, @@ -256,6 +269,8 @@ def _process(text): train_dataset=train_dataset, eval_dataset=test_dataset, loss_fn=args.loss, + loss_weights=loss_weights, + loss_pos_weights=loss_pos_weights, ) # set env variable for MLFlow artifact logging diff --git a/scripts/training/oguz/huggingface-multihead/trainer.py b/scripts/training/oguz/huggingface-multihead/trainer.py index e7f9847..2a6e048 100644 --- a/scripts/training/oguz/huggingface-multihead/trainer.py +++ b/scripts/training/oguz/huggingface-multihead/trainer.py @@ -21,6 +21,8 @@ class MultiHeadTrainer(Trainer): Args: loss_fn: 'ce', 'focal', 'focal_star' + loss_weights: weighting applied to different classes + loss_pos_weights: weighting applied to positive versus negative instances """ def __init__( @@ -36,6 +38,8 @@ def __init__( callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), loss_fn: str = "ce", + loss_weights: Optional[List[float]] = None, + loss_pos_weights: Optional[List[float]] = None, ): super().__init__( model=model, @@ -50,10 +54,18 @@ def __init__( optimizers=optimizers, ) if loss_fn == "ce": - self.loss_fn = torch.nn.BCEWithLogitsLoss() + self.loss_fn = torch.nn.BCEWithLogitsLoss( + weight=loss_weights, pos_weight=loss_pos_weights + ) elif loss_fn == "focal": + assert ( + loss_weights is None and 
loss_pos_weights is None + ), "Does not support weighting with focal loss" self.loss_fn = sigmoid_focal_loss elif loss_fn == "focal_star": + assert ( + loss_weights is None and loss_pos_weights is None + ), "Does not support weighting with focal loss-star" self.loss_fn = sigmoid_focal_loss_star else: raise "Unknown loss function" From 99a3b9b3dd4e50ba6a4d31d638eee9b0c39f857f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 16:54:40 +0200 Subject: [PATCH 38/61] feat(data): support `sectors` as a 1-head classification task --- .../training/oguz/huggingface-multihead/data.py | 14 ++++++++++++-- .../training/oguz/huggingface-multihead/train.py | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 460bc17..3506026 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -47,8 +47,6 @@ def __init__( inference: bool = False, tokenizer_max_len: Optional[int] = None, ): - self.groups = groups - self.group_names = group_names self.flatten = flatten self.tokenizer = tokenizer self.online = online @@ -111,6 +109,18 @@ def __init__( self.logger.info("Applying offline tokenization") self.data = tokenizer(self.data, **self.tokenizer_options) + # process groups + if groups is not None: + if group_names is not None: + assert len(groups) == len( + group_names + ), "Group names should be at equal length with groups" + else: + group_names = [f"Group {i}" for i in range(len(groups))] + + self.groups = groups + self.group_names = group_names + if not self.inference: if filter is None or target not in filter: # apply literal eval to have lists in target diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 03ac801..cdb66f9 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -101,7 +101,8 @@ groups = SUBPILLARS_2D group_names = PILLARS_2D else: - raise NotImplementedError + groups = [SECTORS] + group_names = ["Sectors"] # sanity check for iterative option if args.iterative: From d7286f4f306af1dc328656e746a6c320de9bff29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 16:55:53 +0200 Subject: [PATCH 39/61] docs(modeling): add documentation to multi task transformer --- .../oguz/huggingface-multihead/model.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index fd6397d..c7084b0 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -8,6 +8,29 @@ class MultiHeadTransformer(torch.nn.Module): + """Multi-task MLP classifier using the same transformer backbone. + + Args: + backbone: Pre-trained transformer. + num_heads: Number of classification tasks. + num_classes: List of number of classes in each task. + num_layers: Depth of MLP classfier heads. + dropout: Rate of dropout in tranformer output before MLP classifiers. + pooling: If true, classifiers use averaged representations of all symbols. + If false, classifiers use representation of the start symbol. + freeze_backbone: Only train classifiers with backbone. 
+ iterative: Adds an additional classification head for coarser _group_ task. + Only relevant if the task involves (coarse, fine-grained) labels. + If enabled, an additional classifier is first used to predict the coarse + label and the other heads predict the coarse label. The coarse classifier + acts as a filter, i.e., if a negative prediction occurs for a coarse label, + all predictions for labels in that group are set to high negative values. + use_gt_training: uses ground truth group values in the training + Only relevant if iterative is set to True. + backbone_dim: dimension of the backbone transformer + Set if the dimension is not accessible through the config of backbone. + """ + def __init__( self, backbone: PreTrainedModel, From 44c9634936b41247baf9d377643441855f7e5db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 16:56:39 +0200 Subject: [PATCH 40/61] fix(modeling): return zero if coarse group is predicted as negative Note the model returns pre-sigmoid logits rather than probabilites. --- scripts/training/oguz/huggingface-multihead/model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index c7084b0..a2b5aff 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -6,6 +6,8 @@ from utils import build_mlp +ZERO_SIGMOID_INVERSE = -10 + class MultiHeadTransformer(torch.nn.Module): """Multi-task MLP classifier using the same transformer backbone. @@ -101,7 +103,7 @@ def forward(self, inputs, gt_groups=None, group_threshold=0.5): # execute super-classification task out_groups = self.heads[0](hidden) - # get sample groups + # get group predictions # TODO: dynamic threshold (per group?) groups = ( gt_groups @@ -116,8 +118,9 @@ def forward(self, inputs, gt_groups=None, group_threshold=0.5): out_targets.append( torch.where( torch.repeat_interleave(groups[:, i : i + 1], out_target.shape[1], dim=1), - out_target, - torch.zeros_like(out_target), + out_target, # classifer output if group is predicted as `positive` + torch.zeros_like(out_target) + + ZERO_SIGMOID_INVERSE, # zero if group is predicted as `negative` ) ) out_targets = torch.cat(out_targets, axis=-1) From eb36bdc31ab814a734c1b746600ad7b21a1a0d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 9 Aug 2021 16:33:14 +0200 Subject: [PATCH 41/61] feat(modeling): add inverse loss weighting Two types of weighting are applied together: * Across-class balancing: each sigmoidal unit loss is multiplied by inverse frequency * In-class balancing: positive loss is weighted by the in-class frequency w.r.t. 
negatives --- scripts/training/oguz/huggingface-multihead/train.py | 2 +- scripts/training/oguz/huggingface-multihead/trainer.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index cdb66f9..2861967 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -253,7 +253,7 @@ def _process(text): # calculate weighting coefficients loss_weights, loss_pos_weights = None, None - if args.weighting == "square": + if args.weighting == "inverse": classes = [target for group in groups for target in group] stats = train_dataset.compute_stats() diff --git a/scripts/training/oguz/huggingface-multihead/trainer.py b/scripts/training/oguz/huggingface-multihead/trainer.py index 2a6e048..675359f 100644 --- a/scripts/training/oguz/huggingface-multihead/trainer.py +++ b/scripts/training/oguz/huggingface-multihead/trainer.py @@ -54,6 +54,11 @@ def __init__( optimizers=optimizers, ) if loss_fn == "ce": + if loss_weights: + loss_weights = torch.FloatTensor(loss_weights).to('cuda') + if loss_pos_weights: + loss_pos_weights = torch.FloatTensor(loss_pos_weights).to('cuda') + self.loss_fn = torch.nn.BCEWithLogitsLoss( weight=loss_weights, pos_weight=loss_pos_weights ) From 51b0bb5da882aa9db2fb886fd0d13d53dbd053b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 10 Aug 2021 15:12:17 +0200 Subject: [PATCH 42/61] feat(mlflow): update model output format * self-explanatory outputs * two different output formats: flatten vs nested * adjust output style through infer config file --- .../oguz/huggingface-multihead/train.py | 13 +++ .../oguz/huggingface-multihead/wrapper.py | 104 ++++++++++++------ 2 files changed, 85 insertions(+), 32 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 2861967..3a634ef 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -97,12 +97,15 @@ if args.target == "subpillars_1d": groups = SUBPILLARS_1D group_names = PILLARS_1D + target_group = "pillars_1d" elif args.target == "subpillars" or args.target == "subpillars_2d": groups = SUBPILLARS_2D group_names = PILLARS_2D + target_group = "pillars_2d" else: groups = [SECTORS] group_names = ["Sectors"] + target_group = None # sanity check for iterative option if args.iterative: @@ -329,6 +332,15 @@ def _process(text): writer.write(f"{label}\n") artifacts.update({"groups": group_file}) + # output target names artifact + logging.info("Logging target names") + target_file = os.path.join(args.output_data_dir, "targets.txt") + with open(target_file, "w") as writer: + writer.write(f"{args.target}\n") + if target_group: + writer.write(f"{target_group}\n") + artifacts.update({"targets": target_file}) + # log experiment params to MLFlow mlflow.log_params(vars(args)) @@ -355,6 +367,7 @@ def _process(text): "num_workers": 0, }, "threshold": {"group": 0.5, "target": 0.5}, + "output": {"probs_only": True, "flatten": True}, }, writer, ) diff --git a/scripts/training/oguz/huggingface-multihead/wrapper.py b/scripts/training/oguz/huggingface-multihead/wrapper.py index 8fbc46d..60b2fad 100644 --- a/scripts/training/oguz/huggingface-multihead/wrapper.py +++ b/scripts/training/oguz/huggingface-multihead/wrapper.py @@ -1,9 +1,5 @@ -import sys import json -sys.path.append(".") - 
-import pandas as pd import numpy as np import torch import mlflow @@ -12,7 +8,7 @@ from data import MultiHeadDataFrame -def extract_predictions(probs, threshold): +def _extract_predictions(probs, threshold): """Extracts predictions from probabilities and threshold""" probs = probs > threshold @@ -22,20 +18,6 @@ def extract_predictions(probs, threshold): return preds -def list2str(items, is_int=False): - """Converts an array of lists of integers into an array of strings""" - - arr = [] - for item in items: - if len(item) == 0: - arr.append("") - elif is_int: - arr.append(",".join([str(i) for i in item])) - else: - arr.append(",".join([f"{i:.3f}" for i in item])) - return arr - - class MLFlowWrapper(mlflow.pyfunc.PythonModel): """MLFlow Wrapper class for inference""" @@ -54,6 +36,10 @@ def load_context(self, context): with open(context.artifacts["groups"], "r") as f: self.groups = [line.strip() for line in f.readlines()] + # process target name + with open(context.artifacts["targets"], "r") as f: + self.targets = [line.strip() for line in f.readlines()] + # process inference params with open(context.artifacts["infer_params"], "r") as f: self.infer_params = json.load(f) @@ -66,6 +52,12 @@ def load_context(self, context): assert dataset_params["inference"], "Can only use an inference dataset!" dataset_params.pop("inference") + # sanity check for output params + output_params = self.infer_params["output"] + assert ( + output_params["flatten"] and output_params["probs_only"] + ), "Flattened output is only supported when preds_only is enabled" + def predict(self, context, model_input): # get dataset and data loader dataset = MultiHeadDataFrame( @@ -100,22 +92,70 @@ def predict(self, context, model_input): probs_targets.append(batch_targets.detach().cpu().numpy()) probs_targets = np.concatenate(probs_targets, axis=0) - preds_targets = extract_predictions(probs_targets, self.infer_params["threshold"]["target"]) - output = { - "probabilities_target": list2str(probs_targets.tolist()), - "predictions_target": list2str(preds_targets, is_int=True), - } + preds_targets = _extract_predictions( + probs_targets, self.infer_params["threshold"]["target"] + ) + + if self.infer_params["output"]["flatten"]: + # prepare flattened output + output = [ + {self.labels[j]: probs_targets[i, j] for j in range(probs_targets.shape[1])} + for i in range(probs_targets.shape[0]) + ] + else: + # put probabilities inside a nested dictionary + output = [ + { + "probabilities": { + self.targets[0]: { + self.labels[j]: probs_targets[i, j] + for j in range(probs_targets.shape[1]) + } + } + } + for i in range(probs_targets.shape[0]) + ] + + if not self.infer_params["output"]["probs_only"]: + # append predictions + output = [ + out.update({"predictions": {self.targets[0]: preds_targets[i]}}) + for i, out in enumerate(output) + ] if self.model.iterative: probs_groups = np.concatenate(probs_groups, axis=0) - preds_groups = extract_predictions( + preds_groups = _extract_predictions( probs_groups, self.infer_params["threshold"]["group"] ) - output.update( - { - "probabilities_group": list2str(probs_groups.tolist()), - "predictions_group": list2str(preds_groups, is_int=True), - } - ) - return pd.DataFrame.from_dict(output) + if self.infer_params["output"]["flatten"]: + # append group preds in a flattened format + output = [ + out.update( + {self.groups[j]: probs_groups[i, j] for j in range(probs_groups.shape[1])} + ) + for i, out in enumerate(output) + ] + else: + # update probabilities field for the group output + output = [ + 
out["probabilities"].update( + { + self.targets[1]: { + self.groups[j]: probs_groups[i, j] + for j in range(probs_groups.shape[1]) + } + } + ) + for i, out in enumerate(output) + ] + + if not self.infer_params["output"]["probs_only"]: + # append group predictions + output = [ + out["predictions"].update({self.targets[1]: preds_groups[i]}) + for i, out in enumerate(output) + ] + + return output From 0cbfd3c6fab01dea842b34c9d2a93d048bc95867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 10 Aug 2021 15:33:41 +0200 Subject: [PATCH 43/61] refactor(data): modularize dataset into text and target datasets * text dataset only deals with data source and tokenization * target dataset only deals with encoding/decoding of targets and groups --- .../oguz/huggingface-multihead/data.py | 292 ++++++++++++------ 1 file changed, 196 insertions(+), 96 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 3506026..249298d 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -10,26 +10,14 @@ from utils import revdict, read_dataframe -class MultiHeadDataFrame(Dataset): - """Creates a PyTorch dataset out of a Pandas DataFrame +class TextDataFrame(Dataset): + """Creates a PyTorch dataset out of a text field inside a Pandas DataFrame. Args: - dataframe: path to a DataFrame or directly a DataFrame + dataframe: path to a DataFrame or a DataFrame tokenizer: tokenizer to pre-process source text source: textual source field that will be the input of models - target: target classification field that will be the output of models - groups: transforms target into a multi-target (each multi-label) - that is, each sample is associated with 2D one-hot target matrix - group_names: name assoaciated with each classification head - filter: (None, str, List of strings) filter dataset according - to given group. `None` uses all of the data points. Single str key - uses all data points with at least one target key value. If a list of - strings is given, each key is used to check positivity of the sample, - e.g., ['sector', 'pillar2d'] checks whether the data point has at - least one target in `sector` or in `pillar2d` fields. 
- flatten: flatten group targets to 1D for convenience online: online or offline tokenization - inference: if True, does not process target or groups tokenizer_max_len: maximum output length for the tokenizer """ @@ -38,19 +26,11 @@ def __init__( dataframe: Union[str, pd.DataFrame], tokenizer: PreTrainedTokenizer, source: str = "excerpt", - target: str = "target", - groups: Optional[List[List[str]]] = None, - group_names: Optional[List[str]] = None, - filter: Optional[Union[str, List[str]]] = None, - flatten: bool = True, online: bool = False, - inference: bool = False, tokenizer_max_len: Optional[int] = None, ): - self.flatten = flatten self.tokenizer = tokenizer self.online = online - self.inference = inference self.logger = logging.getLogger() # read dataframe manually if given as path @@ -58,27 +38,6 @@ def __init__( self.logger.info(f"Loading dataframe: {dataframe}") dataframe = read_dataframe(dataframe) - # cast filter to array - if isinstance(filter, str): - filter = [filter] - - # filter data frame - if filter is not None: - pos = np.zeros(len(dataframe), dtype=np.bool) - for f in filter: - # apply literal eval to have lists - dataframe[f] = dataframe[f].apply(literal_eval) - - # get positive fields - pos |= np.array([len(item) > 0 for item in dataframe[f].tolist()], dtype=np.bool) - - # filter negative rows - dataframe = dataframe[pos] - self.logger.info( - f"Filtered data points with non-empty {','.join(filter)} values" - "(using 'or' if multiple fields)" - ) - # prepare tokenizer options self.tokenizer_options = { "truncation": True, @@ -101,14 +60,54 @@ def __init__( self.data = dataframe[source].tolist() self.data_len = len(self.data) - if self.online: - # ensure that we are in training - assert not self.inference, "Online tokenization is only supported in training-time" - else: + if not self.online: # tokenize and save source data self.logger.info("Applying offline tokenization") self.data = tokenizer(self.data, **self.tokenizer_options) + def __len__(self): + return self.data_len + + def __getitem__(self, idx): + if self.online: + data = self.tokenizer(self.data[idx : idx + 1], **self.tokenizer_options) + item = {key: torch.tensor(val[0]) for key, val in data.items()} + else: + item = {key: torch.tensor(val[idx]) for key, val in self.data.items()} + + return item + + +class MultiTargetDataFrame(Dataset): + """Creates a PyTorch dataset out of a field containing list of labels + for a multi-label classification problem out of a Pandas DataFrame. 
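+    Labels are one-hot encoded per group; depending on `flatten`, each sample is
+    returned as a single flattened `labels` vector or as one `labels_<group name>`
+    vector per group and, when group information is available, a `groups` vector
+    marking the active groups.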
+ + + Args: + dataframe: path to a DataFrame or a DataFrame + target: target classification field that will be the output of models + groups: transforms target into a multi-target (each multi-label) + that is, each sample is associated with 2D one-hot target matrix + group_names: name assoaciated with each classification head + flatten: flatten group targets to 1D for convenience + """ + + def __init__( + self, + dataframe: Union[str, pd.DataFrame], + target: str = "target", + groups: Optional[List[List[str]]] = None, + group_names: Optional[List[str]] = None, + flatten: bool = True, + ): + self.flatten = flatten + self.logger = logging.getLogger() + + # read dataframe manually if given as path + if isinstance(dataframe, str): + self.logger.info(f"Loading dataframe: {dataframe}") + dataframe = read_dataframe(dataframe) + # process groups if groups is not None: if group_names is not None: @@ -121,43 +120,64 @@ def __init__( self.groups = groups self.group_names = group_names - if not self.inference: - if filter is None or target not in filter: - # apply literal eval to have lists in target - dataframe[target] = dataframe[target].apply(literal_eval) + if not isinstance(dataframe[target].iloc[0], list): + # apply literal eval to have lists in target + dataframe[target] = dataframe[target].apply(literal_eval) - if groups: - # process given groups - self.group_encoding = {t: idx for idx, group in enumerate(groups) for t in group} - self.group_decoding = {idx: group for idx, group in enumerate(groups)} + if groups: + # process given groups + self.group_encoding = {t: idx for idx, group in enumerate(groups) for t in group} + self.group_decoding = {idx: group for idx, group in enumerate(groups)} - self.target_encoding = [{t: idx for idx, t in enumerate(group)} for group in groups] - self.target_decoding = [revdict(encoding) for encoding in self.target_encoding] - self.target_classes = [len(encoding.keys()) for encoding in self.target_encoding] - else: - # prepare target encoding - all_targets = np.hstack(dataframe[target].to_numpy()) - uniq_targets = np.unique(all_targets) + self.target_encoding = [{t: idx for idx, t in enumerate(group)} for group in groups] + self.target_decoding = [revdict(encoding) for encoding in self.target_encoding] + self.target_classes = [len(encoding.keys()) for encoding in self.target_encoding] + else: + # prepare target encoding + all_targets = np.hstack(dataframe[target].to_numpy()) + uniq_targets = np.unique(all_targets) + + # single group encoding - decoding + self.group_encoding = {t: 0 for t in uniq_targets} + self.group_decoding = {0: uniq_targets} + self.groups = [uniq_targets.tolist()] + self.group_names = ["ALL"] + + self.target_encoding = {t: idx for idx, t in enumerate(uniq_targets)} + self.target_encoding = revdict(self.target_encoding) + self.target_classes = [len(self.target_encoding.keys())] + + self.logger.info(f"Using target encodings: {self.target_encoding}") + self.logger.info(f"Target size: [{self.target_classes}]") + + # prepare targets + self.target = [self.onehot_encode(ts) for ts in dataframe[target].tolist()] + self.data_len = len(self.target) - # single group encoding - decoding - self.group_encoding = {t: 0 for t in uniq_targets} - self.group_decoding = {0: uniq_targets} - self.groups = [uniq_targets.tolist()] - self.group_names = ["ALL"] + # prepare group targets + if groups: + self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] - self.target_encoding = {t: idx for idx, t in enumerate(uniq_targets)} - 
self.target_encoding = revdict(self.target_encoding) - self.target_classes = [len(self.target_encoding.keys())] + def __len__(self): + return self.data_len + + def __getitem__(self, idx): + item = {} - self.logger.info(f"Using target encodings: {self.target_encoding}") - self.logger.info(f"Target size: [{self.target_classes}]") + if self.flatten: + item["labels"] = torch.tensor(self.target[idx]) + else: + item.update( + { + f"labels_{self.group_names[i]}": torch.tensor(self.target[idx][i]) + for i in range(len(self.target_classes)) + } + ) - # prepare targets - self.target = [self.onehot_encode(ts) for ts in dataframe[target].tolist()] + if self.group is not None: + item["groups"] = torch.tensor(self.group[idx]) - # prepare group targets - if groups: - self.group = [self.group_encode(ts) for ts in dataframe[target].tolist()] + return item def compute_stats(self) -> Dict[str, int]: """Computes occurences of each target and group""" @@ -231,28 +251,108 @@ def onehot_decode(self, onehot: Union[np.ndarray, List[np.ndarray]]) -> List[str if onehot[i][j] == 1 ] + +class MultiHeadDataFrame(Dataset): + """Creates a PyTorch dataset out of a Pandas DataFrame that supports + multi-head classification tasks where each fine-grained label belongs + to a super-category or a group. + + + Args: + dataframe: path to a DataFrame or a DataFrame + tokenizer: tokenizer to pre-process source text + source: textual source field that will be the input of models + target: target classification field that will be the output of models + groups: transforms target into a multi-target (each multi-label) + that is, each sample is associated with 2D one-hot target matrix + group_names: name assoaciated with each classification head + filter: (None, str, List of strings) filter dataset according + to given group. `None` uses all of the data points. Single str key + uses all data points with at least one target key value. If a list of + strings is given, each key is used to check positivity of the sample, + e.g., ['sector', 'pillar2d'] checks whether the data point has at + least one target in `sector` or in `pillar2d` fields. 
+ flatten: flatten group targets to 1D for convenience + online: online or offline tokenization + inference: if True, does not process target or groups + tokenizer_max_len: maximum output length for the tokenizer + """ + + def __init__( + self, + dataframe: Union[str, pd.DataFrame], + tokenizer: PreTrainedTokenizer, + source: str = "excerpt", + target: str = "target", + groups: Optional[List[List[str]]] = None, + group_names: Optional[List[str]] = None, + filter: Optional[Union[str, List[str]]] = None, + flatten: bool = True, + online: bool = False, + inference: bool = False, + tokenizer_max_len: Optional[int] = None, + ): + self.logger = logging.getLogger() + + if online: + # ensure that we are in training + assert not inference, "Online tokenization is only supported in training-time" + + # read dataframe manually if given as path + if isinstance(dataframe, str): + self.logger.info(f"Loading dataframe: {dataframe}") + dataframe = read_dataframe(dataframe) + + # cast filter to array + if isinstance(filter, str): + filter = [filter] + + # filter data frame + if filter is not None: + pos = np.zeros(len(dataframe), dtype=np.bool) + for f in filter: + # apply literal eval to have lists + dataframe[f] = dataframe[f].apply(literal_eval) + + # get positive fields + pos |= np.array([len(item) > 0 for item in dataframe[f].tolist()], dtype=np.bool) + + # filter negative rows + dataframe = dataframe[pos] + self.logger.info( + f"Filtered data points with non-empty {','.join(filter)} values" + "(using 'or' if multiple fields)" + ) + + # prepare text source data + self.data = TextDataFrame( + dataframe=dataframe, + tokenizer=tokenizer, + source=source, + online=online, + tokenizer_max_len=tokenizer_max_len, + ) + + if not inference: + self.target = MultiTargetDataFrame( + dataframe=dataframe, + target=target, + groups=groups, + group_names=group_names, + flatten=flatten, + ) + assert len(self.data) == len( + self.target + ), "Text source and target have different lengths!" + self.data_len = self.data.data_len + def __len__(self): return self.data_len def __getitem__(self, idx): - if self.online: - data = self.tokenizer(self.data[idx : idx + 1], **self.tokenizer_options) - item = {key: torch.tensor(val[0]) for key, val in data.items()} - else: - item = {key: torch.tensor(val[idx]) for key, val in self.data.items()} - - if not self.inference: - if self.flatten: - item["labels"] = torch.tensor(self.target[idx]) - else: - item.update( - { - f"labels_{self.group_names[i]}": torch.tensor(self.target[idx][i]) - for i in range(len(self.target_classes)) - } - ) + item = self.data[idx] - if self.group is not None: - item["groups"] = torch.tensor(self.group[idx]) + if self.target: + item.update(self.target[idx]) return item From 23f09912a929edc5325f14a6c44c218a1653e28a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Tue, 10 Aug 2021 16:01:48 +0200 Subject: [PATCH 44/61] feat(data): support multi-task learning Usage: * pass a list of targets to denote target columns * pass a list of lists as group_names to denote hierarchy groups for each target * pass a list for each group_name instance to denote targets belonging to that group_name i.e. 
a list of lists of lists * for one-task learning, either provide lists with length-one or provide data without list --- .../oguz/huggingface-multihead/data.py | 80 +++++++++++++------ 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 249298d..6964c0c 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -86,10 +86,12 @@ class MultiTargetDataFrame(Dataset): Args: dataframe: path to a DataFrame or a DataFrame target: target classification field that will be the output of models - groups: transforms target into a multi-target (each multi-label) + groups: transforms target into a multi-target problem (each multi-label) that is, each sample is associated with 2D one-hot target matrix + e.g., 6 label classification with two groups: [A, B, C], [D, E, F] group_names: name assoaciated with each classification head - flatten: flatten group targets to 1D for convenience + e.g., 2 group names: ABC and DEF + flatten: flatten targets to 1D for convenience """ def __init__( @@ -100,14 +102,6 @@ def __init__( group_names: Optional[List[str]] = None, flatten: bool = True, ): - self.flatten = flatten - self.logger = logging.getLogger() - - # read dataframe manually if given as path - if isinstance(dataframe, str): - self.logger.info(f"Loading dataframe: {dataframe}") - dataframe = read_dataframe(dataframe) - # process groups if groups is not None: if group_names is not None: @@ -116,9 +110,18 @@ def __init__( ), "Group names should be at equal length with groups" else: group_names = [f"Group {i}" for i in range(len(groups))] + else: + assert flatten, "Use flatten if no group information is provided" self.groups = groups self.group_names = group_names + self.flatten = flatten + self.logger = logging.getLogger() + + # read dataframe manually if given as path + if isinstance(dataframe, str): + self.logger.info(f"Loading dataframe: {dataframe}") + dataframe = read_dataframe(dataframe) if not isinstance(dataframe[target].iloc[0], list): # apply literal eval to have lists in target @@ -262,7 +265,7 @@ class MultiHeadDataFrame(Dataset): dataframe: path to a DataFrame or a DataFrame tokenizer: tokenizer to pre-process source text source: textual source field that will be the input of models - target: target classification field that will be the output of models + targets: target classification fields that will be the output of models groups: transforms target into a multi-target (each multi-label) that is, each sample is associated with 2D one-hot target matrix group_names: name assoaciated with each classification head @@ -283,9 +286,9 @@ def __init__( dataframe: Union[str, pd.DataFrame], tokenizer: PreTrainedTokenizer, source: str = "excerpt", - target: str = "target", - groups: Optional[List[List[str]]] = None, - group_names: Optional[List[str]] = None, + targets: Union[str, List[str]] = "target", + groups: Optional[Union[List[List[str]], List[List[List[str]]]]] = None, + group_names: Optional[Union[List[str], List[List[str]]]] = None, filter: Optional[Union[str, List[str]]] = None, flatten: bool = True, online: bool = False, @@ -333,17 +336,39 @@ def __init__( tokenizer_max_len=tokenizer_max_len, ) + # prepare targets + if isinstance(targets, str): + assert isinstance(groups, List[List[str]]), "Expecting `groups` to be a list of lists" + assert isinstance(group_names, List[str]), "Expecting `group_names` to be a list" 
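+            # normalize the single-task case into the generic multi-task layout by
+            # wrapping each argument in a length-one list, so that the loop below can
+            # treat single- and multi-task setups uniformly; e.g. (hypothetical values)
+            # targets="sectors", groups=[[...]] becomes targets=["sectors"], groups=[[[...]]]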
+ + targets = [targets] + groups = [groups] + group_names = [group_names] + self.single = True + else: + assert isinstance( + groups, List[List[List[str]]] + ), "Expecting `groups` to be a list of lists of lists" + assert isinstance( + group_names, List[List[str]] + ), "Expecting `group_names` to be a list of lists" + self.single = False + + self.targets = [] if not inference: - self.target = MultiTargetDataFrame( - dataframe=dataframe, - target=target, - groups=groups, - group_names=group_names, - flatten=flatten, - ) - assert len(self.data) == len( - self.target - ), "Text source and target have different lengths!" + for _target, _groups, _group_names in zip(targets, groups, group_names): + self.targets.append( + MultiTargetDataFrame( + dataframe=dataframe, + target=_target, + groups=_groups, + group_names=_group_names, + flatten=flatten, + ) + ) + assert len(self.data) == len( + self.targets[-1] + ), "Text source and target have different lengths!" self.data_len = self.data.data_len def __len__(self): @@ -352,7 +377,10 @@ def __len__(self): def __getitem__(self, idx): item = self.data[idx] - if self.target: - item.update(self.target[idx]) + if self.single: + item.update(self.target[0]) + else: + for i, target in enumerate(self.targets): + item.update({(f"head{i}_" + k): v for k, v in target[idx].items()}) return item From dcbf81e179d7fdf63b02efe79be575093392a168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 9 Sep 2021 09:05:59 +0300 Subject: [PATCH 45/61] refactor(modeling): refactor and document modeling --- .../huggingface-multihead/augmentations.py | 0 .../oguz/huggingface-multihead/data.py | 16 +- .../oguz/huggingface-multihead/evaluation.py | 110 ++++++++++++ .../oguz/huggingface-multihead/model.py | 158 +++++++++++++----- .../huggingface-multihead/sagemaker_train.py | 2 + .../oguz/huggingface-multihead/test.json | 42 +++++ .../oguz/huggingface-multihead/train.py | 18 +- .../oguz/huggingface-multihead/trainer.py | 9 +- 8 files changed, 286 insertions(+), 69 deletions(-) create mode 100644 scripts/training/oguz/huggingface-multihead/augmentations.py create mode 100644 scripts/training/oguz/huggingface-multihead/evaluation.py create mode 100644 scripts/training/oguz/huggingface-multihead/test.json diff --git a/scripts/training/oguz/huggingface-multihead/augmentations.py b/scripts/training/oguz/huggingface-multihead/augmentations.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 6964c0c..3110e96 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -338,20 +338,20 @@ def __init__( # prepare targets if isinstance(targets, str): - assert isinstance(groups, List[List[str]]), "Expecting `groups` to be a list of lists" - assert isinstance(group_names, List[str]), "Expecting `group_names` to be a list" + # assert isinstance(groups, List[List[str]]), "Expecting `groups` to be a list of lists" + # assert isinstance(group_names, List[str]), "Expecting `group_names` to be a list" targets = [targets] groups = [groups] group_names = [group_names] self.single = True else: - assert isinstance( - groups, List[List[List[str]]] - ), "Expecting `groups` to be a list of lists of lists" - assert isinstance( - group_names, List[List[str]] - ), "Expecting `group_names` to be a list of lists" + # assert isinstance( + # groups, List[List[List[str]]] + # ), "Expecting `groups` to 
be a list of lists of lists" + # assert isinstance( + # group_names, List[List[str]] + # ), "Expecting `group_names` to be a list of lists" self.single = False self.targets = [] diff --git a/scripts/training/oguz/huggingface-multihead/evaluation.py b/scripts/training/oguz/huggingface-multihead/evaluation.py new file mode 100644 index 0000000..164cee8 --- /dev/null +++ b/scripts/training/oguz/huggingface-multihead/evaluation.py @@ -0,0 +1,110 @@ +from typing import List + +import numpy as np + +from sklearn.metrics import accuracy_score, precision_recall_fscore_support + + +def _prefix(dic, prefix): + """Adds prefix to dictionary keys""" + + return {(prefix + k): v for k, v in dic.items()} + + +def _process(text): + """Replaces special characters in text (for MLFlow)""" + text = text.lower() + text = text.replace(" ", "_") + text = text.replace(">", "") + text = text.replace("&", "_") + return text + + +# compute metrics given preds and labels +def _compute(preds, labels, average="micro", threshold=0.5): + preds = preds > threshold + precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average) + accuracy = accuracy_score(labels, preds) + return { + "accuracy": accuracy, + "f1": f1, + "precision": precision, + "recall": recall, + } + + +def compute_multitarget_metrics(preds, labels, names: List[str], threshold: float = 0.5): + """Compute metrics for multi-target classification tasks""" + metrics = {} + + # micro evaluation + metrics.update(_prefix(_compute(preds, labels, "micro"), "micro_"), threshold=threshold) + # macro evaluation + metrics.update(_prefix(_compute(preds, labels, "macro"), "macro_"), threshold=threshold) + + # per class evaluation + for idx, name in enumerate(names): + # per class micro evaluation + metrics.update( + _prefix( + _compute(preds[:, idx], labels[:, idx], "binary", threshold=threshold), + f"{_process(name)}_binary_", + ) + ) + + return metrics + + +def compute_multigroup_metrics( + preds, + labels, + groups: List[List[str]], + group_names: List[str], + threshold: float = 0.5, +): + metrics = {} + for idx, group_name in group_names: + metrics.update( + _prefix( + compute_multitarget_metrics( + preds[idx], labels[idx], names=groups[idx], threshold=threshold + ), + _process(group_name), + ) + ) + + preds = np.concatenate(preds, axis=-1) + labels = np.concatenate(labels, axis=-1) + + # micro evaluation + metrics.update(_prefix(_compute(preds, labels, "micro", threshold=threshold), "micro_")) + # macro evaluation + metrics.update(_prefix(_compute(preds, labels, "macro", threshold=threshold), "macro_")) + + return metrics + + +def compute_multitask_metrics( + preds, + labels, + groups: List[List[List[str]]], + group_names: List[List[str]], + task_names: List[str], + threshold: float = 0.5, +): + metrics = {} + for idx, task_name in task_names: + metrics.update( + _prefix( + compute_multigroup_metrics( + preds[idx], + labels[idx], + groups=groups[idx], + group_names=group_names[idx], + threshold=threshold, + ), + _process(task_name), + ) + ) + + return metrics diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index a2b5aff..b745567 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Union import numpy as np import torch @@ -9,18 +9,17 @@ ZERO_SIGMOID_INVERSE = -10 -class MultiHeadTransformer(torch.nn.Module): - 
"""Multi-task MLP classifier using the same transformer backbone. +class MultiTargetHead(torch.nn.Module): + """Multi-target MLP classifier head that is able to handle group structure in + multi-label classifications problems (e.g., 6 label classification with two groups: + [A, B, C], [D, E, F]). + Args: - backbone: Pre-trained transformer. - num_heads: Number of classification tasks. - num_classes: List of number of classes in each task. + num_heads: Number of classification groups. + num_classes: List of number of classes in each group. num_layers: Depth of MLP classfier heads. dropout: Rate of dropout in tranformer output before MLP classifiers. - pooling: If true, classifiers use averaged representations of all symbols. - If false, classifiers use representation of the start symbol. - freeze_backbone: Only train classifiers with backbone. iterative: Adds an additional classification head for coarser _group_ task. Only relevant if the task involves (coarse, fine-grained) labels. If enabled, an additional classifier is first used to predict the coarse @@ -35,38 +34,19 @@ class MultiHeadTransformer(torch.nn.Module): def __init__( self, - backbone: PreTrainedModel, - num_heads: int, - num_classes: List[int], + num_classes: List[int] = [1], num_layers: int = 1, - dropout: float = 0.3, - pooling: bool = False, - freeze_backbone: bool = False, iterative: bool = False, use_gt_training: bool = True, backbone_dim: Optional[int] = None, ): super().__init__() - self.pooling = pooling - self.iterative = iterative self.use_gt_training = use_gt_training - - self.backbone = backbone - self.backbone.config.problem_type = "multi_label_classification" - self.backbone.trainable = not freeze_backbone - - if not hasattr(self.backbone.config, "dim"): - assert backbone_dim is not None, "Model config does not include output dim!" - dim = backbone_dim - else: - dim = self.backbone.config.dim - - self.dropout = torch.nn.Dropout(dropout) self.heads = torch.nn.ModuleList() mlp_params = { "depth": num_layers, - "in_features": dim, + "in_features": backbone_dim, "bias": True, "batchnorm": False, "final_norm": False, @@ -75,33 +55,25 @@ def __init__( if iterative: self.heads.append( build_mlp( - middle_features=np.floor(np.sqrt(len(num_classes) * dim)).astype(int), + middle_features=np.floor(np.sqrt(len(num_classes) * backbone_dim)).astype(int), out_features=len(num_classes), **mlp_params ) ) - for i in range(num_heads): + for num_cls in num_classes: self.heads.append( build_mlp( - middle_features=np.floor(np.sqrt(num_classes[i] * dim)).astype(int), - out_features=num_classes[i], + middle_features=np.floor(np.sqrt(num_cls * backbone_dim)).astype(int), + out_features=num_cls, **mlp_params ) ) def forward(self, inputs, gt_groups=None, group_threshold=0.5): - # get hidden representation - backbone_outputs = self.backbone(**inputs) - if self.pooling: - last_hidden_states = torch.mean(backbone_outputs.last_hidden_state, axis=1) - else: - last_hidden_states = backbone_outputs.last_hidden_state[:, 0, :] - hidden = self.dropout(last_hidden_states) - if self.iterative: # execute super-classification task - out_groups = self.heads[0](hidden) + out_groups = self.heads[0](inputs) # get group predictions # TODO: dynamic threshold (per group?) 
@@ -114,7 +86,7 @@ def forward(self, inputs, gt_groups=None, group_threshold=0.5): # execute each classification task out_targets = [] for i, head in enumerate(self.heads[1:]): - out_target = head(hidden) + out_target = head(inputs) out_targets.append( torch.where( torch.repeat_interleave(groups[:, i : i + 1], out_target.shape[1], dim=1), @@ -128,7 +100,103 @@ def forward(self, inputs, gt_groups=None, group_threshold=0.5): # execute each classification task out_targets = [] for head in self.heads: - out_targets.append(head(hidden)) + out_targets.append(head(inputs)) out_targets = torch.cat(out_targets, axis=-1) return (out_groups, out_targets) if self.iterative else out_targets + + +class MultiHeadTransformer(torch.nn.Module): + """Multi-task classifier each supporting multi-target groups using the same + transformer backbone. + + Args: + backbone: Pre-trained transformer. + num_classes: List of number of classes in each task. + num_layers: Depth of MLP classfier heads. + dropout: Rate of dropout in tranformer output before MLP classifiers. + pooling: If true, classifiers use averaged representations of all symbols. + If false, classifiers use representation of the start symbol. + freeze_backbone: Only train classifiers with backbone. + iterative: Adds an additional classification head for coarser _group_ task. + Only relevant if the task involves (coarse, fine-grained) labels. + If enabled, an additional classifier is first used to predict the coarse + label and the other heads predict the coarse label. The coarse classifier + acts as a filter, i.e., if a negative prediction occurs for a coarse label, + all predictions for labels in that group are set to high negative values. + use_gt_training: uses ground truth group values in the training + Only relevant if iterative is set to True. + backbone_dim: dimension of the backbone transformer + Set if the dimension is not accessible through the config of backbone. + """ + + def __init__( + self, + backbone: PreTrainedModel, + num_classes: Optional[Union[List[int], List[List[int]]]], + num_layers: int = 1, + dropout: float = 0.3, + pooling: bool = False, + freeze_backbone: bool = False, + iterative: bool = False, + use_gt_training: bool = True, + backbone_dim: Optional[int] = None, + ): + super().__init__() + self.pooling = pooling + self.iterative = iterative + self.use_gt_training = use_gt_training + + self.backbone = backbone + self.backbone.config.problem_type = "multi_label_classification" + self.backbone.trainable = not freeze_backbone + + if not hasattr(self.backbone.config, "dim"): + assert backbone_dim is not None, "Model config does not include output dim!" 
+ dim = backbone_dim + else: + dim = self.backbone.config.dim + + if isinstance(num_classes[0], int): + num_classes = [num_classes] + + self.dropout = torch.nn.Dropout(dropout) + self.heads.append(torch.nn.ModuleList()) + + for num_cls in num_classes: + self.heads.append( + MultiTargetHead( + num_classes=num_cls, + num_layers=num_layers, + iterative=iterative, + use_gt_training=use_gt_training, + backbone_dim=dim, + ) + ) + + def forward(self, inputs, gt_groups=None, group_threshold=0.5): + # get hidden representation + backbone_outputs = self.backbone(**inputs) + if self.pooling: + last_hidden_states = torch.mean(backbone_outputs.last_hidden_state, axis=1) + else: + last_hidden_states = backbone_outputs.last_hidden_state[:, 0, :] + hidden = self.dropout(last_hidden_states) + + # execute forward-pass for all heads + groups, targets = [], [] + for idx, head in enumerate(self.heads): + if self.iterative: + out_groups, out_targets = head( + hidden, + gt_groups=gt_groups[idx] if isinstance(gt_groups, list) else None, + group_threshold=group_threshold[idx] + if isinstance(group_threshold, list) + else group_threshold, + ) + groups.append(out_groups) + else: + out_targets = head(hidden) + targets.append(out_targets) + + return (groups, targets) if self.iterative else targets diff --git a/scripts/training/oguz/huggingface-multihead/sagemaker_train.py b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py index 8e24f5e..4e8fedb 100644 --- a/scripts/training/oguz/huggingface-multihead/sagemaker_train.py +++ b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py @@ -66,6 +66,8 @@ "num_layers": 1, "split": target_field, "target": target_field, + "weighting": "inverse", + "learning_rate": 5e-5, } # create SageMaker estimator diff --git a/scripts/training/oguz/huggingface-multihead/test.json b/scripts/training/oguz/huggingface-multihead/test.json new file mode 100644 index 0000000..b9cd937 --- /dev/null +++ b/scripts/training/oguz/huggingface-multihead/test.json @@ -0,0 +1,42 @@ +[ + { + "predictions": { + "subpillars_2d": { + "subpillar #1": 0.25, + "subpillar #2": 0.75 + }, + "subpillars_1d": { + "subpillar #1": 0.25, + "subpillar #2": 0.75 + } + }, + "confidence": { + "subpillars_2d": { + "subpillar #1": 1.0, + "subpillar #2": 0.8 + }, + "subpillars_1d": { + "subpillar #1": 1.0, + "subpillar #2": 0.8 + } + } + }, + { + "predictions": { + "subpillars_2d": { + "subpillar #1": 0.25, + "subpillar #2": 0.75 + } + }, + "confidence": { + "subpillars_2d": { + "subpillar #1": 1.0, + "subpillar #2": 0.8 + }, + "subpillars_1d": { + "subpillar #1": 1.0, + "subpillar #2": 0.8 + } + } + } +] diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 3a634ef..fbd9d31 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -8,6 +8,7 @@ import pandas as pd from sklearn.metrics import accuracy_score, precision_recall_fscore_support + from transformers import AutoModel, AutoTokenizer, TrainingArguments from constants import SECTORS, PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D @@ -42,18 +43,9 @@ ) parser.add_argument( "--target", - type=str, + type=str2list, default="subpillars_1d", - choices=[ - "pillars", - "subpillars", - "pillars_1d", - "subpillars_1d", - "pillars_2d", - "subpillars_2d", - "sectors", - ], - help="Prediction target", + help="Prediction targets", ) parser.add_argument("--split", type=str2list, default="subpillars_1d") 
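# Note: `str2list` comes from utils and its implementation is not shown in this
# series; assuming it parses a comma-separated string, a multi-task run would be
# requested with something like the following hypothetical invocation:
#
#     python train.py --target subpillars_1d,subpillars_2d --split subpillars_1d
#
# Patch 53 below then maps each requested target to its (groups, group_names) pair.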
parser.add_argument("--iterative", type=str2bool, default=False) @@ -130,7 +122,7 @@ train_df, tokenizer=tokenizer, source="excerpt", - target=args.target, + targets=args.target, groups=groups, group_names=group_names, filter=args.split, @@ -140,7 +132,7 @@ test_df, tokenizer=tokenizer, source="excerpt", - target=args.target, + targets=args.target, groups=groups, group_names=group_names, filter=args.split, diff --git a/scripts/training/oguz/huggingface-multihead/trainer.py b/scripts/training/oguz/huggingface-multihead/trainer.py index 675359f..b1164a0 100644 --- a/scripts/training/oguz/huggingface-multihead/trainer.py +++ b/scripts/training/oguz/huggingface-multihead/trainer.py @@ -36,7 +36,10 @@ def __init__( model_init: Callable[[], PreTrainedModel] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), loss_fn: str = "ce", loss_weights: Optional[List[float]] = None, loss_pos_weights: Optional[List[float]] = None, @@ -55,9 +58,9 @@ def __init__( ) if loss_fn == "ce": if loss_weights: - loss_weights = torch.FloatTensor(loss_weights).to('cuda') + loss_weights = torch.FloatTensor(loss_weights).to("cuda") if loss_pos_weights: - loss_pos_weights = torch.FloatTensor(loss_pos_weights).to('cuda') + loss_pos_weights = torch.FloatTensor(loss_pos_weights).to("cuda") self.loss_fn = torch.nn.BCEWithLogitsLoss( weight=loss_weights, pos_weight=loss_pos_weights From 24f3cd51a89af9ea3633a32e24b0a1efd1b7d5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 3 Dec 2021 11:32:30 +0100 Subject: [PATCH 46/61] fix(eval): test on all samples --- scripts/training/oguz/huggingface-multihead/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index fbd9d31..d394f6a 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -135,7 +135,7 @@ targets=args.target, groups=groups, group_names=group_names, - filter=args.split, + filter=None, flatten=True, ) From 3e95ce9c028832be65c0c0e523bc873555f1f292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 3 Dec 2021 11:33:36 +0100 Subject: [PATCH 47/61] fix(modeling): do not explicitly pass num_heads to models --- scripts/training/oguz/huggingface-multihead/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index d394f6a..f3dff0c 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -106,7 +106,6 @@ # build classifier model from backbone model = MultiHeadTransformer( backbone, - num_heads=len(groups), num_classes=[len(group) for group in groups], num_layers=args.num_layers, dropout=args.dropout, From 6a62c6da27f00b9b4357d8f7c9eafe1613cbd52b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Fri, 3 Dec 2021 11:34:15 +0100 Subject: [PATCH 48/61] feat(sagemaker): use new data version --- .../huggingface-multihead/sagemaker_train.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git 
a/scripts/training/oguz/huggingface-multihead/sagemaker_train.py b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py index 4e8fedb..88801ca 100644 --- a/scripts/training/oguz/huggingface-multihead/sagemaker_train.py +++ b/scripts/training/oguz/huggingface-multihead/sagemaker_train.py @@ -29,29 +29,25 @@ job_name = f"{args.task}-train-{formatted_time()}" # load dataset -dataset_version = "0.5" if args.task == "1D" else "0.4.4" -target_field = "subpillars_1d" if args.task == "1D" else "subpillars" -train_df = pd.read_csv( - f"data/frameworks_data/data_v{dataset_version}/data_v{dataset_version}_train.csv" -) -val_df = pd.read_csv( - f"data/frameworks_data/data_v{dataset_version}/data_v{dataset_version}_val.csv" -) +dataset_version = "0.7.1" +target_field = "subpillars_1d" if args.task == "1D" else "subpillars_2d" +train_df = pd.read_csv(f"data/frameworks_data/data_v{dataset_version}/train_v{dataset_version}.csv") +test_df = pd.read_csv(f"data/frameworks_data/data_v{dataset_version}/test_v{dataset_version}.csv") # resample if debug if args.debug: train_df = train_df.sample(n=1000) - val_df = val_df.sample(n=1000) + test_df = test_df.sample(n=1000) # upload dataset to s3 input_path = DEV_BUCKET / "training" / "input_data" / job_name # Do not change this train_path = str(input_path / "train_df.pickle") -val_path = str(input_path / "test_df.pickle") +test_path = str(input_path / "test_df.pickle") train_df.to_pickle( train_path, protocol=4 ) # protocol 4 is necessary, since SageMaker uses python 3.6 -val_df.to_pickle(val_path, protocol=4) +test_df.to_pickle(test_path, protocol=4) # hyperparameters for the run hyperparameters = { From a1e142651591774c4db6dada4d0e2648dd0c60da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Sun, 5 Dec 2021 22:28:56 +0100 Subject: [PATCH 49/61] feat(data): use new virtual analysis framework --- .../oguz/huggingface-multihead/constants.py | 104 ++++++++++-------- 1 file changed, 57 insertions(+), 47 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/constants.py b/scripts/training/oguz/huggingface-multihead/constants.py index 58c37c0..25c6fa6 100644 --- a/scripts/training/oguz/huggingface-multihead/constants.py +++ b/scripts/training/oguz/huggingface-multihead/constants.py @@ -1,63 +1,75 @@ +SECTORS = [ + "Agriculture", + "Cross", + "Education", + "Food Security", + "Health", + "Livelihoods", + "Logistics", + "Nutrition", + "Protection", + "Shelter", + "WASH", +] + PILLARS_1D = [ "Context", - "Humanitarian Profile", - "Displacement", + "Shock/Event", "Casualties", + "Displacement", "Humanitarian Access", - "Information", + "Information And Communication", + "Covid-19", ] SUBPILLARS_1D = [ [ - "Context->Security & Stability", "Context->Demography", "Context->Economy", - "Context->Hazard & Threats", - "Context->Politics", - "Context->Overview", - "Context->Key Event", - "Context->Socio Cultural", - "Context->Legal & Policy", "Context->Environment", - "Context->Stakeholders", - "Context->Response gap", + "Context->Security & Stability", + "Context->Socio Cultural", + "Context->Legal & Policy", + "Context->Politics", + "Context->Technological", ], [ - "Humanitarian Profile->Affected Groups", - "Humanitarian Profile->Casualties", - "Humanitarian Profile->Population Movement", + "Shock/Event->Type And Characteristics", + "Shock/Event->Underlying/Aggravating Factors", + "Shock/Event->Hazard & Threats", ], + ["Casualties->Dead", "Casualties->Injured", "Casualties->Missing"], [ - "Displacement->Push/Pull Factors", 
- "Displacement->Type/Numbers", - "Displacement->Local Integration", + "Displacement->Type/Numbers/Movements", + "Displacement->Push Factors", + "Displacement->Pull Factors", "Displacement->Intentions", - "Displacement->Displacement", + "Displacement->Local Integration", ], - ["Casualties->Dead", "Casualties->Injured", "Casualties->Missing"], [ + "Humanitarian Access->Relief To Population", + "Humanitarian Access->Population To Relief", "Humanitarian Access->Physical Constraints", - "Humanitarian Access->Humanitarian Access Gaps", + ( + "Humanitarian Access->Number Of People Facing Humanitarian Access Constraints" + "/Humanitarian Access Gaps" + ), ], [ - "Information->Information Gaps", - "Information->Channels & Means", - "Information->Information Challenges", + "Information And Communication->Information Challenges And Barriers", + "Information And Communication->Communication Means And Preferences", + "Information And Communication->Knowledge And Info Gaps (Pop)", + "Information And Communication->Knowledge And Info Gaps (Hum)", + ], + [ + "Covid-19->Cases", + "Covid-19->Deaths", + "Covid-19->Testing", + "Covid-19->Contact Tracing", + "Covid-19->Hospitalization & Care", + "Covid-19->Vaccination", + "Covid-19->Restriction Measures", ], -] - -SECTORS = [ - "Agriculture", - "Cross", - "Education", - "Food Security", - "Health", - "Livelihoods", - "Logistics", - "Nutrition", - "Protection", - "Shelter", - "WASH", ] PILLARS_2D = [ @@ -65,7 +77,7 @@ "Capacities & Response", "Impact", "Priority Interventions", - "People At Risk", + "At Risk", "Priority Needs", ] @@ -73,21 +85,19 @@ [ "Humanitarian Conditions->Coping Mechanisms", "Humanitarian Conditions->Living Standards", - "Humanitarian Conditions->Number Of People In Need", "Humanitarian Conditions->Physical And Mental Well Being", + "Humanitarian Conditions->Number Of People In Need", ], [ "Capacities & Response->International Response", "Capacities & Response->National Response", - "Capacities & Response->Number Of People Reached", - "Capacities & Response->Response Gaps", + "Capacities & Response->Local Response", + "Capacities & Response->Number Of People Reached/Response Gaps", ], [ "Impact->Driver/Aggravating Factors", "Impact->Impact On People", - "Impact->Impact On People Or Impact On Services", - "Impact->Impact On Services", - "Impact->Impact On Systems And Services", + "Impact->Impact On Systems, Services And Networks", "Impact->Number Of People Affected", ], [ @@ -95,8 +105,8 @@ "Priority Interventions->Expressed By Population", ], [ - "People At Risk->Number Of People At Risk", - "People At Risk->Risk And Vulnerabilities", + "At Risk->Risk And Vulnerabilities", + "At Risk->Number Of People At Risk", ], [ "Priority Needs->Expressed By Humanitarian Staff", From 37c7c673fdafd09f0987b0c255ecc248964f99ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Sun, 5 Dec 2021 23:06:08 +0100 Subject: [PATCH 50/61] feat(dataset): have an option to exclude unwanted target labels --- .../oguz/huggingface-multihead/data.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 3110e96..3a818f0 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -91,6 +91,7 @@ class MultiTargetDataFrame(Dataset): e.g., 6 label classification with two groups: [A, B, C], [D, E, F] group_names: name assoaciated with each 
classification head e.g., 2 group names: ABC and DEF + exclude: omit the given target labels. flatten: flatten targets to 1D for convenience """ @@ -100,6 +101,7 @@ def __init__( target: str = "target", groups: Optional[List[List[str]]] = None, group_names: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, flatten: bool = True, ): # process groups @@ -123,10 +125,17 @@ def __init__( self.logger.info(f"Loading dataframe: {dataframe}") dataframe = read_dataframe(dataframe) + # apply literal eval to have lists in target if not isinstance(dataframe[target].iloc[0], list): - # apply literal eval to have lists in target dataframe[target] = dataframe[target].apply(literal_eval) + # omit the given exclude labels + if exclude: + dataframe[target] = [ + [label for label in labels if label not in exclude] + for labels in dataframe[target].tolist() + ] + if groups: # process given groups self.group_encoding = {t: idx for idx, group in enumerate(groups) for t in group} @@ -275,6 +284,9 @@ class MultiHeadDataFrame(Dataset): strings is given, each key is used to check positivity of the sample, e.g., ['sector', 'pillar2d'] checks whether the data point has at least one target in `sector` or in `pillar2d` fields. + exclude: (None, List of strings, List of List of strings) omit the given + targets. For multi-target classification, expects a list with + elements of lists. flatten: flatten group targets to 1D for convenience online: online or offline tokenization inference: if True, does not process target or groups @@ -289,6 +301,7 @@ def __init__( targets: Union[str, List[str]] = "target", groups: Optional[Union[List[List[str]], List[List[List[str]]]]] = None, group_names: Optional[Union[List[str], List[List[str]]]] = None, + exclude: Optional[List[str]] = None, filter: Optional[Union[str, List[str]]] = None, flatten: bool = True, online: bool = False, @@ -354,15 +367,22 @@ def __init__( # ), "Expecting `group_names` to be a list of lists" self.single = False + # prepare omit lists + if exclude is None: + exclude = [None for target in targets] + self.targets = [] if not inference: - for _target, _groups, _group_names in zip(targets, groups, group_names): + for _target, _groups, _group_names, _exclude in zip( + targets, groups, group_names, exclude + ): self.targets.append( MultiTargetDataFrame( dataframe=dataframe, target=_target, groups=_groups, group_names=_group_names, + exclude=_exclude, flatten=flatten, ) ) From 46354e20d481b8b9231584dcbb07d334700a230f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Sun, 5 Dec 2021 23:13:27 +0100 Subject: [PATCH 51/61] fix(modeling): initialize module list correctly --- scripts/training/oguz/huggingface-multihead/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index b745567..517abc4 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -161,7 +161,7 @@ def __init__( num_classes = [num_classes] self.dropout = torch.nn.Dropout(dropout) - self.heads.append(torch.nn.ModuleList()) + self.heads = torch.nn.ModuleList() for num_cls in num_classes: self.heads.append( From ee3e89bd825abee9975b96410e436e0915afccce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Sun, 5 Dec 2021 23:14:00 +0100 Subject: [PATCH 52/61] feat(runner): exclude "NOT_MAPPED" targets --- 
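Note: the `exclude` option wired in below was introduced in patch 50; it simply
drops the listed labels from every row before targets are encoded. A minimal
sketch of that filtering step on made-up rows:

    exclude = ["NOT_MAPPED"]
    rows = [["Context->Economy", "NOT_MAPPED"], ["NOT_MAPPED"]]
    filtered = [[label for label in labels if label not in exclude] for labels in rows]
    # -> [["Context->Economy"], []]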
scripts/training/oguz/huggingface-multihead/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index f3dff0c..3c86620 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -124,6 +124,7 @@ targets=args.target, groups=groups, group_names=group_names, + exclude=["NOT_MAPPED"], filter=args.split, flatten=True, ) @@ -134,6 +135,7 @@ targets=args.target, groups=groups, group_names=group_names, + exclude=["NOT_MAPPED"], filter=None, flatten=True, ) From 0d8c1b1144e2bcd9e9237b6a1a79bc23baf0b2dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 6 Dec 2021 00:57:56 +0100 Subject: [PATCH 53/61] fix(runner): handle multi-task targets correctly --- .../oguz/huggingface-multihead/train.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 3c86620..95775f7 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -86,18 +86,19 @@ backbone = AutoModel.from_pretrained(args.model_name) # get target groups - if args.target == "subpillars_1d": - groups = SUBPILLARS_1D - group_names = PILLARS_1D - target_group = "pillars_1d" - elif args.target == "subpillars" or args.target == "subpillars_2d": - groups = SUBPILLARS_2D - group_names = PILLARS_2D - target_group = "pillars_2d" - else: - groups = [SECTORS] - group_names = ["Sectors"] - target_group = None + groups, group_names = [], [] + for target in args.target: + if target == "subpillars_1d": + groups.append(SUBPILLARS_1D) + group_names.append(PILLARS_1D) + elif target == "subpillars" or target == "subpillars_2d": + groups.append(SUBPILLARS_2D) + group_names.append(PILLARS_2D) + elif target == "sectors": + groups.append([SECTORS]) + group_names.append(["Sectors"]) + else: + raise NotImplementedError # sanity check for iterative option if args.iterative: @@ -330,8 +331,6 @@ def _process(text): target_file = os.path.join(args.output_data_dir, "targets.txt") with open(target_file, "w") as writer: writer.write(f"{args.target}\n") - if target_group: - writer.write(f"{target_group}\n") artifacts.update({"targets": target_file}) # log experiment params to MLFlow From 8eb627bfe77dd2ef88d4d9edc5e9351dc0f5f072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Mon, 6 Dec 2021 00:59:36 +0100 Subject: [PATCH 54/61] refactor(data): better error messages when groups and group_names are not in same length --- scripts/training/oguz/huggingface-multihead/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 3a818f0..893cf4a 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -109,7 +109,7 @@ def __init__( if group_names is not None: assert len(groups) == len( group_names - ), "Group names should be at equal length with groups" + ), f"Group names '{group_names}' should be at equal length with groups '{groups}'" else: group_names = [f"Group {i}" for i in range(len(groups))] else: From 0b6b1d7eea32046cffd0112eb0c35f5bf1ee1680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 
15 Dec 2021 14:30:46 +0300 Subject: [PATCH 55/61] fix(data): compute stats for multi-head dataset Computes stats per-task basis. --- scripts/training/oguz/huggingface-multihead/data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 893cf4a..5ed30a4 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -371,6 +371,7 @@ def __init__( if exclude is None: exclude = [None for target in targets] + self.tasks = targets self.targets = [] if not inference: for _target, _groups, _group_names, _exclude in zip( @@ -404,3 +405,11 @@ def __getitem__(self, idx): item.update({(f"head{i}_" + k): v for k, v in target[idx].items()}) return item + + def compute_stats(self) -> Dict[str, int]: + """Computes occurences of each target and group""" + + counts = {} + for task, target in zip(self.tasks, self.targets): + counts[task] = target.compute_stats() + return counts From 90f4369bd4b073f1b05cd7c53a7ede8d55f0d55d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Wed, 15 Dec 2021 14:41:42 +0300 Subject: [PATCH 56/61] refactor(modeling): rename MultiTargetHead to MultiTargetTransformer --- scripts/training/oguz/huggingface-multihead/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index 517abc4..ce9b746 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -9,7 +9,7 @@ ZERO_SIGMOID_INVERSE = -10 -class MultiTargetHead(torch.nn.Module): +class MultiTargetTransformer(torch.nn.Module): """Multi-target MLP classifier head that is able to handle group structure in multi-label classifications problems (e.g., 6 label classification with two groups: [A, B, C], [D, E, F]). @@ -107,8 +107,8 @@ def forward(self, inputs, gt_groups=None, group_threshold=0.5): class MultiHeadTransformer(torch.nn.Module): - """Multi-task classifier each supporting multi-target groups using the same - transformer backbone. + """Multi-task classifier each supporting multi-target groups (MultiTargetTransformer) + using the same transformer backbone. Args: backbone: Pre-trained transformer. @@ -165,7 +165,7 @@ def __init__( for num_cls in num_classes: self.heads.append( - MultiTargetHead( + MultiTargetTransformer( num_classes=num_cls, num_layers=num_layers, iterative=iterative, From 6e9c7f85e62cfd47b31c03551ea09a239ee95cf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 16 Dec 2021 00:45:34 +0300 Subject: [PATCH 57/61] feat(data): add iterative option for labels --- .../oguz/huggingface-multihead/data.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 5ed30a4..02136c9 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -93,6 +93,7 @@ class MultiTargetDataFrame(Dataset): e.g., 2 group names: ABC and DEF exclude: omit the given target labels. 
flatten: flatten targets to 1D for convenience + iterative: include group targets on top of regular targets """ def __init__( @@ -103,6 +104,7 @@ def __init__( group_names: Optional[List[str]] = None, exclude: Optional[List[str]] = None, flatten: bool = True, + iterative: bool = True, ): # process groups if groups is not None: @@ -118,6 +120,7 @@ def __init__( self.groups = groups self.group_names = group_names self.flatten = flatten + self.iterative = iterative self.logger = logging.getLogger() # read dataframe manually if given as path @@ -145,6 +148,8 @@ def __init__( self.target_decoding = [revdict(encoding) for encoding in self.target_encoding] self.target_classes = [len(encoding.keys()) for encoding in self.target_encoding] else: + assert not self.iterative, "Provide groups if iterative labels are asked" + # prepare target encoding all_targets = np.hstack(dataframe[target].to_numpy()) uniq_targets = np.unique(all_targets) @@ -186,7 +191,7 @@ def __getitem__(self, idx): } ) - if self.group is not None: + if self.iterative: item["groups"] = torch.tensor(self.group[idx]) return item @@ -288,6 +293,7 @@ class MultiHeadDataFrame(Dataset): targets. For multi-target classification, expects a list with elements of lists. flatten: flatten group targets to 1D for convenience + iterative: include group targets on top of regular targets online: online or offline tokenization inference: if True, does not process target or groups tokenizer_max_len: maximum output length for the tokenizer @@ -304,11 +310,14 @@ def __init__( exclude: Optional[List[str]] = None, filter: Optional[Union[str, List[str]]] = None, flatten: bool = True, + iterative: bool = True, online: bool = False, inference: bool = False, tokenizer_max_len: Optional[int] = None, ): self.logger = logging.getLogger() + self.flatten = flatten + self.iterative = iterative if online: # ensure that we are in training @@ -348,6 +357,7 @@ def __init__( online=online, tokenizer_max_len=tokenizer_max_len, ) + self.data_len = self.data.data_len # prepare targets if isinstance(targets, str): @@ -367,9 +377,12 @@ def __init__( # ), "Expecting `group_names` to be a list of lists" self.single = False - # prepare omit lists + # set defaults for groups, group_names and exclude + if groups is None: + groups = [None for _ in targets] + group_names = [None for _ in targets] if exclude is None: - exclude = [None for target in targets] + exclude = [None for _ in targets] self.tasks = targets self.targets = [] @@ -385,12 +398,12 @@ def __init__( group_names=_group_names, exclude=_exclude, flatten=flatten, + iterative=iterative, ) ) assert len(self.data) == len( self.targets[-1] ), "Text source and target have different lengths!" 
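# Note: with the task-prefixed keys introduced later in this patch, a two-task
# item roughly looks like the following sketch (tokenizer field names assumed):
#
#     {
#         "input_ids": ..., "attention_mask": ...,
#         "subpillars_1d_labels": ..., "subpillars_1d_groups": ...,
#         "subpillars_2d_labels": ..., "subpillars_2d_groups": ...,
#     }
#
# MultiHeadTrainer (patch 60) later consumes these via
# inputs.pop(f"{target}_labels") and, when iterative, inputs.pop(f"{target}_groups").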
- self.data_len = self.data.data_len def __len__(self): return self.data_len @@ -399,10 +412,10 @@ def __getitem__(self, idx): item = self.data[idx] if self.single: - item.update(self.target[0]) + item.update(self.targets[0][idx]) else: - for i, target in enumerate(self.targets): - item.update({(f"head{i}_" + k): v for k, v in target[idx].items()}) + for task, target in zip(self.tasks, self.targets): + item.update({(f"{task}_" + k): v for k, v in target[idx].items()}) return item From 84a8fde32e6ff9d8e3ae01f6b2cd282e10d5e87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 16 Dec 2021 01:08:45 +0300 Subject: [PATCH 58/61] feat(eval): add multihead metrics --- .../oguz/huggingface-multihead/evaluation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/evaluation.py b/scripts/training/oguz/huggingface-multihead/evaluation.py index 164cee8..7c16baa 100644 --- a/scripts/training/oguz/huggingface-multihead/evaluation.py +++ b/scripts/training/oguz/huggingface-multihead/evaluation.py @@ -33,7 +33,7 @@ def _compute(preds, labels, average="micro", threshold=0.5): } -def compute_multitarget_metrics(preds, labels, names: List[str], threshold: float = 0.5): +def compute_multiclass_metrics(preds, labels, names: List[str], threshold: float = 0.5): """Compute metrics for multi-target classification tasks""" metrics = {} @@ -55,7 +55,7 @@ def compute_multitarget_metrics(preds, labels, names: List[str], threshold: floa return metrics -def compute_multigroup_metrics( +def compute_multitarget_metrics( preds, labels, groups: List[List[str]], @@ -66,7 +66,7 @@ def compute_multigroup_metrics( for idx, group_name in group_names: metrics.update( _prefix( - compute_multitarget_metrics( + compute_multiclass_metrics( preds[idx], labels[idx], names=groups[idx], threshold=threshold ), _process(group_name), @@ -84,26 +84,26 @@ def compute_multigroup_metrics( return metrics -def compute_multitask_metrics( +def compute_multihead_metrics( preds, labels, groups: List[List[List[str]]], group_names: List[List[str]], - task_names: List[str], + targets: List[str], threshold: float = 0.5, ): metrics = {} - for idx, task_name in task_names: + for idx, target in targets: metrics.update( _prefix( - compute_multigroup_metrics( + compute_multitarget_metrics( preds[idx], labels[idx], groups=groups[idx], group_names=group_names[idx], threshold=threshold, ), - _process(task_name), + _process(target), ) ) From 3f50ca734639e5656da415f42df5ee85df88b4cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 16 Dec 2021 01:12:20 +0300 Subject: [PATCH 59/61] fix(modeling): work in iterative settings --- scripts/training/oguz/huggingface-multihead/model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/training/oguz/huggingface-multihead/model.py b/scripts/training/oguz/huggingface-multihead/model.py index ce9b746..7f64e47 100644 --- a/scripts/training/oguz/huggingface-multihead/model.py +++ b/scripts/training/oguz/huggingface-multihead/model.py @@ -8,6 +8,8 @@ ZERO_SIGMOID_INVERSE = -10 +"""Models here only work with flattened datasets.""" + class MultiTargetTransformer(torch.nn.Module): """Multi-target MLP classifier head that is able to handle group structure in @@ -41,6 +43,8 @@ def __init__( backbone_dim: Optional[int] = None, ): super().__init__() + + self.iterative = iterative self.use_gt_training = use_gt_training self.heads = torch.nn.ModuleList() @@ -52,7 
+56,7 @@ def __init__( "final_norm": False, } - if iterative: + if self.iterative: self.heads.append( build_mlp( middle_features=np.floor(np.sqrt(len(num_classes) * backbone_dim)).astype(int), From 32a5d7f7a2cfa7580318f721c319cd20e6351700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 16 Dec 2021 01:13:15 +0300 Subject: [PATCH 60/61] feat(trainer): multi-task trainer --- .../oguz/huggingface-multihead/trainer.py | 138 +++++++++++++++++- 1 file changed, 135 insertions(+), 3 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/trainer.py b/scripts/training/oguz/huggingface-multihead/trainer.py index b1164a0..5acf260 100644 --- a/scripts/training/oguz/huggingface-multihead/trainer.py +++ b/scripts/training/oguz/huggingface-multihead/trainer.py @@ -15,9 +15,11 @@ from loss import sigmoid_focal_loss, sigmoid_focal_loss_star +"""Trainers here only work with flattened datasets.""" -class MultiHeadTrainer(Trainer): - """HuggingFace Trainer compatible with MultiHeadTransformer models. + +class MultiTargetTrainer(Trainer): + """HuggingFace Trainer compatible with MultiTargetTransformer models. Args: loss_fn: 'ce', 'focal', 'focal_star' @@ -80,9 +82,9 @@ def __init__( def compute_loss(self, model, inputs, return_outputs=False): labels = inputs.pop("labels") - groups = inputs.pop("groups") if model.iterative: + groups = inputs.pop("groups") pred_groups, pred_labels = model(inputs, gt_groups=groups) # calculate group loss @@ -106,3 +108,133 @@ def compute_loss(self, model, inputs, return_outputs=False): logits["logits_group"] = pred_groups return (loss, logits) if return_outputs else loss + + +class MultiHeadTrainer(MultiTargetTrainer): + """HuggingFace Trainer compatible with MultiHeadTransformer models. + + Args: + loss_fn: 'ce', 'focal', 'focal_star' + loss_weights: weighting applied to different classes + loss_pos_weights: weighting applied to positive versus negative instances + tasks: names of the tasks + """ + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + loss_fn: str = "ce", + loss_weights: Optional[Union[List[float], List[List[float]]]] = None, + loss_pos_weights: Optional[Union[List[float], List[List[float]]]] = None, + targets: Optional[Union[str, List[str]]] = "", + ): + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + ) + if loss_fn == "ce": + if loss_weights: + loss_weights = [ + torch.FloatTensor(_loss_weights).to("cuda") for _loss_weights in loss_weights + ] + else: + loss_weights = [None for _ in targets] + if loss_pos_weights: + loss_pos_weights = [ + torch.FloatTensor(_loss_pos_weights).to("cuda") + for _loss_pos_weights in loss_pos_weights + ] + else: + loss_pos_weights = [None for _ in targets] + + self.loss_fn = [ + torch.nn.BCEWithLogitsLoss(weight=_loss_weights, 
pos_weight=_loss_pos_weights) + for (_loss_weights, _loss_pos_weights) in zip(loss_weights, loss_pos_weights) + ] + elif loss_fn == "focal": + assert ( + loss_weights is None and loss_pos_weights is None + ), "Does not support weighting with focal loss" + self.loss_fn = sigmoid_focal_loss + elif loss_fn == "focal_star": + assert ( + loss_weights is None and loss_pos_weights is None + ), "Does not support weighting with focal loss-star" + self.loss_fn = sigmoid_focal_loss_star + else: + raise "Unknown loss function" + + if isinstance(targets, str): + self.targets = [""] + else: + self.targets = targets + + def compute_loss(self, model, inputs, return_outputs=False): + # collect labels + labels = [] + for _target in self.targets: + labels.append(inputs.pop(f"{_target}_labels")) + + # get the model predictions + if model.iterative: + # collect group gts + groups = [] + for target in self.targets: + groups.append(inputs.pop(f"{target}_groups")) + pred_groups, pred_labels = model(inputs, gt_groups=groups) + else: + pred_labels = model(inputs) + + logits = {} + loss = torch.tensor(0) + for idx, _target in enumerate(self.targets): + # get labels, preds and register logits + _labels = labels[idx] + _pred_labels = pred_labels[idx] + logits[f"{_target}_logits"] = _pred_labels + + # calculate label loss + _loss = self.loss_fn[idx]( + _pred_labels.view(-1, _labels.shape[-1]), + _labels.view(-1, _labels.shape[-1]).float(), + ) + + if model.iterative: + # get groups, preds and register logits + _groups = inputs.pop(f"{_target}_groups") + _pred_groups = pred_groups[idx] + logits[f"{_target}_logits_group"] = _pred_groups + + # calculate group loss + _loss_group = self.loss_fn[idx]( + _pred_groups.view(-1, _groups.shape[-1]), + _groups.view(-1, _groups.shape[-1]).float(), + ) + + # add losses together + _loss = _loss + _loss_group + + # sum losses across all tasks + loss = loss + _loss + + return (loss, logits) if return_outputs else loss From 5c53dd7945d79e8223473ef774e197cb302874d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuz=20Kaan=20Y=C3=BCksel?= Date: Thu, 16 Dec 2021 18:23:03 +0300 Subject: [PATCH 61/61] fix(runner): fix evaluation and training logic for multi-task learning --- .../oguz/huggingface-multihead/data.py | 27 +++- .../oguz/huggingface-multihead/evaluation.py | 31 ++-- .../oguz/huggingface-multihead/train.py | 143 +++++++----------- .../oguz/huggingface-multihead/trainer.py | 2 +- 4 files changed, 94 insertions(+), 109 deletions(-) diff --git a/scripts/training/oguz/huggingface-multihead/data.py b/scripts/training/oguz/huggingface-multihead/data.py index 02136c9..c45bdfd 100644 --- a/scripts/training/oguz/huggingface-multihead/data.py +++ b/scripts/training/oguz/huggingface-multihead/data.py @@ -91,7 +91,9 @@ class MultiTargetDataFrame(Dataset): e.g., 6 label classification with two groups: [A, B, C], [D, E, F] group_names: name assoaciated with each classification head e.g., 2 group names: ABC and DEF - exclude: omit the given target labels. + exclude: (None, List of strings, List of List of strings) omit the given + targets. For multi-target classification, expects a list with + elements of lists. 
flatten: flatten targets to 1D for convenience iterative: include group targets on top of regular targets """ @@ -102,7 +104,7 @@ def __init__( target: str = "target", groups: Optional[List[List[str]]] = None, group_names: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, + exclude: Optional[Union[str, List[str]]] = None, flatten: bool = True, iterative: bool = True, ): @@ -134,6 +136,9 @@ def __init__( # omit the given exclude labels if exclude: + if isinstance(exclude, str): + exclude = [exclude] + dataframe[target] = [ [label for label in labels if label not in exclude] for labels in dataframe[target].tolist() @@ -196,6 +201,10 @@ def __getitem__(self, idx): return item + def label_names(self) -> List[str]: + """Return label names""" + return self[0].keys() + def compute_stats(self) -> Dict[str, int]: """Computes occurences of each target and group""" @@ -358,6 +367,7 @@ def __init__( tokenizer_max_len=tokenizer_max_len, ) self.data_len = self.data.data_len + self.tokenizer_options = self.data.tokenizer_options # prepare targets if isinstance(targets, str): @@ -381,22 +391,18 @@ def __init__( if groups is None: groups = [None for _ in targets] group_names = [None for _ in targets] - if exclude is None: - exclude = [None for _ in targets] self.tasks = targets self.targets = [] if not inference: - for _target, _groups, _group_names, _exclude in zip( - targets, groups, group_names, exclude - ): + for _target, _groups, _group_names in zip(targets, groups, group_names): self.targets.append( MultiTargetDataFrame( dataframe=dataframe, target=_target, groups=_groups, group_names=_group_names, - exclude=_exclude, + exclude=exclude, flatten=flatten, iterative=iterative, ) @@ -419,6 +425,11 @@ def __getitem__(self, idx): return item + def label_names(self) -> List[str]: + """Return label names""" + data_keys = self.data[0].keys() + return [key for key in self[0].keys() if key not in data_keys] + def compute_stats(self) -> Dict[str, int]: """Computes occurences of each target and group""" diff --git a/scripts/training/oguz/huggingface-multihead/evaluation.py b/scripts/training/oguz/huggingface-multihead/evaluation.py index 7c16baa..1c1bb35 100644 --- a/scripts/training/oguz/huggingface-multihead/evaluation.py +++ b/scripts/training/oguz/huggingface-multihead/evaluation.py @@ -1,9 +1,9 @@ from typing import List -import numpy as np - from sklearn.metrics import accuracy_score, precision_recall_fscore_support +"""Evaluation logic here only work with flattened datasets.""" + def _prefix(dic, prefix): """Adds prefix to dictionary keys""" @@ -15,8 +15,11 @@ def _process(text): """Replaces special characters in text (for MLFlow)""" text = text.lower() text = text.replace(" ", "_") - text = text.replace(">", "") text = text.replace("&", "_") + text = text.replace(">", "") + text = text.replace(",", "") + text = text.replace("(", "") + text = text.replace(")", "") return text @@ -47,7 +50,9 @@ def compute_multiclass_metrics(preds, labels, names: List[str], threshold: float # per class micro evaluation metrics.update( _prefix( - _compute(preds[:, idx], labels[:, idx], "binary", threshold=threshold), + _compute( + preds[:, idx : idx + 1], labels[:, idx : idx + 1], "binary", threshold=threshold + ), f"{_process(name)}_binary_", ) ) @@ -63,18 +68,20 @@ def compute_multitarget_metrics( threshold: float = 0.5, ): metrics = {} - for idx, group_name in group_names: + start = 0 + for idx, group_name in enumerate(group_names): metrics.update( _prefix( compute_multiclass_metrics( - preds[idx], 
labels[idx], names=groups[idx], threshold=threshold + preds[:, start : start + len(groups[idx])], + labels[:, start : start + len(groups[idx])], + names=groups[idx], + threshold=threshold, ), - _process(group_name), + f"{_process(group_name)}_", ) ) - - preds = np.concatenate(preds, axis=-1) - labels = np.concatenate(labels, axis=-1) + start = start + len(groups[idx]) # micro evaluation metrics.update(_prefix(_compute(preds, labels, "micro", threshold=threshold), "micro_")) @@ -93,7 +100,7 @@ def compute_multihead_metrics( threshold: float = 0.5, ): metrics = {} - for idx, target in targets: + for idx, target in enumerate(targets): metrics.update( _prefix( compute_multitarget_metrics( @@ -103,7 +110,7 @@ def compute_multihead_metrics( group_names=group_names[idx], threshold=threshold, ), - _process(target), + f"{_process(target)}_", ) ) diff --git a/scripts/training/oguz/huggingface-multihead/train.py b/scripts/training/oguz/huggingface-multihead/train.py index 95775f7..992df0d 100644 --- a/scripts/training/oguz/huggingface-multihead/train.py +++ b/scripts/training/oguz/huggingface-multihead/train.py @@ -6,9 +6,6 @@ import mlflow import pandas as pd - -from sklearn.metrics import accuracy_score, precision_recall_fscore_support - from transformers import AutoModel, AutoTokenizer, TrainingArguments from constants import SECTORS, PILLARS_1D, SUBPILLARS_1D, PILLARS_2D, SUBPILLARS_2D @@ -16,6 +13,7 @@ from model import MultiHeadTransformer from wrapper import MLFlowWrapper from trainer import MultiHeadTrainer +from evaluation import compute_multihead_metrics, compute_multitarget_metrics, _prefix from utils import str2bool, str2list, get_conda_env_specs if __name__ == "__main__": @@ -86,12 +84,12 @@ backbone = AutoModel.from_pretrained(args.model_name) # get target groups - groups, group_names = [], [] + targets, groups, group_names = [], [], [] for target in args.target: if target == "subpillars_1d": groups.append(SUBPILLARS_1D) group_names.append(PILLARS_1D) - elif target == "subpillars" or target == "subpillars_2d": + elif target == "subpillars_2d": groups.append(SUBPILLARS_2D) group_names.append(PILLARS_2D) elif target == "sectors": @@ -99,6 +97,7 @@ group_names.append(["Sectors"]) else: raise NotImplementedError + targets.append(target) # sanity check for iterative option if args.iterative: @@ -107,7 +106,7 @@ # build classifier model from backbone model = MultiHeadTransformer( backbone, - num_classes=[len(group) for group in groups], + num_classes=[[len(group) for group in _group] for _group in groups], num_layers=args.num_layers, dropout=args.dropout, pooling=args.pooling, @@ -122,112 +121,70 @@ train_df, tokenizer=tokenizer, source="excerpt", - targets=args.target, + targets=targets, groups=groups, group_names=group_names, exclude=["NOT_MAPPED"], filter=args.split, + iterative=args.iterative, flatten=True, ) test_dataset = MultiHeadDataFrame( test_df, tokenizer=tokenizer, source="excerpt", - targets=args.target, + targets=targets, groups=groups, group_names=group_names, exclude=["NOT_MAPPED"], filter=None, + iterative=args.iterative, flatten=True, ) # compute metrics function for multi-class classification def compute_metrics(pred, threshold=0.5): - # add prefix to dictionary keys - def _prefix(dic, prefix): - return {(prefix + k): v for k, v in dic.items()} - - # compute metrics given preds and labels - def _compute(preds, labels, average="micro"): - preds = preds > threshold - precision, recall, f1, _ = precision_recall_fscore_support( - labels, preds, average=average - ) - accuracy = 
accuracy_score(labels, preds)
-            return {
-                "accuracy": accuracy,
-                "f1": f1,
-                "precision": precision,
-                "recall": recall,
-            }
-
-        # process pillar texts for MLFlow
-        def _process(text):
-            text = text.lower()
-            text = text.replace(" ", "_")
-            text = text.replace(">", "")
-            text = text.replace("&", "_")
-            return text
-
-        metrics = {}
-
+        # read predictions and labels
         if args.iterative:
             # TODO: ensure the ordering is stable
             preds, preds_group = pred.predictions
             labels, labels_group = pred.label_ids
-
-            # group micro evaluation
-            metrics.update(_prefix(_compute(preds_group, labels_group, "micro"), "pillar_micro_"))
-            # group macro evaluation
-            metrics.update(_prefix(_compute(preds_group, labels_group, "macro"), "pillar_macro_"))
-            # per group evaluation
-            for i, pillar in enumerate(group_names):
-                metrics.update(
-                    _prefix(
-                        _compute(preds_group[:, i], labels_group[:, i], "binary"),
-                        f"{_process(pillar)}_binary_",
-                    )
-                )
         else:
-            labels = pred.label_ids
             preds = pred.predictions
+            labels = pred.label_ids
 
-        # micro evaluation
-        metrics.update(_prefix(_compute(preds, labels, "micro"), "subpillar_micro_"))
-        # macro evaluation
-        metrics.update(_prefix(_compute(preds, labels, "macro"), "subpillar_macro_"))
-        # per head evaluation
-        idx = 0
-        for i, pillar in enumerate(group_names):
-            idx_end = idx + len(groups[i])
-
-            # per head micro evaluation
-            metrics.update(
-                _prefix(
-                    _compute(preds[:, idx:idx_end], labels[:, idx:idx_end], "micro"),
-                    f"{_process(pillar)}_micro_",
-                )
-            )
-            # per head macro evaluation
+        if not isinstance(preds, (list, tuple)):
+            preds = [preds]
+            labels = [labels]
+
+        if args.iterative:
+            preds_group = [preds_group]
+            labels_group = [labels_group]
+
+        # compute metrics for preds
+        metrics = compute_multihead_metrics(
+            preds,
+            labels,
+            groups=groups,
+            group_names=group_names,
+            targets=targets,
+            threshold=threshold,
+        )
+
+        # compute metrics for group preds
+        if args.iterative:
             metrics.update(
                 _prefix(
-                    _compute(preds[:, idx:idx_end], labels[:, idx:idx_end], "macro"),
-                    f"{_process(pillar)}_macro_",
-                )
-            )
-
-            # per head target evaluation
-            for j, subpillar in enumerate(groups[i]):
-                metrics.update(
-                    _prefix(
-                        _compute(preds[:, idx + j], labels[:, idx + j], "binary"),
-                        f"{_process(subpillar)}_binary_",
-                    )
-                )
-
-            # update index
-            idx = idx_end
-
+                    compute_multitarget_metrics(
+                        preds_group,
+                        labels_group,
+                        groups=group_names,
+                        group_names=targets,
+                        threshold=threshold,
+                    ),
+                    "group_",
+                )
+            )
         return metrics
 
     # define training args
@@ -238,11 +195,12 @@ def _process(text):
         per_device_eval_batch_size=args.eval_batch_size,
         warmup_steps=args.warmup_steps,
         evaluation_strategy="epoch",
+        save_strategy="epoch",
         logging_dir=f"{args.output_data_dir}/logs",
         learning_rate=float(args.learning_rate),
         skip_memory_metrics=False,
-        label_names=["labels", "groups"] if args.iterative else ["labels"],
-        metric_for_best_model="eval_subpillar_micro_f1",
+        label_names=train_dataset.label_names(),
+        metric_for_best_model="eval_subpillars_1d_micro_f1",
         greater_is_better=True,
         load_best_model_at_end=True,
         save_total_limit=1,
@@ -251,11 +209,19 @@ def _process(text):
 
     # calculate weighting coefficients
     loss_weights, loss_pos_weights = None, None
     if args.weighting == "inverse":
-        classes = [target for group in groups for target in group]
         stats = train_dataset.compute_stats()
-        loss_weights = [(stats["ALL"] / stats[c]) for c in classes]
-        loss_pos_weights = [weight - 1 for weight in loss_weights]
+        classes = [[target for group in _groups for target in group] for _groups in groups]
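+        # inverse weighting: weight = ALL / count per class, pos_weight = weight - 1
+        # (e.g., assuming stats[target]["ALL"] is the total row count and stats[target][c]
+        # the positive count of class c, a class seen in 50 of 1000 rows gets weight 20.0
+        # and pos_weight 19.0, the negative-to-positive ratio used by BCE-style losses)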
+        loss_weights = [
+            [
+                (stats[_target]["ALL"] / stats[_target][c]) if stats[_target][c] else 0
+                for c in _classes
+            ]
+            for _target, _classes in zip(targets, classes)
+        ]
+        loss_pos_weights = [
+            [weight - 1 for weight in _loss_weights] for _loss_weights in loss_weights
+        ]
 
     if args.weighting == "inverse_square":
         raise NotImplementedError
@@ -269,6 +235,7 @@ def _process(text):
         loss_fn=args.loss,
         loss_weights=loss_weights,
         loss_pos_weights=loss_pos_weights,
+        targets=targets,
     )
 
     # set env variable for MLFlow artifact logging
diff --git a/scripts/training/oguz/huggingface-multihead/trainer.py b/scripts/training/oguz/huggingface-multihead/trainer.py
index 5acf260..84d20e3 100644
--- a/scripts/training/oguz/huggingface-multihead/trainer.py
+++ b/scripts/training/oguz/huggingface-multihead/trainer.py
@@ -110,7 +110,7 @@ def compute_loss(self, model, inputs, return_outputs=False):
         return (loss, logits) if return_outputs else loss
 
 
-class MultiHeadTrainer(MultiTargetTrainer):
+class MultiHeadTrainer(Trainer):
     """HuggingFace Trainer compatible with MultiHeadTransformer models.
 
     Args: