From 767ca6a580fae459dd1283433e6970c7d1051f4e Mon Sep 17 00:00:00 2001
From: stolzenp
Date: Tue, 20 Feb 2024 00:09:06 +0100
Subject: [PATCH] implementing knn-based influential dataset approach

---
 src/small_model_training/config.json          |  2 +-
 src/small_model_training/create_inf_subset.py | 64 +++++++++++++++++++
 .../text_classification.py                    |  5 ++
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 src/small_model_training/create_inf_subset.py

diff --git a/src/small_model_training/config.json b/src/small_model_training/config.json
index a807b77..d8f373a 100644
--- a/src/small_model_training/config.json
+++ b/src/small_model_training/config.json
@@ -1,7 +1,7 @@
 {
   "model_name_or_path": "bert-base-uncased",
   "tokenizer_name": "distilbert-base-uncased",
-  "output_dir":"my_awesome_model",
+  "output_dir":"imdb_model",
   "learning_rate":2e-5,
   "per_device_train_batch_size":16,
   "per_device_eval_batch_size":16,
diff --git a/src/small_model_training/create_inf_subset.py b/src/small_model_training/create_inf_subset.py
new file mode 100644
index 0000000..d1357e9
--- /dev/null
+++ b/src/small_model_training/create_inf_subset.py
@@ -0,0 +1,64 @@
+import torch.nn
+import numpy as np
+from transformers import AutoTokenizer
+from transformers import AutoModelForSequenceClassification
+from transformers import DataCollatorWithPadding
+from transformers import Trainer
+from datasets import load_dataset
+import evaluate
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return accuracy.compute(predictions=predictions, references=labels)
+
+accuracy = evaluate.load("accuracy")
+
+model = AutoModelForSequenceClassification.from_pretrained("./imdb_model")
+
+imdb_train = load_dataset("imdb", split="train[:10%]")
+imdb_test = load_dataset("imdb", split="test[:1%]")
+
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+
+def preprocess_function(examples):
+    return tokenizer(examples["text"], truncation=True)
+
+
+tokenized_imdb_train = imdb_train.map(preprocess_function, batched=True)
+tokenized_imdb_test = imdb_test.map(preprocess_function, batched=True)
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+trainer = Trainer(
+    model=model,
+    train_dataset=tokenized_imdb_train,
+    eval_dataset=tokenized_imdb_test,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+outputs = trainer.predict(tokenized_imdb_test)
+
+logits = outputs[0]
+
+logits = torch.from_numpy(logits)
+
+scores = torch.nn.functional.softmax(logits, dim=-1)
+
+first_values = scores[:, 0]
+second_values = scores[:, 1]
+
+distance = first_values-second_values
+
+dist = torch.abs(distance)
+knn_values, knn_indices = dist.topk(5, largest=False)
+
+fewshot_examples = []
+
+for elem in knn_indices:
+    fewshot_examples.append(imdb_test[elem.item()]["text"])
+
+print(fewshot_examples)
diff --git a/src/small_model_training/text_classification.py b/src/small_model_training/text_classification.py
index 9c17e3c..e1127d6 100644
--- a/src/small_model_training/text_classification.py
+++ b/src/small_model_training/text_classification.py
@@ -26,6 +26,8 @@ def preprocess_function(examples):
 
 tokenized_imdb = dataset.map(preprocess_function, batched=True)
 
+print("done tokenizing")
+
 data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 
 id2label = {0: "NEGATIVE", 1: "POSITIVE"}
@@ -46,6 +48,9 @@ def preprocess_function(examples):
 )
 
 trainer.train()
+trainer.evaluate()
+
+model.save_pretrained("./imdb_model")
 
 # TO-DO: calculate influential dataset
 
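
Note (reviewer sketch, not part of the patch): despite the "knn" wording in the
subject line, the new create_inf_subset.py selects the five test examples whose
two softmax probabilities lie closest together, i.e. a smallest-margin /
least-confidence criterion. Below is a minimal, self-contained sketch of that
selection step factored into a helper that could later back the
"# TO-DO: calculate influential dataset" hook in text_classification.py. The
helper name and signature are assumptions, not code from this repository.

    # Illustrative sketch only: smallest-margin selection over binary logits.
    import numpy as np
    import torch


    def select_least_confident(logits: np.ndarray, k: int = 5) -> list[int]:
        """Return indices of the k examples whose two class probabilities
        are closest together, i.e. where the model is least confident."""
        scores = torch.nn.functional.softmax(torch.from_numpy(logits), dim=-1)
        margin = torch.abs(scores[:, 0] - scores[:, 1])
        _, indices = margin.topk(k, largest=False)
        return indices.tolist()


    if __name__ == "__main__":
        # Toy logits for four examples; index 2 has the smallest margin.
        demo_logits = np.array(
            [[2.0, -1.0], [0.3, 0.1], [0.05, 0.04], [-1.5, 1.0]],
            dtype=np.float32,
        )
        print(select_least_confident(demo_logits, k=2))  # -> [2, 1]

The sketch keeps the patch's behaviour (softmax, absolute margin, topk with
largest=False) but takes the logits as an argument so the same selection could
be reused on trainer.predict() output from either script.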