implementing knn-based influential dataset approach
stolzenp committed Feb 19, 2024
1 parent 0a36f6c commit 767ca6a
Showing 3 changed files with 70 additions and 1 deletion.
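In short, the new create_inf_subset.py picks few-shot examples by how close the fine-tuned model's predictions sit to the decision boundary: it softmaxes the two-class logits and keeps the k test examples with the smallest gap between the two class probabilities. Below is a minimal sketch of just that selection criterion; the function name and toy logits are illustrative and not part of the commit:

import torch
import torch.nn.functional as F

def least_certain_indices(logits: torch.Tensor, k: int = 5) -> torch.Tensor:
    # softmax the two-class logits; a small gap between the class
    # probabilities means the example lies close to the decision boundary
    probs = F.softmax(logits, dim=-1)
    gap = torch.abs(probs[:, 0] - probs[:, 1])
    _, indices = gap.topk(k, largest=False)
    return indices

# toy usage: two near-ties among five predictions
toy_logits = torch.tensor([[4.0, -4.0], [0.1, 0.0], [-3.0, 3.0], [0.0, 0.2], [5.0, -5.0]])
print(least_certain_indices(toy_logits, k=2))  # tensor([1, 3])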
2 changes: 1 addition & 1 deletion src/small_model_training/config.json
@@ -1,7 +1,7 @@
{
"model_name_or_path": "bert-base-uncased",
"tokenizer_name": "distilbert-base-uncased",
"output_dir":"my_awesome_model",
"output_dir":"imdb_model",
"learning_rate":2e-5,
"per_device_train_batch_size":16,
"per_device_eval_batch_size":16,
64 changes: 64 additions & 0 deletions src/small_model_training/create_inf_subset.py
@@ -0,0 +1,64 @@
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer
from datasets import load_dataset
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


# load the small model fine-tuned and saved by text_classification.py
model = AutoModelForSequenceClassification.from_pretrained("./imdb_model")

imdb_train = load_dataset("imdb", split="train[:10%]")
imdb_test = load_dataset("imdb", split="test[:1%]")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


tokenized_imdb_train = imdb_train.map(preprocess_function, batched=True)
tokenized_imdb_test = imdb_test.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_imdb_train,
    eval_dataset=tokenized_imdb_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# predict on the test split and convert the logits to class probabilities
outputs = trainer.predict(tokenized_imdb_test)
logits = torch.from_numpy(outputs.predictions)
scores = F.softmax(logits, dim=-1)

# gap between the two class probabilities:
# a small gap means the example lies close to the decision boundary
first_values = scores[:, 0]
second_values = scores[:, 1]
dist = torch.abs(first_values - second_values)

# keep the 5 examples the model is least certain about
knn_values, knn_indices = dist.topk(5, largest=False)

fewshot_examples = []
for elem in knn_indices:
    fewshot_examples.append(imdb_test[elem.item()]["text"])

print(fewshot_examples)
5 changes: 5 additions & 0 deletions src/small_model_training/text_classification.py
@@ -26,6 +26,8 @@ def preprocess_function(examples):

tokenized_imdb = dataset.map(preprocess_function, batched=True)

print("done tokenizing")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
@@ -46,6 +48,9 @@ def preprocess_function(examples):
)

trainer.train()
trainer.evaluate()

model.save_pretrained("./imdb_model")


# TO-DO: calculate influential dataset