From e907646224f067e6ec1d512a7a20ec5a7bac322a Mon Sep 17 00:00:00 2001
From: Min Song
Date: Thu, 23 Jul 2020 09:36:38 +0900
Subject: [PATCH] Add files via upload

---
 examples/bert_en_word_embeddings.py | 131 +++++++++++++++++++
 examples/bert_ko_word_embeddings.py | 196 ++++++++++++++++++++++++++++
 examples/bert_sentiment_trainer.py  | 104 +++++++++++++++
 3 files changed, 431 insertions(+)
 create mode 100644 examples/bert_en_word_embeddings.py
 create mode 100644 examples/bert_ko_word_embeddings.py
 create mode 100644 examples/bert_sentiment_trainer.py

diff --git a/examples/bert_en_word_embeddings.py b/examples/bert_en_word_embeddings.py
new file mode 100644
index 0000000..94b1a91
--- /dev/null
+++ b/examples/bert_en_word_embeddings.py
@@ -0,0 +1,131 @@
+import torch
+from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
+from sklearn.metrics.pairwise import cosine_similarity
+
+# pip install pytorch-pretrained-bert
+# OPTIONAL: if you want more information on what's happening, activate the logger as follows
+import logging
+logging.basicConfig(level=logging.INFO)
+
+import matplotlib.pyplot as plt
+
+# Load pre-trained model tokenizer (vocabulary)
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+# 1. Sentence input:
+# text = "Here is the sentence I want embeddings for."
+text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
+marked_text = "[CLS] " + text + " [SEP]"
+print(marked_text)
+
+# We've imported a BERT-specific tokenizer; let's take a look at the output:
+tokenized_text = tokenizer.tokenize(marked_text)
+print(tokenized_text)
+
+# A small slice of the vocabulary, just to see what it contains
+print(list(tokenizer.vocab.keys())[5000:5020])
+
+# Map the token strings to their vocabulary indices
+indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+
+for tup in zip(tokenized_text, indexed_tokens):
+    print(tup)
+
+# Mark every token as belonging to sentence "1"
+segments_ids = [1] * len(tokenized_text)
+print(segments_ids)
+
+# Convert inputs to PyTorch tensors
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+
+# Load pre-trained model (weights)
+model = BertModel.from_pretrained('bert-base-uncased')
+
+# Put the model in "evaluation" mode, meaning feed-forward operation.
+model.eval()
+
+# Predict the hidden-state features for each layer
+with torch.no_grad():
+    encoded_layers, _ = model(tokens_tensor, segments_tensors)
+
+print("Number of layers:", len(encoded_layers))
+layer_i = 0
+
+print("Number of batches:", len(encoded_layers[layer_i]))
+batch_i = 0
+
+print("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
+token_i = 0
+
+print("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))
+
+# For the 5th token in our sentence, select its feature values from layer 5.
+token_i = 5
+layer_i = 5
+vec = encoded_layers[layer_i][batch_i][token_i]
+
+# Plot the values as a histogram to show their distribution.
+plt.figure(figsize=(10, 10))
+plt.hist(vec, bins=200)
+plt.show()
+
+# Convert the hidden-state embeddings into per-token vectors
+
+# Holds the list of 12 layer embeddings for each token
+# Will have the shape: [# tokens, # layers, # features]
+token_embeddings = []
+
+# For each token in the sentence...
+for token_i in range(len(tokenized_text)):
+
+    # Holds the 12 layers of hidden states for this token
+    hidden_layers = []
+
+    # For each of the 12 layers...
+    for layer_i in range(len(encoded_layers)):
+        # Look up the vector for `token_i` in `layer_i`
+        vec = encoded_layers[layer_i][batch_i][token_i]
+        hidden_layers.append(vec)
+
+    token_embeddings.append(hidden_layers)
+
+print('------------------------------------------------------------')
+
+# Sanity-check the dimensions:
+print("Number of tokens in sequence:", len(token_embeddings))
+print("Number of layers per token:", len(token_embeddings[0]))
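+
+# (Alternative sketch) The same [# tokens x # layers x # features] arrangement can be built
+# without the nested loops by stacking the 12 layer tensors and moving the token dimension to
+# the front.  This mirrors the torch.stack / permute approach used in the multilingual example
+# below; `token_embeddings_t` is an extra variable introduced here only for illustration.
+token_embeddings_t = torch.stack(encoded_layers, dim=0)   # [12 x 1 x # tokens x 768]
+token_embeddings_t = token_embeddings_t.squeeze(1)        # [12 x # tokens x 768]
+token_embeddings_t = token_embeddings_t.permute(1, 0, 2)  # [# tokens x 12 x 768]
+print("Stacked token embeddings:", token_embeddings_t.size())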
+
+concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings]  # [number_of_tokens, 3072]
+
+summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings]  # [number_of_tokens, 768]
+
+# Sentence embedding: average the token vectors of the last layer
+sentence_embedding = torch.mean(encoded_layers[11], 1)
+
+print("Our final sentence embedding vector is of shape:", sentence_embedding[0].shape[0])
+
+print(text)
+for i, x in enumerate(tokenized_text):
+    print(i, x)
+
+print("First fifteen values of 'bank' as in 'bank robber':")
+print(summed_last_4_layers[10][:15])
+
+print("First fifteen values of 'bank' as in 'bank vault':")
+print(summed_last_4_layers[6][:15])
+
+print("First fifteen values of 'bank' as in 'river bank':")
+print(summed_last_4_layers[19][:15])
+
+# Compare "bank" as in "bank robber" to "bank" as in "river bank"
+different_bank = cosine_similarity(summed_last_4_layers[10].reshape(1, -1), summed_last_4_layers[19].reshape(1, -1))[0][0]
+
+# Compare "bank" as in "bank robber" to "bank" as in "bank vault"
+same_bank = cosine_similarity(summed_last_4_layers[10].reshape(1, -1), summed_last_4_layers[6].reshape(1, -1))[0][0]
+
+print("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'bank vault':", same_bank)
+print("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'river bank':", different_bank)
diff --git a/examples/bert_ko_word_embeddings.py b/examples/bert_ko_word_embeddings.py
new file mode 100644
index 0000000..df2f2d9
--- /dev/null
+++ b/examples/bert_ko_word_embeddings.py
@@ -0,0 +1,196 @@
+from transformers import BertTokenizer, BertModel
+
+# OPTIONAL: if you want more information on what's happening, activate the logger as follows
+import logging
+# logging.basicConfig(level=logging.INFO)
+from scipy.spatial.distance import cosine
+import matplotlib.pyplot as plt
+
+# import pytorch_pretrained_bert as ppb
+import torch
+
+# BERT multilingual model
+# https://github.com/google-research/bert/blob/master/multilingual.md
+
+tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
+text = "예로부터 말이 많은 사람은 말로 망하고 밤중에 말 타고 토끼를 데리고 도망치는 경우가 많다."
+marked_text = "[CLS] " + text + " [SEP]"
+
+# Tokenize our sentence with the BERT tokenizer.
+tokenized_text = tokenizer.tokenize(marked_text)
+
+# Print out the tokens.
+print(tokenized_text)
+
+print(list(tokenizer.vocab.keys())[5000:5020])
+
+# Map the token strings to their vocabulary indices.
+indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+
+# Display the tokens with their indices.
+for tup in zip(tokenized_text, indexed_tokens):
+    print('{:<12} {:>6,}'.format(tup[0], tup[1]))
+
+# Segment IDs
+# Mark every token as belonging to sentence "1".
+segments_ids = [1] * len(tokenized_text)
+
+print(segments_ids)
+
+'''
+3. Extracting Embeddings
+3.1. Running BERT on our text
+Next we need to convert our data to torch tensors and call the BERT model.
+The BERT PyTorch interface requires the data to be in torch tensors rather than Python lists,
+so we convert the lists here - this does not change the shape or the data.
+'''
+# Convert inputs to PyTorch tensors
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
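+
+# (Alternative sketch) Recent versions of the `transformers` tokenizer can build these tensors in
+# a single call; exact keyword support depends on the installed version.  `add_special_tokens=True`
+# inserts [CLS]/[SEP] automatically, and the returned `token_type_ids` are all 0 for a single
+# sentence (the manual segment IDs above use 1s, which also works for feature extraction).
+# `enc` is an extra variable introduced only for illustration.
+enc = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
+print(enc['input_ids'].shape, enc['token_type_ids'].shape, enc['attention_mask'].shape)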
+
+device = torch.device("cpu")
+
+# Load pre-trained model (weights)
+model = BertModel.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)
+model.to(device)
+
+# Put the model in "evaluation" mode, meaning feed-forward operation.
+model.eval()
+
+# Run the text through BERT and collect all of the hidden states produced from all 12 layers.
+with torch.no_grad():
+    outputs = model(tokens_tensor, segments_tensors)
+
+    # Evaluating the model will return a different number of objects based on
+    # how it's configured in the `from_pretrained` call earlier. In this case,
+    # because we set `output_hidden_states = True`, the third item will be the
+    # hidden states from all layers. See the documentation for more details:
+    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
+    hidden_states = outputs[2]
+
+print("Number of layers:", len(hidden_states), " (initial embeddings + 12 BERT layers)")
+layer_i = 0
+
+print("Number of batches:", len(hidden_states[layer_i]))
+batch_i = 0
+
+print("Number of tokens:", len(hidden_states[layer_i][batch_i]))
+token_i = 0
+
+print("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
+
+# For the 5th token in our sentence, select its feature values from layer 5.
+token_i = 5
+layer_i = 5
+vec = hidden_states[layer_i][batch_i][token_i]
+
+# Plot the values as a histogram to show their distribution.
+plt.figure(figsize=(10, 10))
+plt.hist(vec, bins=200)
+plt.show()
+
+# `hidden_states` holds one tensor per layer.
+print('      Type of hidden_states: ', type(hidden_states))
+
+# Each layer is a torch tensor.
+print('Tensor shape for each layer: ', hidden_states[0].size())
+
+# Let's combine the layers into one big tensor.
+# Concatenate the tensors for all layers. We use `stack` here to
+# create a new dimension in the tensor.
+token_embeddings = torch.stack(hidden_states, dim=0)
+print(str(token_embeddings.size()))
+
+# Let's get rid of the "batches" dimension since we don't need it.
+# Remove dimension 1, the "batches".
+token_embeddings = torch.squeeze(token_embeddings, dim=1)
+print(str(token_embeddings.size()))
+
+# Finally, we can switch around the "layers" and "tokens" dimensions with permute.
+# Swap dimensions 0 and 1.
+token_embeddings = token_embeddings.permute(1, 0, 2)
+print(str(token_embeddings.size()))
+
+# Stores the token vectors, with shape [# tokens x 3,072]
+token_vecs_cat = []
+
+# `token_embeddings` is a [# tokens x 13 x 768] tensor.
+
+# For each token in the sentence...
+for token in token_embeddings:
+    # `token` is a [13 x 768] tensor
+
+    # Concatenate the vectors (that is, append them together) from the last four layers.
+    # Each layer vector is 768 values, so `cat_vec` has length 3,072.
+    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
+
+    # Use `cat_vec` to represent `token`.
+    token_vecs_cat.append(cat_vec)
+
+print('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))
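+
+# (Optional check) The same concatenation can be done in one tensor operation.  The explicit
+# slice order below matches the loop above (last layer first), and allclose confirms that the
+# two versions agree; `alt_vecs_cat` is an extra variable used only for this check.
+alt_vecs_cat = torch.cat((token_embeddings[:, -1, :], token_embeddings[:, -2, :],
+                          token_embeddings[:, -3, :], token_embeddings[:, -4, :]), dim=1)
+print('Vectorised shape:', alt_vecs_cat.size())
+print('Matches the loop version:', torch.allclose(alt_vecs_cat[0], token_vecs_cat[0]))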
+
+# As an alternative, let's try creating the word vectors by summing together the last four layers.
+# Stores the token vectors, with shape [# tokens x 768]
+token_vecs_sum = []
+
+# `token_embeddings` is a [# tokens x 13 x 768] tensor.
+
+# For each token in the sentence...
+for token in token_embeddings:
+    # `token` is a [13 x 768] tensor
+
+    # Sum the vectors from the last four layers.
+    sum_vec = torch.sum(token[-4:], dim=0)
+
+    # Use `sum_vec` to represent `token`.
+    token_vecs_sum.append(sum_vec)
+
+print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))
+
+# `token_vecs` is a [# tokens x 768] tensor: the second-to-last hidden layer for our sentence.
+token_vecs = hidden_states[-2][0]
+
+# Calculate the average of all token vectors.
+sentence_embedding = torch.mean(token_vecs, dim=0)
+print("Our final sentence embedding vector is of shape:", sentence_embedding.size())
+
+for i, token_str in enumerate(tokenized_text):
+    print(i, token_str)
+
+'''
+The three instances of "말" are at indices 3, 9, and 15.
+
+For this analysis, we'll use the word vectors that we created by summing the last four layers.
+
+We can try printing out their vectors to compare them.
+'''
+print('First 10 vector values for each instance of "말".')
+print('')
+print("말 (language) ", str(token_vecs_sum[3][:10]))
+print("말 (language) ", str(token_vecs_sum[9][:10]))
+print("말 (animal)   ", str(token_vecs_sum[15][:10]))
+
+'''
+We can see that the values differ, but let's calculate the cosine similarity between the vectors
+to make a more precise comparison.
+'''
+
+# Cosine similarity between the two different senses of "말" (language vs. animal)
+diff_word = 1 - cosine(token_vecs_sum[3], token_vecs_sum[15])
+
+# Cosine similarity between the two same-sense instances of "말" (language vs. language)
+same_word = 1 - cosine(token_vecs_sum[3], token_vecs_sum[9])
+
+print('Vector similarity for *similar* meanings: %.2f' % same_word)
+print('Vector similarity for *different* meanings: %.2f' % diff_word)
\ No newline at end of file
diff --git a/examples/bert_sentiment_trainer.py b/examples/bert_sentiment_trainer.py
new file mode 100644
index 0000000..6750858
--- /dev/null
+++ b/examples/bert_sentiment_trainer.py
@@ -0,0 +1,104 @@
+from py_bert.bert_dataset import PYBERTDataset
+from py_bert.bert_classification_model import PYBERTClassifier
+from py_bert.bert_trainer import PYBERTTrainer
+from py_bert.bert_util import create_data_loader, add_sentiment_label, convert_to_df
+from transformers import BertModel, BertTokenizer
+from sklearn.model_selection import train_test_split
+
+from py_bert.tokenization_kobert import KoBertTokenizer
+
+import pyTextMiner as ptm
+import torch
+import numpy as np
+import pandas as pd
+
+# mode is one of 'en', 'kr', 'ko_bert', or 'ko_distilkobert'
+mode = 'ko_distilkobert'
+df = None
+
+if mode == 'en':
+    df = pd.read_csv("../data/reviews.csv")
+    df, class_names = add_sentiment_label(df)
+elif mode == 'kr' or mode.startswith('ko'):
+    mecab_path = 'C:\\mecab\\mecab-ko-dic'
+    stopwords = '../stopwords/stopwordsKor.txt'
+    input_file = '../data/ratings.txt'
+
+    pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
+                            ptm.tokenizer.MeCab(mecab_path),
+                            ptm.lemmatizer.SejongPOSLemmatizer(),
+                            ptm.helper.SelectWordOnly(),
+                            ptm.helper.StopwordFilter(file=stopwords))
+
+    corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, delimiter='\t', doc_index=1, class_index=2)
+
+    documents = []
+    labels = []
+    result = pipeline.processCorpus(corpus)
+    i = 1
+    # The loop below only processes a small sample (documents 1-99) for a quick test
+    for doc in result[1:100]:
+        document = ''
+        for sent in doc:
+            for word in sent:
+                document += word + ' '
+        documents.append(document.strip())
+        labels.append(corpus.pair_map[i])
+        i += 1
+
+    df, class_names = convert_to_df(documents, labels)
+
+print(df.head())
+print(df.info())
+
+RANDOM_SEED = 42
+np.random.seed(RANDOM_SEED)
+torch.manual_seed(RANDOM_SEED)
+
+# We need a better way of setting MAX_LEN than hard-coding it
+MAX_LEN = 160
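+
+# (Optional sketch) Rather than hard-coding MAX_LEN, one way to choose it is to look at the
+# token-length distribution of the corpus and take a high percentile.  The column name 'text'
+# and the multilingual tokenizer used here are assumptions for illustration -- adjust them to
+# whatever convert_to_df()/add_sentiment_label() actually produce and to the tokenizer selected
+# below.
+if 'text' in df.columns:
+    length_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
+    token_lens = [len(length_tokenizer.tokenize(str(t))) + 2 for t in df['text']]  # +2 for [CLS]/[SEP]
+    print('Suggested MAX_LEN (95th percentile):', int(np.percentile(token_lens, 95)))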
+
+# Split into train / validation / test sets
+df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
+df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
+
+print(df_train.shape, df_val.shape, df_test.shape)
+
+tokenizer = None
+bert_model_name = ''
+if mode == 'kr':
+    bert_model_name = 'bert-base-multilingual-cased'
+    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+elif mode == 'en':
+    bert_model_name = 'bert-base-cased'
+    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+elif mode == 'ko_bert':
+    bert_model_name = 'monologg/kobert'
+    tokenizer = KoBertTokenizer.from_pretrained(bert_model_name)
+elif mode == 'ko_distilkobert':
+    bert_model_name = 'monologg/distilkobert'
+    tokenizer = KoBertTokenizer.from_pretrained(bert_model_name)
+
+BATCH_SIZE = 16
+train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
+val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
+test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
+
+# print(str(train_data_loader.dataset.__getitem__(0)))
+data = next(iter(train_data_loader))
+print(data.keys())
+
+print(data['input_ids'].shape)
+print(data['attention_mask'].shape)
+print(data['targets'].shape)
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+model = PYBERTClassifier(len(class_names), bert_model_name)
+model = model.to(device)
+
+num_epochs = 10
+trainer = PYBERTTrainer()
+trainer.train(model, device, train_data_loader, val_data_loader, df_val, df_train, num_epochs=num_epochs)
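+
+# (Optional sketch) A quick accuracy check on the held-out test split after training.  The call
+# model(input_ids, attention_mask) returning class logits is an assumption about PYBERTClassifier;
+# verify against py_bert/bert_classification_model.py before relying on this.
+model.eval()
+correct, total = 0, 0
+with torch.no_grad():
+    for batch in test_data_loader:
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        targets = batch['targets'].to(device)
+        logits = model(input_ids, attention_mask)  # assumed forward signature
+        preds = torch.argmax(logits, dim=1)
+        correct += (preds == targets).sum().item()
+        total += targets.size(0)
+print('Test accuracy: %.4f' % (correct / total))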