Commit e907646

Add files via upload

MinSong2 authored Jul 23, 2020
1 parent 7ff88a6 commit e907646

Showing 3 changed files with 431 additions and 0 deletions.
131 changes: 131 additions & 0 deletions examples/bert_en_word_embeddings.py
@@ -0,0 +1,131 @@
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from sklearn.metrics.pairwise import cosine_similarity

#pip install pytorch-pretrained-bert
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
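# Note (package naming, not required for this script): pytorch-pretrained-bert is
# the legacy package name for what is now the `transformers` library (used in
# bert_ko_word_embeddings.py below); the same pretrained BERT weights are
# available through either package.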

import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#1 Sentence Input:
#text = "Here is the sentence I want embeddings for."
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
marked_text = "[CLS] " + text + " [SEP]"
print (marked_text)

# We've imported a BERT-specific tokenizer; let's take a look at its output:
tokenized_text = tokenizer.tokenize(marked_text)
print (tokenized_text)

# Peek at a slice of the BERT vocabulary.
print(list(tokenizer.vocab.keys())[5000:5020])

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

for tup in zip(tokenized_text, indexed_tokens):
    print(tup)

segments_ids = [1] * len(tokenized_text)
print (segments_ids)
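# For reference: segment IDs only matter for sentence-*pair* inputs, where the
# IDs switch value at the second sentence, e.g. (hypothetical pair):
#   tokens:      [CLS] the dog barked [SEP] it was loud [SEP]
#   segment_ids:   0    0    0    0     0    1   1    1    1
# With a single sentence, a constant list is sufficient for this demo.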

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
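# Shapes at this point (for this sentence): tokens_tensor and segments_tensors
# are both [1, 22] -- a batch dimension of 1 and one position per token.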

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode: this disables dropout so the forward pass is deterministic.
model.eval()

# Predict hidden state features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

print ("Number of layers:", len(encoded_layers))
layer_i = 0

print ("Number of batches:", len(encoded_layers[layer_i]))
batch_i = 0

print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))


# For the 5th token in our sentence, select its feature values from layer 5.
token_i = 5
layer_i = 5
vec = encoded_layers[layer_i][batch_i][token_i]

# Plot the values as a histogram to show their distribution.
plt.figure(figsize=(10,10))
plt.hist(vec, bins=200)
plt.show()

# Convert the hidden state embeddings into single token vectors

# Holds the list of 12 layer embeddings for each token
# Will have the shape: [# tokens, # layers, # features]
token_embeddings = []

# For each token in the sentence...
for token_i in range(len(tokenized_text)):

    # Holds 12 layers of hidden states for each token
    hidden_layers = []

    # For each of the 12 layers...
    for layer_i in range(len(encoded_layers)):
        # Lookup the vector for `token_i` in `layer_i`
        vec = encoded_layers[layer_i][batch_i][token_i]

        hidden_layers.append(vec)

    token_embeddings.append(hidden_layers)

print('------------------------------------------------------------')

# Sanity check the dimensions:
print("Number of tokens in sequence:", len(token_embeddings))
print("Number of layers per token:", len(token_embeddings[0]))

concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072]

summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] # [number_of_tokens, 768]
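# Quick sanity check on the pooled vectors (a small sketch; dimensions assume
# bert-base, i.e. 768 hidden units per layer):
assert concatenated_last_4_layers[0].shape[0] == 4 * 768   # 3,072 values per token
assert summed_last_4_layers[0].shape[0] == 768              # 768 values per token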

sentence_embedding = torch.mean(encoded_layers[11], 1)

print ("Our final sentence embedding vector of shape:"), sentence_embedding[0].shape[0]

print (text)
for i, x in enumerate(tokenized_text):
    print(i, x)

print ("First fifteen values of 'bank' as in 'bank robber':")
print (summed_last_4_layers[10][:15])

print ("First fifteen values of 'bank' as in 'bank vault':")
print(summed_last_4_layers[6][:15])

print ("First fifteen values of 'bank' as in 'river bank':")
print(summed_last_4_layers[19][:15])

# Compare "bank" as in "bank robber" to "bank" as in "river bank"
different_bank = cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[19].reshape(1,-1))[0][0]

# Compare "bank" as in "bank robber" to "bank" as in "bank vault"
same_bank = cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[6].reshape(1,-1))[0][0]

print ("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'bank vault':", same_bank)

print ("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'river bank':", different_bank)




196 changes: 196 additions & 0 deletions examples/bert_ko_word_embeddings.py
@@ -0,0 +1,196 @@

from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

#import pytorch_pretrained_bert as ppb
import torch

#bert multi-lingual model
#https://github.com/google-research/bert/blob/master/multilingual.md


tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
text = "예로부터 말이 많은 사람은 말로 망하고 밤중에 말 타고 토끼를 데리고 도망치는 경우가 많다."
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

print(list(tokenizer.vocab.keys())[5000:5020])

# Map the token strings to their vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indices.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))


#segment ID
# Mark each token as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

print (segments_ids)

'''
3. Extracting Embeddings
3.1. Running BERT on our text
Next we need to convert our data to torch tensors and call the BERT model.
The BERT PyTorch interface requires that the data be in torch tensors rather than Python lists,
so we convert the lists here - this does not change the shape or the data.
'''
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])


device = torch.device("cpu")

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)
model.to(device)

# Put the model in "evaluation" mode: this disables dropout so the forward pass is deterministic.
model.eval()

# Run the text through BERT, and collect all of the hidden states produced
# from the embedding layer and all 12 encoder layers.
with torch.no_grad():
    # In `transformers`, segment (token type) IDs are a keyword argument;
    # passed positionally they would be interpreted as the attention mask.
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)

# Evaluating the model will return a different number of objects based on
# how it's configured in the `from_pretrained` call earlier. In this case,
# because we set `output_hidden_states = True`, the third item will be the
# hidden states from all layers. See the documentation for more details:
# https://huggingface.co/transformers/model_doc/bert.html#bertmodel
hidden_states = outputs[2]
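# Note (depends on the installed `transformers` version): newer releases return
# a model-output object from the forward pass, in which case the same tensors
# are also reachable by name as `outputs.hidden_states`.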


print ("Number of layers:", len(hidden_states), " (initial embeddings + 12 BERT layers)")
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

# For the 5th token in our sentence, select its feature values from layer 5.
token_i = 5
layer_i = 5
vec = hidden_states[layer_i][batch_i][token_i]

# Plot the values as a histogram to show their distribution.
plt.figure(figsize=(10,10))
plt.hist(vec, bins=200)
plt.show()


# `hidden_states` is a Python list.
print(' Type of hidden_states: ', type(hidden_states))

# Each layer in the list is a torch tensor.
print('Tensor shape for each layer: ', hidden_states[0].size())

#Let’s combine the layers to make this one whole big tensor.
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(hidden_states, dim=0)
print(str(token_embeddings.size()))

#Let’s get rid of the “batches” dimension since we don’t need it.
# Remove dimension 1, the "batches".
token_embeddings = torch.squeeze(token_embeddings, dim=1)
print(str(token_embeddings.size()))

#Finally, we can switch around the “layers” and “tokens” dimensions with permute.
# Swap dimensions 0 and 1.
token_embeddings = token_embeddings.permute(1,0,2)
print(str(token_embeddings.size()))
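# Sanity check (a small sketch): after the permute the dimensions should read
# [# tokens, # layers, # features] for this sentence and model.
assert token_embeddings.size(0) == len(tokenized_text)
assert token_embeddings.size(1) == len(hidden_states)   # 13 = embeddings + 12 layers
assert token_embeddings.size(2) == 768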


# Stores the token vectors, with shape [# tokens x 3,072]
token_vecs_cat = []

# `token_embeddings` is a [# tokens x 13 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    # `token` is a [13 x 768] tensor

    # Concatenate the vectors (that is, append them together) from the last
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)

    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)

print('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))


#As an alternative method, let’s try creating the word vectors by summing together the last four layers.
# Stores the token vectors, with shape [# tokens x 768]
token_vecs_sum = []

# `token_embeddings` is a [# tokens x 13 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    # `token` is a [13 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)

    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))


# `token_vecs` is a tensor with shape [# tokens x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all of the token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)
print ("Our final sentence embedding vector of shape:", sentence_embedding.size())


for i, token_str in enumerate(tokenized_text):
    print(i, token_str)

'''
The three instances of "말" in the tokenized sentence are at indices 3, 9, and 15.
For this analysis, we'll use the word vectors that we created by summing the last four layers.
We can try printing out their vectors to compare them.
'''
print('First 10 vector values for each instance of "말".')
print('')
print("말 (언어) ", str(token_vecs_sum[3][:10]))
print("말 (언어) ", str(token_vecs_sum[9][:10]))
print("말 (동물) ", str(token_vecs_sum[15][:10]))

'''
We can see that the values differ, but let’s calculate the cosine similarity between the vectors to make a more precise comparison.
'''

# Calculate the cosine similarity between the vectors for "말" (word/speech) and "말" (horse).
diff_word = 1 - cosine(token_vecs_sum[3], token_vecs_sum[15])

# Calculate the cosine similarity between the two vectors for "말" (word/speech).
same_word = 1 - cosine(token_vecs_sum[3], token_vecs_sum[9])

print('Vector similarity for *similar* meanings: %.2f' % same_word)
print('Vector similarity for *different* meanings: %.2f' % diff_word)