Commit: Load official pre-trained model

CyberZHG committed Nov 13, 2018
1 parent f98382a commit 3d60647
Showing 16 changed files with 241 additions and 166 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -3,8 +3,7 @@
[![Travis](https://travis-ci.org/CyberZHG/keras-bert.svg)](https://travis-ci.org/CyberZHG/keras-bert)
[![Coverage](https://coveralls.io/repos/github/CyberZHG/keras-bert/badge.svg?branch=master)](https://coveralls.io/github/CyberZHG/keras-bert)

-Implementation of the paper: [BERT: Pre-training of Deep Bidirectional Transformers for
-Language Understanding](https://arxiv.org/pdf/1810.04805.pdf)
+Implementation of [BERT](https://arxiv.org/pdf/1810.04805.pdf). Official pre-trained models can be loaded for feature extraction.

## Install

@@ -14,6 +13,10 @@ pip install keras-bert

## Usage

### Load Official Pre-trained Models

See [load model demo](./demo/load_model). You should be able to get the same feature extraction results as the official model.

### Train & Use

```python
...
```
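For orientation, here is a minimal sketch of how the loader advertised in the README is meant to be called. The checkpoint paths are hypothetical placeholders for an unpacked official BERT release from google-research/bert; only `load_trained_model_from_checkpoint` itself comes from this commit.

```python
from keras_bert import load_trained_model_from_checkpoint

# Hypothetical paths: any official checkpoint unpacked locally should work,
# e.g. chinese_L-12_H-768_A-12 from the google-research/bert releases.
config_path = 'chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'chinese_L-12_H-768_A-12/bert_model.ckpt'

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
model.summary(line_length=120)
```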
Empty file added demo/load_model/__init__.py
93 changes: 93 additions & 0 deletions demo/load_model/load_model.py
@@ -0,0 +1,93 @@
import sys
import codecs
import numpy as np
from keras_bert import load_trained_model_from_checkpoint


if len(sys.argv) != 4:
    print('python load_model.py CONFIG_PATH CHECKPOINT_PATH DICT_PATH')
    sys.exit(-1)  # exit early; the unpacking below assumes three paths

config_path, checkpoint_path, dict_path = tuple(sys.argv[1:])

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
model.summary(line_length=120)

tokens = ['[CLS]', '语', '言', '模', '型', '[SEP]']

token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

token_input = np.asarray([[token_dict[token] for token in tokens] + [0] * (512 - len(tokens))])
seg_input = np.asarray([[0] * len(tokens) + [0] * (512 - len(tokens))])
pos_input = np.asarray([list(range(len(tokens))) + [0] * (512 - len(tokens))])

print(token_input[0][:len(tokens)])

predicts = model.predict([token_input, seg_input, pos_input])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])

"""Official outputs:
{
"linex_index": 0,
"features": [
{
"token": "[CLS]",
"layers": [
{
"index": -1,
"values": [-0.63251, 0.203023, 0.079366, -0.032843, 0.566809, ...]
}
]
},
{
"token": "语",
"layers": [
{
"index": -1,
"values": [-0.758835, 0.096518, 1.071875, 0.005038, 0.688799, ...]
}
]
},
{
"token": "言",
"layers": [
{
"index": -1,
"values": [0.547702, -0.792117, 0.444354, -0.711265, 1.20489, ...]
}
]
},
{
"token": "模",
"layers": [
{
"index": -1,
"values": [-0.292423, 0.605271, 0.499686, -0.42458, 0.428554, ...]
}
]
},
{
"token": "型",
"layers": [
{
"index": -1,
"values": [ -0.747346, 0.494315, 0.718516, -0.872353, 0.83496, ...]
}
]
},
{
"token": "[SEP]",
"layers": [
{
"index": -1,
"values": [-0.874138, -0.216504, 1.338839, -0.105871, 0.39609, ...]
}
]
}
]
}
"""
16 changes: 16 additions & 0 deletions demo/visualization/vis.py
@@ -27,3 +27,19 @@
model.summary(line_length=120)
output_path = os.path.join(current_path, 'bert_big.png')
keras.utils.plot_model(model, show_shapes=True, to_file=output_path)

inputs, outputs = get_model(
    token_num=30000,
    pos_num=512,
    transformer_num=12,
    head_num=12,
    embed_dim=768,
    feed_forward_dim=768 * 4,
    training=False,
)
model = keras.models.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='mse', metrics={})
model.summary(line_length=120)
current_path = os.path.dirname(os.path.abspath(__file__))
output_path = os.path.join(current_path, 'bert_trained.png')
keras.utils.plot_model(model, show_shapes=True, to_file=output_path)
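A practical note on this demo: `keras.utils.plot_model` only works when the `pydot` package and the system Graphviz binaries are installed, neither of which this commit declares. A guard one could add near the top of `vis.py` (the wording of the message is mine):

```python
try:
    import pydot  # noqa: F401 -- plot_model renders through pydot + Graphviz
except ImportError:
    raise SystemExit('vis.py needs pydot and Graphviz: pip install pydot, '
                     'then install the graphviz system package')
```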
5 changes: 2 additions & 3 deletions keras_bert/__init__.py
@@ -1,3 +1,2 @@
-from __future__ import absolute_import
-
-from .bert import get_model, get_custom_objects, get_base_dict, gen_batch_inputs
+from .bert import gelu, get_model, get_custom_objects, get_base_dict, gen_batch_inputs
+from .loader import load_trained_model_from_checkpoint
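A one-line smoke test of the new public surface, assuming only the names exported above (my example, not part of the commit):

```python
from keras_bert import gelu, get_model, load_trained_model_from_checkpoint
```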
30 changes: 16 additions & 14 deletions keras_bert/bert.py
@@ -1,9 +1,10 @@
import random
import keras
import numpy as np
-from keras_transformer import gelu, get_encoders
+import tensorflow as tf
+from keras_transformer import get_encoders
from keras_transformer import get_custom_objects as get_encoder_custom_objects
-from .layers import (get_inputs, Embeddings, Masked, Extract)
+from .layers import (get_inputs, get_embedding, Masked, Extract)


TOKEN_PAD = '' # Token for padding
@@ -13,6 +14,10 @@
TOKEN_MASK = '<MASK>' # Token for masking


def gelu(x):
    return 0.5 * x * (1.0 + tf.erf(x / tf.sqrt(2.0)))


def get_model(token_num,
              pos_num=512,
              seq_len=512,
@@ -45,14 +50,13 @@ def get_model(token_num,
    :return: The compiled model.
    """
    inputs = get_inputs(seq_len=seq_len)
-    embed_layer = Embeddings(
-        input_dim=token_num,
-        output_dim=embed_dim,
-        position_dim=pos_num,
+    embed_layer = get_embedding(
+        inputs,
+        token_num=token_num,
+        embed_dim=embed_dim,
+        pos_num=pos_num,
        dropout_rate=dropout_rate,
-        trainable=training,
-        name='Embeddings',
-    )(inputs[:3])
+    )
    transformed = embed_layer
    if custom_layers is not None:
        kwargs = {}
@@ -65,23 +69,22 @@ def get_model(token_num,
        input_layer=transformed,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
-        activation=gelu,
+        attention_activation=None,
+        feed_forward_activation=gelu,
        dropout_rate=dropout_rate,
    )
    if not training:
-        return inputs, transformed
+        return inputs[:3], transformed
    mlm_pred_layer = keras.layers.Dense(
        units=token_num,
        activation='softmax',
-        trainable=training,
        name='Dense-MLM',
    )(transformed)
    masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_pred_layer = keras.layers.Dense(
        units=2,
        activation='softmax',
-        trainable=training,
        name='NSP',
    )(extract_layer)
    model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
@@ -96,7 +99,6 @@ def get_model(token_num,
def get_custom_objects():
    """Get all custom objects for loading saved models."""
    custom_objects = get_encoder_custom_objects()
-    custom_objects['Embeddings'] = Embeddings
    custom_objects['Masked'] = Masked
    custom_objects['Extract'] = Extract
    custom_objects['gelu'] = gelu
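The `gelu` defined in this diff is the exact erf-based Gaussian Error Linear Unit, GELU(x) = x · Φ(x), where Φ is the standard normal CDF. A small pure-Python check of that identity, for illustration only:

```python
import math

def gelu_ref(x):
    # Same formula as the diff, with math.erf in place of tf.erf.
    return 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))

for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    phi = 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))  # standard normal CDF at x
    assert abs(gelu_ref(x) - x * phi) < 1e-12
```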
3 changes: 1 addition & 2 deletions keras_bert/layers/__init__.py
@@ -1,5 +1,4 @@
-from .wrapper import Wrapper
from .inputs import get_inputs
-from .embedding import Embeddings
+from .embedding import get_embedding
from .masked import Masked
from .extract import Extract
117 changes: 31 additions & 86 deletions keras_bert/layers/embedding.py
@@ -1,99 +1,44 @@
import keras
-import keras.backend as K
-from .wrapper import Wrapper
+from keras_layer_normalization import LayerNormalization


-class Embeddings(Wrapper):
+def get_embedding(inputs, token_num, pos_num, embed_dim, dropout_rate=0.1):
    """Get embedding layer.
    See: https://arxiv.org/pdf/1810.04805.pdf
-    """
-
-    def __init__(self,
-                 input_dim,
-                 output_dim,
-                 position_dim=512,
-                 dropout_rate=0.1,
-                 **kwargs):
-        """Initialize the layer.
-        :param input_dim: Number of tokens.
-        :param output_dim: The dimension of all embedding layers.
-        :param position_dim: Maximum position.
-        :param dropout_rate: Dropout rate.
-        """
-        self.supports_masking = True
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        self.position_dim = position_dim
-        self.dropout_rate = dropout_rate
-        super(Embeddings, self).__init__(**kwargs)
-
-    def get_config(self):
-        config = {
-            'input_dim': self.input_dim,
-            'output_dim': self.output_dim,
-            'position_dim': self.position_dim,
-            'dropout_rate': self.dropout_rate,
-        }
-        base_config = super(Embeddings, self).get_config()
-        return dict(list(base_config.items()) + list(config.items()))
-
-    def compute_output_shape(self, input_shape):
-        return input_shape[0] + (self.output_dim,)
-
-    def compute_mask(self, inputs, input_mask=None):
-        return K.not_equal(inputs[0], 0)
-
-    def build(self, input_shape):
-        self.layers['Embedding-Token'] = keras.layers.Embedding(
-            input_dim=self.input_dim,
-            output_dim=self.output_dim,
+    :param inputs: Input layers.
+    :param token_num: Number of tokens.
+    :param pos_num: Maximum position.
+    :param embed_dim: The dimension of all embedding layers.
+    :param dropout_rate: Dropout rate.
+    :return: The merged embedding layer.
+    """
+    embeddings = [
+        keras.layers.Embedding(
+            input_dim=token_num,
+            output_dim=embed_dim,
            mask_zero=True,
-            trainable=self.trainable,
            name='Embedding-Token',
-        )
-        self.layers['Embedding-Segment'] = keras.layers.Embedding(
+        )(inputs[0]),
+        keras.layers.Embedding(
            input_dim=2,
-            output_dim=self.output_dim,
-            trainable=self.trainable,
+            output_dim=embed_dim,
            name='Embedding-Segment',
-        )
-        self.layers['Embedding-Position'] = keras.layers.Embedding(
-            input_dim=self.position_dim,
-            output_dim=self.output_dim,
-            trainable=self.trainable,
+        )(inputs[1]),
+        keras.layers.Embedding(
+            input_dim=pos_num,
+            output_dim=embed_dim,
            name='Embedding-Position',
-        )
-        self.layers['Dropout-Token'] = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
-            name='Dropout-Token',
-        )
-        self.layers['Dropout-Segment'] = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
-            name='Dropout-Segment',
-        )
-        self.layers['Dropout-Position'] = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
-            name='Dropout-Position',
-        )
-        self.layers['Embedding'] = keras.layers.Add(name='Embedding')
-        self.layers['Embedding-Dropout'] = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
+        )(inputs[2]),
+    ]
+    embed_layer = keras.layers.Add(name='Embedding')(embeddings)
+    if dropout_rate > 0.0:
+        dropout_layer = keras.layers.Dropout(
+            rate=dropout_rate,
            name='Embedding-Dropout',
-        )
-        super(Embeddings, self).build(input_shape)
-
-    def call(self, inputs, **kwargs):
-        input_token, input_segment, input_position = inputs[:3]
-        dropouts = [
-            self.layers['Dropout-Token'](self.layers['Embedding-Token'](input_token)),
-            self.layers['Dropout-Segment'](self.layers['Embedding-Segment'](input_segment)),
-            self.layers['Dropout-Position'](self.layers['Embedding-Position'](input_position)),
-        ]
-        embed_layer = self.layers['Embedding'](dropouts)
-        return self.layers['Embedding-Dropout'](embed_layer)
+        )(embed_layer)
+    else:
+        dropout_layer = embed_layer
+    norm_layer = LayerNormalization(name='Embedding-Norm')(dropout_layer)
+    return norm_layer
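After this refactor the embedding stage is a plain function over the input tensors: token, segment and position embeddings are summed, passed through dropout, then layer-normalized. A hedged end-to-end sketch of calling it directly; the hyperparameters are illustrative, and the handling of the fourth input mirrors the `return inputs[:3], transformed` path in `bert.py` above:

```python
import keras
from keras_bert.layers import get_inputs, get_embedding

inputs = get_inputs(seq_len=512)
embed_layer = get_embedding(
    inputs,
    token_num=30000,  # illustrative vocabulary size
    pos_num=512,
    embed_dim=768,
    dropout_rate=0.1,
)
# Only the first three inputs (token, segment, position) feed the
# embedding; the fourth is used by the MLM head in get_model.
model = keras.models.Model(inputs=inputs[:3], outputs=embed_layer)
model.summary(line_length=120)
```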
(The remaining 8 changed files in this commit are not shown.)