diff --git a/keras_bert/activations/__init__.py b/keras_bert/activations/__init__.py
deleted file mode 100644
index aa23898..0000000
--- a/keras_bert/activations/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .gelu import gelu
diff --git a/keras_bert/activations/gelu.py b/keras_bert/activations/gelu.py
deleted file mode 100644
index aa8ec9b..0000000
--- a/keras_bert/activations/gelu.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import math
-import keras.backend as K
-
-
-def gelu(x):
-    """An approximation of gelu.
-
-    See: https://arxiv.org/pdf/1606.08415.pdf
-    """
-    if K.backend() == 'tensorflow':
-        import tensorflow as tf
-        return 0.5 * x * (1.0 + tf.erf(x / tf.sqrt(2.0)))
-    return 0.5 * x * (1.0 + K.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * K.pow(x, 3))))
diff --git a/keras_bert/bert.py b/keras_bert/bert.py
index e72a27b..0c6beb0 100644
--- a/keras_bert/bert.py
+++ b/keras_bert/bert.py
@@ -1,10 +1,9 @@
 import random
 import keras
 import numpy as np
-from keras_multi_head import MultiHeadAttention
-from .layers import (get_inputs, Embeddings, Transformer,
-                     FeedForward, Masked, Extract, LayerNormalization)
-from .activations import gelu
+from keras_transformer import gelu, get_encoders
+from keras_transformer import get_custom_objects as get_encoder_custom_objects
+from .layers import (get_inputs, Embeddings, Masked, Extract)
 
 
 TOKEN_PAD = ''  # Token for padding
@@ -61,14 +60,14 @@ def get_model(token_num,
             kwargs['trainable'] = training
         transformed = custom_layers(transformed, **kwargs)
     else:
-        for i in range(transformer_num):
-            transformed = Transformer(
-                head_num=head_num,
-                hidden_dim=feed_forward_dim,
-                dropout_rate=dropout_rate,
-                trainable=training,
-                name='Transformer-%d' % (i + 1),
-            )(transformed)
+        transformed = get_encoders(
+            encoder_num=transformer_num,
+            input_layer=transformed,
+            head_num=head_num,
+            hidden_dim=feed_forward_dim,
+            activation=gelu,
+            dropout_rate=dropout_rate,
+        )
     if not training:
         return inputs, transformed
     mlm_pred_layer = keras.layers.Dense(
@@ -96,16 +95,12 @@ def get_model(token_num,
 
 def get_custom_objects():
     """Get all custom objects for loading saved models."""
-    return {
-        'Embeddings': Embeddings,
-        'MultiHeadAttention': MultiHeadAttention,
-        'FeedForward': FeedForward,
-        'LayerNormalization': LayerNormalization,
-        'Transformer': Transformer,
-        'Masked': Masked,
-        'Extract': Extract,
-        'gelu': gelu,
-    }
+    custom_objects = get_encoder_custom_objects()
+    custom_objects['Embeddings'] = Embeddings
+    custom_objects['Masked'] = Masked
+    custom_objects['Extract'] = Extract
+    custom_objects['gelu'] = gelu
+    return custom_objects
 
 
 def get_base_dict():
diff --git a/keras_bert/layers/__init__.py b/keras_bert/layers/__init__.py
index f59cacf..0c030d5 100644
--- a/keras_bert/layers/__init__.py
+++ b/keras_bert/layers/__init__.py
@@ -1,8 +1,5 @@
 from .wrapper import Wrapper
 from .inputs import get_inputs
 from .embedding import Embeddings
-from .feed_forward import FeedForward
-from .layer_norm import LayerNormalization
-from .transformer import Transformer
 from .masked import Masked
 from .extract import Extract
diff --git a/keras_bert/layers/feed_forward.py b/keras_bert/layers/feed_forward.py
deleted file mode 100644
index cf9628c..0000000
--- a/keras_bert/layers/feed_forward.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import keras
-import keras.backend as K
-from ..activations import gelu
-
-
-class FeedForward(keras.layers.Layer):
-    """Position-wise feed-forward layer.
-
-    See: https://arxiv.org/pdf/1706.03762.pdf
-    """
-
-    def __init__(self, hidden_dim, **kwargs):
-        self.supports_masking = True
-        self.hidden_dim = hidden_dim
-        self.W1, self.b1 = None, None
-        self.W2, self.b2 = None, None
-        super(FeedForward, self).__init__(**kwargs)
-
-    def get_config(self):
-        config = {
-            'hidden_dim': self.hidden_dim,
-        }
-        base_config = super(FeedForward, self).get_config()
-        return dict(list(base_config.items()) + list(config.items()))
-
-    def compute_output_shape(self, input_shape):
-        return input_shape
-
-    def compute_mask(self, inputs, input_mask=None):
-        return input_mask
-
-    def build(self, input_shape):
-        feature_dim = input_shape[-1]
-        self.W1 = self.add_weight(shape=(feature_dim, self.hidden_dim),
-                                  name='{}_W1'.format(self.name),
-                                  initializer=keras.initializers.get('glorot_normal'))
-        self.b1 = self.add_weight(shape=(self.hidden_dim,),
-                                  name='{}_b1'.format(self.name),
-                                  initializer=keras.initializers.get('zeros'))
-        self.W2 = self.add_weight(shape=(self.hidden_dim, feature_dim),
-                                  name='{}_W2'.format(self.name),
-                                  initializer=keras.initializers.get('glorot_normal'))
-        self.b2 = self.add_weight(shape=(feature_dim,),
-                                  name='{}_b2'.format(self.name),
-                                  initializer=keras.initializers.get('zeros'))
-        super(FeedForward, self).build(input_shape)
-
-    def call(self, x, mask=None):
-        return K.dot(gelu(K.dot(x, self.W1) + self.b1), self.W2) + self.b2
diff --git a/keras_bert/layers/layer_norm.py b/keras_bert/layers/layer_norm.py
deleted file mode 100644
index 1e58485..0000000
--- a/keras_bert/layers/layer_norm.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import keras
-import keras.backend as K
-
-
-class LayerNormalization(keras.layers.Layer):
-    """Layer normalization.
-
-    See: https://arxiv.org/pdf/1607.06450.pdf
-    """
-
-    def __init__(self, **kwargs):
-        self.supports_masking = True
-        self.gamma, self.beta = None, None
-        super(LayerNormalization, self).__init__(**kwargs)
-
-    def compute_output_shape(self, input_shape):
-        return input_shape
-
-    def compute_mask(self, inputs, input_mask=None):
-        return input_mask
-
-    def build(self, input_shape):
-        self.gamma = self.add_weight(shape=input_shape[-1:],
-                                     name='{}_gamma'.format(self.name),
-                                     initializer=keras.initializers.get('ones'))
-        self.beta = self.add_weight(shape=input_shape[-1:],
-                                    name='{}_beta'.format(self.name),
-                                    initializer=keras.initializers.get('zeros'))
-        super(LayerNormalization, self).build(input_shape)
-
-    def call(self, x, mask=None):
-        mean = K.mean(x, axis=-1, keepdims=True)
-        std = K.std(x, axis=-1, keepdims=True)
-        return self.gamma * (x - mean) / (std + K.epsilon()) + self.beta
diff --git a/keras_bert/layers/transformer.py b/keras_bert/layers/transformer.py
deleted file mode 100644
index 0f6511f..0000000
--- a/keras_bert/layers/transformer.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import keras
-from keras_multi_head import MultiHeadAttention
-from .layer_norm import LayerNormalization
-from .feed_forward import FeedForward
-from .wrapper import Wrapper
-from ..activations import gelu
-
-
-class Transformer(Wrapper):
-    """Generate a set of transformer layers.
-
-    See: https://arxiv.org/pdf/1706.03762.pdf
-    """
-
-    def __init__(self,
-                 head_num,
-                 hidden_dim,
-                 dropout_rate=0.1,
-                 **kwargs):
-        """Initialize the layer.
-
-        :param head_num: Number of heads.
-        :param hidden_dim: Hidden dimension for feed forward layer.
-        :param dropout_rate: Dropout rate.
-        """
-        self.supports_masking = True
-        self.head_num = head_num
-        self.hidden_dim = hidden_dim
-        self.dropout_rate = dropout_rate
-        super(Transformer, self).__init__(**kwargs)
-
-    def get_config(self):
-        config = {
-            'head_num': self.head_num,
-            'hidden_dim': self.hidden_dim,
-            'dropout_rate': self.dropout_rate,
-        }
-        base_config = super(Transformer, self).get_config()
-        return dict(list(base_config.items()) + list(config.items()))
-
-    def compute_output_shape(self, input_shape):
-        return input_shape
-
-    def compute_mask(self, inputs, input_mask=None):
-        return input_mask
-
-    def build(self, input_shape):
-        layer = MultiHeadAttention(
-            head_num=self.head_num,
-            trainable=self.trainable,
-            kernel_activation=gelu,
-            name='%s-MultiHead' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = LayerNormalization(
-            trainable=self.trainable,
-            name='%s-MultiHead-Norm' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
-            name='%s-MultiHead-Dropout' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = keras.layers.Add(
-            trainable=self.trainable,
-            name='%s-MultiHead-Add' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = FeedForward(
-            hidden_dim=self.hidden_dim,
-            trainable=self.trainable,
-            name='%s-FeedForward' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = LayerNormalization(
-            trainable=self.trainable,
-            name='%s-FeedForward-Norm' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
-            name='%s-FeedForward-Dropout' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = keras.layers.Add(
-            trainable=self.trainable,
-            name='%s-FeedForward-Add' % self.name,
-        )
-        self.layers[layer.name] = layer
-        super(Transformer, self).build(input_shape)
-
-    def call(self, inputs, mask=None):
-        multi_head_layer = self.layers['%s-MultiHead' % self.name](inputs)
-        multi_head_norm = self.layers['%s-MultiHead-Norm' % self.name](multi_head_layer)
-        multi_head_dropout_layer = self.layers['%s-MultiHead-Dropout' % self.name](multi_head_norm)
-        multi_head_residual_layer = self.layers['%s-MultiHead-Add' % self.name]([inputs, multi_head_dropout_layer])
-        feed_forward_layer = self.layers['%s-FeedForward' % self.name](multi_head_residual_layer)
-        feed_forward_norm = self.layers['%s-FeedForward-Norm' % self.name](feed_forward_layer)
-        feed_forward_dropout_layer = self.layers['%s-FeedForward-Dropout' % self.name](feed_forward_norm)
-        feed_forward_residual_layer = self.layers['%s-FeedForward-Add' % self.name](
-            [multi_head_residual_layer, feed_forward_dropout_layer]
-        )
-        return feed_forward_residual_layer
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 3abf46d..e67855c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,3 @@
+tensorflow
 pycodestyle
 coverage
diff --git a/requirements.txt b/requirements.txt
index f5e1732..c466c7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 numpy
-tensorflow
 Keras
-keras-multi-head==0.7.0
+keras-transformer==0.4.0
diff --git a/setup.py b/setup.py
index 48471fd..f672aaa 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='keras-bert',
-    version='0.13.0',
+    version='0.14.0',
     packages=find_packages(),
     url='https://github.com/CyberZHG/keras-bert',
     license='MIT',
@@ -13,7 +13,7 @@
     install_requires=[
         'numpy',
         'keras',
-        'keras-multi-head==0.7.0',
+        'keras-transformer==0.4.0',
     ],
     classifiers=(
         "Programming Language :: Python :: 2.7",
diff --git a/tests/activations/__init__.py b/tests/activations/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/activations/test_gelu.py b/tests/activations/test_gelu.py
deleted file mode 100644
index 3a7ab99..0000000
--- a/tests/activations/test_gelu.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import unittest
-import keras.backend as K
-from keras_bert.activations import gelu
-
-
-class TestGelu(unittest.TestCase):
-
-    def test_sample(self):
-        results = gelu(K.constant([-30.0, -1.0, 0.0, 1.0, 30.0])).eval(session=K.get_session())
-        self.assertEqual(0.0, results[0])
-        self.assertGreater(0.0, results[1])
-        self.assertLess(-1.0, results[1])
-        self.assertEqual(0.0, results[2])
-        self.assertGreater(1.0, results[3])
-        self.assertLess(0.0, results[3])
-        self.assertEqual(30.0, results[4])
diff --git a/tests/layers/test_embedding.py b/tests/layers/test_embedding.py
index b9669c6..a9b8a57 100644
--- a/tests/layers/test_embedding.py
+++ b/tests/layers/test_embedding.py
@@ -12,7 +12,7 @@ def test_sample(self):
         model.compile(
             optimizer='adam',
             loss='mse',
-            metrics=['mse'],
+            metrics={},
         )
         model.summary(line_length=120)
         self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
diff --git a/tests/layers/test_extract.py b/tests/layers/test_extract.py
index e05989e..f17ed6c 100644
--- a/tests/layers/test_extract.py
+++ b/tests/layers/test_extract.py
@@ -22,7 +22,7 @@ def test_sample(self):
         model.compile(
             optimizer='adam',
             loss='mse',
-            metrics=['mse'],
+            metrics={},
         )
         model.summary()
         inputs = np.asarray([[
diff --git a/tests/layers/test_feed_forward.py b/tests/layers/test_feed_forward.py
deleted file mode 100644
index 8187b71..0000000
--- a/tests/layers/test_feed_forward.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import unittest
-import keras
-import numpy as np
-from keras_bert.layers import FeedForward
-
-
-class TestFeedForward(unittest.TestCase):
-
-    def test_sample(self):
-        input_layer = keras.layers.Input(
-            shape=(1, 3),
-            name='Input',
-        )
-        feed_forward_layer = FeedForward(
-            hidden_dim=4,
-            weights=[
-                np.asarray([
-                    [0.1, 0.2, 0.3, 0.4],
-                    [-0.1, 0.2, -0.3, 0.4],
-                    [0.1, -0.2, 0.3, -0.4],
-                ]),
-                np.asarray([
-                    0.0, -0.1, 0.2, -0.3,
-                ]),
-                np.asarray([
-                    [0.1, 0.2, 0.3],
-                    [-0.1, 0.2, -0.3],
-                    [0.1, -0.2, 0.3],
-                    [-0.1, 0.2, 0.3],
-                ]),
-                np.asarray([
-                    0.0, 0.1, -0.2,
-                ]),
-            ],
-            name='FeedForward',
-        )(input_layer)
-        model = keras.models.Model(
-            inputs=input_layer,
-            outputs=feed_forward_layer,
-        )
-        model.compile(
-            optimizer='adam',
-            loss='mse',
-            metrics=['mse'],
-        )
-        model.summary()
-        inputs = np.array([[[0.2, 0.1, 0.3]]])
-        predict = model.predict(inputs)
-        expected = np.asarray([[[0.0381447, 0.03196586, -0.15434185]]])
-        self.assertTrue(np.allclose(expected, predict), predict)
-
-    def test_fit(self):
-        input_layer = keras.layers.Input(
-            shape=(1, 3),
-            name='Input',
-        )
-        feed_forward_layer = FeedForward(
-            hidden_dim=4,
-            name='FeedForward',
-        )(input_layer)
-        model = keras.models.Model(
-            inputs=input_layer,
-            outputs=feed_forward_layer,
-        )
-        model.compile(
-            optimizer='adam',
-            loss='mse',
-            metrics=['mse'],
-        )
-
-        def _generator(batch_size=32):
-            while True:
-                inputs = np.random.random((batch_size, 1, 3))
-                outputs = inputs * 0.8 + 0.3
-                yield inputs, outputs
-
-        for _ in range(3):
-            model.fit_generator(
-                generator=_generator(),
-                steps_per_epoch=1000,
-                epochs=30,
-                validation_data=_generator(),
-                validation_steps=100,
-                callbacks=[
-                    keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
-                ],
-            )
-            for inputs, _ in _generator(batch_size=3):
-                predicts = model.predict(inputs)
-                expect = np.round(inputs * 0.8 + 0.3, decimals=1)
-                actual = np.round(predicts, decimals=1)
-                if np.allclose(expect, actual):
-                    return
-                break
-        self.assertTrue(np.allclose(expect, actual), (expect, actual))
diff --git a/tests/layers/test_layer_norm.py b/tests/layers/test_layer_norm.py
deleted file mode 100644
index d5e7ed7..0000000
--- a/tests/layers/test_layer_norm.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import unittest
-import keras
-import numpy as np
-from keras_multi_head import MultiHeadAttention
-from keras_bert.activations import gelu
-from keras_bert.layers import LayerNormalization
-
-
-class TestLayerNorm(unittest.TestCase):
-
-    def test_sample(self):
-        input_layer = keras.layers.Input(
-            shape=(2, 3),
-            name='Input',
-        )
-        norm_layer = LayerNormalization(
-            name='Layer-Normalization',
-        )(input_layer)
-        model = keras.models.Model(
-            inputs=input_layer,
-            outputs=norm_layer,
-        )
-        model.compile(
-            optimizer='adam',
-            loss='mse',
-            metrics=['mse'],
-        )
-        model.summary()
-        inputs = np.array([[
-            [0.2, 0.1, 0.3],
-            [0.5, 0.1, 0.1],
-        ]])
-        predict = model.predict(inputs)
-        expected = np.asarray([[
-            [0.0, -1.22474487, 1.22474487],
-            [1.41421356, -0.707106781, -0.707106781],
-        ]])
-        self.assertTrue(np.allclose(expected, predict))
-
-    def test_fit(self):
-        input_layer = keras.layers.Input(
-            shape=(2, 3),
-            name='Input',
-        )
-        att_layer = MultiHeadAttention(
-            head_num=3,
-            kernel_activation=gelu,
-            name='Multi-Head-Attentions'
-        )(input_layer)
-        dense_layer = keras.layers.Dense(units=3, name='Dense-1')(att_layer)
-        norm_layer = LayerNormalization(
-            name='Layer-Normalization',
-            trainable=False,
-        )(dense_layer)
-        dense_layer = keras.layers.Dense(units=3, name='Dense-2')(norm_layer)
-        model = keras.models.Model(
-            inputs=input_layer,
-            outputs=dense_layer,
-        )
-        model.compile(
-            optimizer=keras.optimizers.Adam(lr=1e-3),
-            loss='mse',
-            metrics={},
-        )
-        model.summary()
-
-        def _generator(batch_size=32):
-            while True:
-                inputs = np.random.random((batch_size, 2, 3))
-                outputs = np.asarray([[[0.0, -0.1, 0.2]] * 2] * batch_size)
-                yield inputs, outputs
-
-        model.fit_generator(
-            generator=_generator(),
-            steps_per_epoch=1000,
-            epochs=30,
-            validation_data=_generator(),
-            validation_steps=100,
-            callbacks=[
-                keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
-            ],
-        )
-        for inputs, _ in _generator(batch_size=3):
-            predicts = model.predict(inputs)
-            expect = np.round(np.asarray([[[0.0, -0.1, 0.2]] * 2] * 3), decimals=1)
-            actual = np.round(predicts, decimals=1)
-            self.assertTrue(np.allclose(expect, actual), (expect, actual))
-            break
diff --git a/tests/layers/test_masked.py b/tests/layers/test_masked.py
index e78e786..307eee4 100644
--- a/tests/layers/test_masked.py
+++ b/tests/layers/test_masked.py
@@ -1,7 +1,8 @@
 import unittest
 import keras
 import numpy as np
-from keras_bert.layers import get_inputs, Embeddings, Transformer, Masked
+from keras_transformer import gelu, get_encoders
+from keras_bert.layers import get_inputs, Embeddings, Masked
 
 
 class TestMasked(unittest.TestCase):
@@ -36,12 +37,14 @@ def test_mask_result(self):
             mask_zero=True,
             name='Embedding',
         )(input_layer)
-        transformer_layer = Transformer(
+        transformer_layer = get_encoders(
+            encoder_num=1,
+            input_layer=embed_layer,
             head_num=1,
             hidden_dim=12,
+            activation=gelu,
             dropout_rate=0.1,
-            name='Transformer',
-        )(embed_layer)
+        )
         dense_layer = keras.layers.Dense(
             units=12,
             activation='softmax',
diff --git a/tests/layers/test_transformer.py b/tests/layers/test_transformer.py
deleted file mode 100644
index 85d082d..0000000
--- a/tests/layers/test_transformer.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import unittest
-import keras
-import numpy as np
-from keras_bert.layers import Transformer
-
-
-class TestTransfomer(unittest.TestCase):
-
-    def test_sample(self):
-        input_layer = keras.layers.Input(
-            shape=(512,),
-            name='Input',
-        )
-        embed_layer = keras.layers.Embedding(
-            input_dim=12,
-            output_dim=768,
-            mask_zero=True,
-            name='Embedding',
-        )(input_layer)
-        output_layer = Transformer(
-            head_num=12,
-            hidden_dim=768 * 4,
-            dropout_rate=0.001,
-            name='Transformer',
-        )(embed_layer)
-        model = keras.models.Model(inputs=input_layer, outputs=output_layer)
-        model.compile(
-            optimizer='adam',
-            loss='mse',
-            metrics=['mse'],
-        )
-        model.summary(line_length=120)
-        self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
-
-    def test_fit(self):
-        input_layer = keras.layers.Input(
-            shape=(2, 3),
-            name='Input',
-        )
-        dense_layer = keras.layers.Dense(units=3, name='Dense-1')(input_layer)
-        transformer_layer = Transformer(
-            head_num=3,
-            hidden_dim=12,
-            dropout_rate=0.001,
-            name='Transformer-1',
-        )(dense_layer)
-        transformer_layer = Transformer(
-            head_num=3,
-            hidden_dim=12,
-            dropout_rate=0.001,
-            name='Transformer-2',
-        )(transformer_layer)
-        dense_layer = keras.layers.Dense(units=3, name='Dense-2')(transformer_layer)
-        model = keras.models.Model(
-            inputs=input_layer,
-            outputs=dense_layer,
-        )
-        model.compile(
-            optimizer=keras.optimizers.Adam(lr=1e-3),
-            loss='mse',
-            metrics=['mse'],
-        )
-        model.summary()
-
-        def _generator(batch_size=32):
-            while True:
-                inputs = np.random.random((batch_size, 2, 3))
-                outputs = np.asarray([[[0.0, -0.1, 0.2]] * 2] * batch_size)
-                yield inputs, outputs
-
-        model.fit_generator(
-            generator=_generator(),
-            steps_per_epoch=1000,
-            epochs=10,
-            validation_data=_generator(),
-            validation_steps=100,
-            callbacks=[
-                keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
-            ],
-        )
-        for inputs, _ in _generator(batch_size=3):
-            predicts = model.predict(inputs)
-            expect = np.round(np.asarray([[[0.0, -0.1, 0.2]] * 2] * 3), decimals=1)
-            actual = np.round(predicts, decimals=1)
-            self.assertTrue(np.allclose(expect, actual), (expect, actual))
-            break
diff --git a/tests/test_bert_fit.h5 b/tests/test_bert_fit.h5
index f8388d7..563727c 100644
Binary files a/tests/test_bert_fit.h5 and b/tests/test_bert_fit.h5 differ
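
Migration sketch (illustrative, not part of the patch): after this change the encoder stack comes from the external keras-transformer package instead of the in-repo Transformer wrapper. The snippet below is a minimal sketch of the new construction path, using only the get_encoders/gelu signatures that appear in the hunks above (as in bert.py and test_masked.py); the toy sizes (vocabulary 12, embedding width 24, 2 encoders, 4 heads) are assumptions chosen for the example, not values taken from the patch.

    # Minimal sketch, assuming keras-transformer==0.4.0 as pinned in requirements.txt above.
    import keras
    from keras_transformer import gelu, get_encoders

    input_layer = keras.layers.Input(shape=(None,), name='Input')
    embed_layer = keras.layers.Embedding(
        input_dim=12,    # hypothetical vocabulary size
        output_dim=24,   # hypothetical embedding width (divisible by head_num)
        mask_zero=True,
        name='Embedding',
    )(input_layer)
    # One get_encoders call builds `encoder_num` stacked encoder blocks,
    # replacing the old per-layer Transformer loop removed from get_model.
    output_layer = get_encoders(
        encoder_num=2,
        input_layer=embed_layer,
        head_num=4,
        hidden_dim=96,   # 4 * embedding width, mirroring feed_forward_dim usage
        activation=gelu,
        dropout_rate=0.1,
    )
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()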