diff --git a/keras_bert/bert.py b/keras_bert/bert.py
index 894f775..e72a27b 100644
--- a/keras_bert/bert.py
+++ b/keras_bert/bert.py
@@ -1,8 +1,8 @@
 import random
 import keras
 import numpy as np
-from keras_self_attention import ScaledDotProductAttention
-from .layers import (get_inputs, Embeddings, Transformer, MultiHeadAttention,
+from keras_multi_head import MultiHeadAttention
+from .layers import (get_inputs, Embeddings, Transformer,
                      FeedForward, Masked, Extract, LayerNormalization)
 from .activations import gelu
 
@@ -98,7 +98,6 @@ def get_custom_objects():
     """Get all custom objects for loading saved models."""
     return {
         'Embeddings': Embeddings,
-        'ScaledDotProductAttention': ScaledDotProductAttention,
         'MultiHeadAttention': MultiHeadAttention,
         'FeedForward': FeedForward,
         'LayerNormalization': LayerNormalization,
diff --git a/keras_bert/layers/__init__.py b/keras_bert/layers/__init__.py
index d7ddc81..f59cacf 100644
--- a/keras_bert/layers/__init__.py
+++ b/keras_bert/layers/__init__.py
@@ -1,7 +1,6 @@
 from .wrapper import Wrapper
 from .inputs import get_inputs
 from .embedding import Embeddings
-from .multi_head import MultiHeadAttention
 from .feed_forward import FeedForward
 from .layer_norm import LayerNormalization
 from .transformer import Transformer
diff --git a/keras_bert/layers/multi_head.py b/keras_bert/layers/multi_head.py
deleted file mode 100644
index a979486..0000000
--- a/keras_bert/layers/multi_head.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import keras
-from keras_self_attention import ScaledDotProductAttention
-from ..activations.gelu import gelu
-from .wrapper import Wrapper
-
-
-class MultiHeadAttention(Wrapper):
-    """Generate multi-head attention layers.
-
-    See: https://arxiv.org/pdf/1706.03762.pdf
-    """
-
-    def __init__(self,
-                 head_num,
-                 dropout_rate=0.1,
-                 **kwargs):
-        """Initialize the layer.
-
-        :param head_num: Number of heads.
-        :param dropout_rate: Dropout rate.
-        :param feature_dim: The dimension of input feature.
-        """
-        self.supports_masking = True
-        self.head_num = head_num
-        self.dropout_rate = dropout_rate
-        super(MultiHeadAttention, self).__init__(**kwargs)
-
-    def get_config(self):
-        config = {
-            'head_num': self.head_num,
-            'dropout_rate': self.dropout_rate,
-        }
-        base_config = super(MultiHeadAttention, self).get_config()
-        return dict(list(base_config.items()) + list(config.items()))
-
-    def compute_output_shape(self, input_shape):
-        return input_shape
-
-    def compute_mask(self, inputs, input_mask=None):
-        return input_mask
-
-    def build(self, input_shape):
-        feature_dim = input_shape[-1]
-        if feature_dim % self.head_num != 0:
-            raise IndexError('Invalid head number %d with the given input dim %d' % (self.head_num, feature_dim))
-        for i in range(self.head_num):
-            layer = keras.layers.Dense(
-                units=feature_dim // self.head_num,
-                activation=gelu,
-                use_bias=False,
-                trainable=self.trainable,
-                name='%s-Dense-Q_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = keras.layers.Dense(
-                units=feature_dim // self.head_num,
-                activation=gelu,
-                use_bias=False,
-                trainable=self.trainable,
-                name='%s-Dense-K_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = keras.layers.Dense(
-                units=feature_dim // self.head_num,
-                activation=gelu,
-                use_bias=False,
-                trainable=self.trainable,
-                name='%s-Dense-V_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = keras.layers.Dropout(
-                rate=self.dropout_rate,
-                trainable=self.trainable,
-                name='%s-Dense-Dropout-Q_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = keras.layers.Dropout(
-                rate=self.dropout_rate,
-                trainable=self.trainable,
-                name='%s-Dense-Dropout-K_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = keras.layers.Dropout(
-                rate=self.dropout_rate,
-                trainable=self.trainable,
-                name='%s-Dense-Dropout-V_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = ScaledDotProductAttention(
-                trainable=self.trainable,
-                name='%s-Attention_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-            layer = keras.layers.Dropout(
-                rate=self.dropout_rate,
-                trainable=self.trainable,
-                name='%s-Attention-Dropout_%d' % (self.name, i + 1),
-            )
-            self.layers[layer.name] = layer
-        if self.head_num > 1:
-            layer = keras.layers.Concatenate(name='%s-Concat' % self.name)
-            self.layers[layer.name] = layer
-        layer = keras.layers.Dense(
-            units=feature_dim,
-            activation=gelu,
-            use_bias=False,
-            trainable=self.trainable,
-            name='%s-Dense_O' % self.name,
-        )
-        self.layers[layer.name] = layer
-        layer = keras.layers.Dropout(
-            rate=self.dropout_rate,
-            trainable=self.trainable,
-            name='%s-Dense-Dropout_O' % self.name,
-        )
-        self.layers[layer.name] = layer
-        super(MultiHeadAttention, self).build(input_shape)
-
-    def call(self, inputs, mask=None):
-        outputs = []
-        for i in range(self.head_num):
-            query_layer = self.layers['%s-Dense-Q_%d' % (self.name, i + 1)](inputs)
-            key_layer = self.layers['%s-Dense-K_%d' % (self.name, i + 1)](inputs)
-            value_layer = self.layers['%s-Dense-V_%d' % (self.name, i + 1)](inputs)
-            query_dropout_layer = self.layers['%s-Dense-Dropout-Q_%d' % (self.name, i + 1)](query_layer)
-            key_dropout_layer = self.layers['%s-Dense-Dropout-K_%d' % (self.name, i + 1)](key_layer)
-            value_dropout_layer = self.layers['%s-Dense-Dropout-V_%d' % (self.name, i + 1)](value_layer)
-            att_layer = self.layers['%s-Attention_%d' % (self.name, i + 1)]([
-                query_dropout_layer,
-                key_dropout_layer,
-                value_dropout_layer,
-            ])
-            dropout_layer = self.layers['%s-Attention-Dropout_%d' % (self.name, i + 1)](att_layer)
-            outputs.append(dropout_layer)
-        if self.head_num == 1:
-            concat_layer = outputs[0]
-        else:
-            concat_layer = self.layers['%s-Concat' % self.name](outputs)
-        dense_layer = self.layers['%s-Dense_O' % self.name](concat_layer)
-        dropout_layer = self.layers['%s-Dense-Dropout_O' % self.name](dense_layer)
-        return dropout_layer
diff --git a/keras_bert/layers/transformer.py b/keras_bert/layers/transformer.py
index 4a958ce..0f6511f 100644
--- a/keras_bert/layers/transformer.py
+++ b/keras_bert/layers/transformer.py
@@ -1,8 +1,9 @@
 import keras
-from .multi_head import MultiHeadAttention
+from keras_multi_head import MultiHeadAttention
 from .layer_norm import LayerNormalization
 from .feed_forward import FeedForward
 from .wrapper import Wrapper
+from ..activations import gelu
 
 
 class Transformer(Wrapper):
@@ -46,8 +47,8 @@ def compute_mask(self, inputs, input_mask=None):
     def build(self, input_shape):
         layer = MultiHeadAttention(
             head_num=self.head_num,
-            dropout_rate=self.dropout_rate,
             trainable=self.trainable,
+            kernel_activation=gelu,
             name='%s-MultiHead' % self.name,
         )
         self.layers[layer.name] = layer
diff --git a/requirements.txt b/requirements.txt
index b29715e..f5e1732 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
 tensorflow
 Keras
-keras-self-attention==0.30.0
+keras-multi-head==0.7.0
diff --git a/setup.py b/setup.py
index 628a333..48471fd 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='keras-bert',
-    version='0.11.0',
+    version='0.13.0',
     packages=find_packages(),
     url='https://github.com/CyberZHG/keras-bert',
     license='MIT',
@@ -13,6 +13,7 @@
     install_requires=[
         'numpy',
         'keras',
+        'keras-multi-head==0.7.0',
    ],
     classifiers=(
         "Programming Language :: Python :: 2.7",
diff --git a/tests/layers/test_layer_norm.py b/tests/layers/test_layer_norm.py
index 1246d21..d5e7ed7 100644
--- a/tests/layers/test_layer_norm.py
+++ b/tests/layers/test_layer_norm.py
@@ -1,7 +1,9 @@
 import unittest
 import keras
 import numpy as np
-from keras_bert.layers import LayerNormalization, MultiHeadAttention
+from keras_multi_head import MultiHeadAttention
+from keras_bert.activations import gelu
+from keras_bert.layers import LayerNormalization
 
 
 class TestLayerNorm(unittest.TestCase):
@@ -42,8 +44,8 @@ def test_fit(self):
         )
         att_layer = MultiHeadAttention(
             head_num=3,
-            dropout_rate=1e-5,
-            name='MH'
+            kernel_activation=gelu,
+            name='Multi-Head-Attentions'
         )(input_layer)
         dense_layer = keras.layers.Dense(units=3, name='Dense-1')(att_layer)
         norm_layer = LayerNormalization(
@@ -58,7 +60,7 @@ def test_fit(self):
         model.compile(
             optimizer=keras.optimizers.Adam(lr=1e-3),
             loss='mse',
-            metrics=['mse'],
+            metrics={},
         )
         model.summary()
 
diff --git a/tests/layers/test_masked.py b/tests/layers/test_masked.py
index a903b88..e78e786 100644
--- a/tests/layers/test_masked.py
+++ b/tests/layers/test_masked.py
@@ -1,5 +1,4 @@
 import unittest
-import random
 import keras
 import numpy as np
 from keras_bert.layers import get_inputs, Embeddings, Transformer, Masked
@@ -15,7 +14,7 @@ def test_sample(self):
         model.compile(
             optimizer='adam',
             loss='mse',
-            metrics=['mse'],
+            metrics={},
         )
         model.summary(line_length=120)
         model.predict([
@@ -81,84 +80,3 @@ def test_mask_result(self):
             [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
         ])
         self.assertTrue(np.allclose(expect, predicts[1]))
-
-    def test_fit(self):
-        input_layer = keras.layers.Input(
-            shape=(15,),
-            name='Input',
-        )
-        embed_layer = keras.layers.Embedding(
-            input_dim=12,
-            output_dim=24,
-            mask_zero=True,
-            name='Embedding',
-        )(input_layer)
-        rnn_layer = keras.layers.Bidirectional(
-            keras.layers.LSTM(units=100, return_sequences=True),
-            name='Bi-LSTM',
-        )(embed_layer)
-        dense_layer = keras.layers.Dense(
-            units=12,
-            activation='softmax',
-            name='Dense',
-        )(rnn_layer)
-        mask_layer = keras.layers.Input(
-            shape=(None,),
-            name='Mask',
-        )
-        masked_layer = Masked(
-            name='Masked',
-        )([dense_layer, mask_layer])
-        model = keras.models.Model(
-            inputs=[input_layer, mask_layer],
-            outputs=masked_layer,
-        )
-        model.compile(
-            optimizer=keras.optimizers.Adam(lr=1e-4),
-            loss=keras.losses.sparse_categorical_crossentropy,
-            metrics=[keras.metrics.sparse_categorical_crossentropy],
-        )
-        model.summary(line_length=150)
-
-        def _generator(batch_size=32):
-            while True:
-                inputs, masked, outputs = [], [], []
-                for _ in range(batch_size):
-                    inputs.append([])
-                    masked.append([])
-                    outputs.append([])
-                    has_mask = False
-                    for i in range(1, 11):
-                        inputs[-1].append(i)
-                        outputs[-1].append([i])
-                        if random.random() < 0.3:
-                            has_mask = True
-                            inputs[-1][-1] = 11
-                            masked[-1].append(1)
-                        else:
-                            masked[-1].append(0)
-                    if not has_mask:
-                        masked[-1][0] = 1
-                    inputs[-1] += [0] * (15 - len(inputs[-1]))
-                    masked[-1] += [0] * (15 - len(masked[-1]))
-                    outputs[-1] += [[0]] * (15 - len(outputs[-1]))
-                yield [np.asarray(inputs), np.asarray(masked)], np.asarray(outputs)
-
-        model.fit_generator(
-            generator=_generator(),
-            steps_per_epoch=1000,
-            epochs=10,
-            validation_data=_generator(),
-            validation_steps=100,
-            callbacks=[
-                keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
-            ],
-        )
-        for inputs, outputs in _generator(batch_size=32):
-            predicts = model.predict(inputs)
-            actual = np.argmax(predicts, axis=-1)
-            for i in range(32):
-                for j in range(15):
-                    if inputs[1][i][j]:
-                        self.assertEqual(j + 1, actual[i][j])
-            break
diff --git a/tests/layers/test_multi_head.py b/tests/layers/test_multi_head.py
deleted file mode 100644
index c5b8db1..0000000
--- a/tests/layers/test_multi_head.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import unittest
-import keras
-import numpy as np
-from keras_bert.layers import MultiHeadAttention
-
-
-class TestMultiHead(unittest.TestCase):
-
-    def test_sample(self):
-        input_layer = keras.layers.Input(
-            shape=(512,),
-            name='Input',
-        )
-        embed_layer = keras.layers.Embedding(
-            input_dim=12,
-            output_dim=768,
-            mask_zero=True,
-            name='Embedding',
-        )(input_layer)
-        output_layer = MultiHeadAttention(
-            head_num=12,
-            name='Multi-Head',
-        )(embed_layer)
-        model = keras.models.Model(inputs=input_layer, outputs=output_layer)
-        model.compile(
-            optimizer='adam',
-            loss='mse',
-            metrics=['mse'],
-        )
-        model.summary()
-        self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
-
-    def test_invalid_head_num(self):
-        with self.assertRaises(IndexError):
-            input_layer = keras.layers.Input(
-                shape=(2, 3),
-                name='Input',
-            )
-            MultiHeadAttention(
-                head_num=2,
-                dropout_rate=0.01,
-                name='Multi-Head',
-            )(input_layer)
-
-    def test_fit(self):
-        input_layer = keras.layers.Input(
-            shape=(2, 3),
-            name='Input',
-        )
-        att_layer = MultiHeadAttention(
-            head_num=3,
-            dropout_rate=0.01,
-            name='Multi-Head-1',
-        )(input_layer)
-        dense_layer = keras.layers.Dense(units=3, name='Dense-1')(att_layer)
-        att_layer = MultiHeadAttention(
-            head_num=3,
-            name='Multi-Head-2',
-        )(dense_layer)
-        output_layer = keras.layers.Dense(units=3, name='Dense-2')(att_layer)
-        model = keras.models.Model(inputs=input_layer, outputs=output_layer)
-        model.compile(
-            optimizer='adam',
-            loss='mse',
-            metrics=['mse'],
-        )
-        model.summary()
-
-        def _generator(batch_size=32):
-            while True:
-                inputs = np.random.random((batch_size, 2, 3))
-                outputs = np.asarray([[[0.0, -0.1, 0.2]] * 2] * batch_size)
-                yield inputs, outputs
-
-        model.fit_generator(
-            generator=_generator(),
-            steps_per_epoch=1000,
-            epochs=10,
-            validation_data=_generator(),
-            validation_steps=100,
-            callbacks=[
-                keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
-            ],
-        )
-        for inputs, _ in _generator(batch_size=3):
-            predicts = model.predict(inputs)
-            expect = np.asarray([[[0.0, -0.1, 0.2]] * 2] * 3)
-            actual = np.round(predicts, decimals=1)
-            self.assertTrue(np.allclose(expect, actual), (expect, actual))
-            break
diff --git a/tests/layers/test_transformer.py b/tests/layers/test_transformer.py
index 359a1d5..85d082d 100644
--- a/tests/layers/test_transformer.py
+++ b/tests/layers/test_transformer.py
@@ -4,7 +4,7 @@
 from keras_bert.layers import Transformer
 
 
-class TestMultiHead(unittest.TestCase):
+class TestTransformer(unittest.TestCase):
 
     def test_sample(self):
         input_layer = keras.layers.Input(
@@ -20,6 +20,7 @@ def test_sample(self):
         output_layer = Transformer(
             head_num=12,
             hidden_dim=768 * 4,
+            dropout_rate=0.001,
             name='Transformer',
         )(embed_layer)
         model = keras.models.Model(inputs=input_layer, outputs=output_layer)
diff --git a/tests/test_bert_fit.h5 b/tests/test_bert_fit.h5
index c5b9d14..f8388d7 100644
Binary files a/tests/test_bert_fit.h5 and b/tests/test_bert_fit.h5 differ
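
Note (not part of the patch): a minimal usage sketch of the attention layer after this change, mirroring the deleted test_sample from tests/layers/test_multi_head.py. MultiHeadAttention is now imported from the pinned keras-multi-head==0.7.0 package, and gelu is supplied through the kernel_activation argument used in Transformer.build above rather than being hard-wired into the removed internal layer. Shapes and layer names below are illustrative only.

import keras
from keras_multi_head import MultiHeadAttention   # replaces keras_bert.layers.MultiHeadAttention
from keras_bert.activations import gelu

# Illustrative sizes: the feature dimension must be divisible by head_num (768 / 12 = 64 per head).
inputs = keras.layers.Input(shape=(512,), name='Input')
embed = keras.layers.Embedding(input_dim=12, output_dim=768, mask_zero=True,
                               name='Embedding')(inputs)
# gelu is passed via kernel_activation, as in the updated Transformer.build and test_layer_norm.py.
output = MultiHeadAttention(head_num=12, kernel_activation=gelu,
                            name='Multi-Head')(embed)
model = keras.models.Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='mse')
model.summary()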