diff --git a/.circleci/config.yml b/.circleci/config.yml
index 812a421..4336605 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -56,6 +56,10 @@ jobs:
           source venv/bin/activate
           pytest -s --cov=tavolo tests/
           codecov
+  test-3.8:
+    <<: *test-template
+    docker:
+      - image: circleci/python:3.8
   test-3.6:
     <<: *test-template
     docker:
diff --git a/README.rst b/README.rst
index 69f308a..cbf28d5 100644
--- a/README.rst
+++ b/README.rst
@@ -6,10 +6,10 @@
 
 ------------
 
-.. image:: https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg
+.. image:: https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7%20%7C%203.8-blue.svg
     :alt: Supported Python versions
 
-.. image:: https://img.shields.io/badge/tensorflow-2.0.0--rc0-orange.svg
+.. image:: https://img.shields.io/badge/tensorflow-2.0-orange.svg
     :alt: Supported TensorFlow versions
 
 .. image:: https://codecov.io/gh/eliorc/tavolo/branch/master/graph/badge.svg
@@ -27,8 +27,7 @@ Tavolo
 | You see, the deep learning world is moving fast, and new ideas keep on coming.
 | tavolo gathers implementations of these useful ideas from the community (by contribution, from `Kaggle`_ etc.) and makes them accessible
   in a single PyPI hosted package that compliments the `tf.keras`_ module.
-|
-| *Notice: tavolo is developed for TensorFlow 2.0 (right now on pre-release), most modules will work with earlier versions but some won't (like LayerNormalization)*
+
 
 Documentation
 -------------
@@ -41,8 +40,8 @@ Showcase
 --------
 
 | tavolo's API is straightforward and adopting its modules is as easy as it gets.
-| In tavolo, you'll find implementations for basic layers like `LayerNormalization`_ to complex modules like the Transformer's
-  `MultiHeadedSelfAttention`_. You'll also find non-layer implementations that can ease development, like the `LearningRateFinder`_.
+| In tavolo, you'll find implementations for basic layers like `PositionalEncoding`_ to complex modules like the Transformer's
+  `MultiHeadedAttention`_. You'll also find non-layer implementations that can ease development, like the `LearningRateFinder`_.
 | For example, if we wanted to add head a multi-headed attention mechanism into our model and look for the optimal learning rate, it would look something like:
 
 .. code-block:: python3
@@ -52,7 +51,7 @@ Showcase
 
     model = tf.keras.Sequential([
         tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len),
-        tvl.seq2seq.MultiHeadedSelfAttention(n_heads=8),  # <--- Add self attention
+        tvl.seq2seq.MultiHeadedAttention(n_heads=8),  # <--- Add self attention
         tf.keras.layers.LSTM(n_lstm_units, return_sequences=True),
         tf.keras.layers.Dense(n_hidden_units, activation='relu'),
         tf.keras.layers.Dense(1, activation='sigmoid')])
@@ -70,8 +69,8 @@ Showcase
 
 .. _`TensorFlow`: https://www.tensorflow.org/
 .. _`Kaggle`: https://www.kaggle.com
 .. _`tf.keras`: https://www.tensorflow.org/guide/keras
-.. _`LayerNormalization`: https://tavolo.readthedocs.io/en/latest/normalization.html#layer-normalization
-.. _`MultiHeadedSelfAttention`: https://tavolo.readthedocs.io/en/latest/seq2seq.html#multi-headed-self-attention
+.. _`PositionalEncoding`: https://tavolo.readthedocs.io/en/latest/embeddings.html#module-embeddings.PositionalEncoding
+.. _`MultiHeadedAttention`: https://tavolo.readthedocs.io/en/latest/seq2seq.html#multi-headed-attention
 .. _`LearningRateFinder`: https://tavolo.readthedocs.io/en/latest/learning.html#learning-rate-finder
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f05ce5c..184baff 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -9,16 +9,13 @@ Welcome to tavolo's documentation!
 | tavolo gathers implementations of these useful ideas from the community (by contribution, from `Kaggle`_ etc.) and makes them accessible
   in a single PyPI hosted package that compliments the `tf.keras`_ module.
 
-.. warning::
-
-    tavolo is developed for TensorFlow 2.0 (right now on pre-release), most modules will work with earlier versions but some won't (like LayerNormalization)
 
 Showcase
 --------
 
 | tavolo's API is straightforward and adopting its modules is as easy as it gets.
-| In tavolo, you'll find implementations for basic layers like :ref:`layer_normalization` to complex modules like the Transformer's
-  :ref:`multi_headed_self_attention`. You'll also find non-layer implementations that can ease development, like the :ref:`learning_rate_finder`.
+| In tavolo, you'll find implementations for basic layers like :ref:`positional_encoding` to complex modules like the Transformer's
+  :ref:`multi_headed_attention`. You'll also find non-layer implementations that can ease development, like the :ref:`learning_rate_finder`.
 | For example, if we wanted to add head a multi-headed attention mechanism into our model and look for the optimal learning rate, it would look something like:
 
 .. code-block:: python3
@@ -28,7 +25,7 @@ Showcase
 
     model = tf.keras.Sequential([
         tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len),
-        tvl.seq2seq.MultiHeadedSelfAttention(n_heads=8),  # <--- Add self attention
+        tvl.seq2seq.MultiHeadedAttention(n_heads=8),  # <--- Add self attention
         tf.keras.layers.LSTM(n_lstm_units, return_sequences=True),
         tf.keras.layers.Dense(n_hidden_units, activation='relu'),
         tf.keras.layers.Dense(1, activation='sigmoid')])
@@ -59,7 +56,6 @@ Showcase
 
     embeddings
     learning
-    normalization
    seq2seq
    seq2vec
 
diff --git a/docs/source/normalization.rst b/docs/source/normalization.rst
deleted file mode 100644
index af7e234..0000000
--- a/docs/source/normalization.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Normalization
-=============
-
-Modules for applying normalization techniques
-
-.. contents:: Modules
-    :local:
-    :depth: 1
-
-
--------
-
-.. _`layer_normalization`:
-
-``LayerNormalization``
-++++++++++++++++++++++
-
-.. automodule:: normalization.LayerNormalization
diff --git a/docs/source/seq2seq.rst b/docs/source/seq2seq.rst
index 9756580..3cc17ba 100644
--- a/docs/source/seq2seq.rst
+++ b/docs/source/seq2seq.rst
@@ -10,9 +10,9 @@ Layers mapping sequences to sequences
 
 -------
 
-.. _`multi_headed_self_attention`:
+.. _`multi_headed_attention`:
 
-``MultiHeadedSelfAttention``
+``MultiHeadedAttention``
 ++++++++++++++++++++++++++++
 
-.. automodule:: seq2seq.MultiHeadedSelfAttention
+.. automodule:: seq2seq.MultiHeadedAttention
diff --git a/setup.py b/setup.py
index 713da07..7e4ca62 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup
 
-VERSION = '0.4.1'
+VERSION = '0.5.0'
 
 setup(name='tavolo',
       version=VERSION,
diff --git a/tavolo/__init__.py b/tavolo/__init__.py
index 7132553..3541648 100644
--- a/tavolo/__init__.py
+++ b/tavolo/__init__.py
@@ -1,8 +1,7 @@
 __name__ = 'tavolo'
-__version__ = '0.4.1'
+__version__ = '0.5.0'
 
 from . import embeddings
-from . import normalization
 from . import seq2vec
 from . import seq2seq
 from . import learning
diff --git a/tavolo/normalization.py b/tavolo/normalization.py
deleted file mode 100644
index 423dbc4..0000000
--- a/tavolo/normalization.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import tensorflow as tf
-
-
-class LayerNormalization(tf.keras.layers.Layer):
-    """
-    Apply layer normalization
-
-
-    Arguments
-    ---------
-
-    - `epsilon` (``float``): Small number to avoid division by zero
-    - `name` (``str``): Layer name
-
-
-    Input shape
-    -----------
-
-    Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when
-    using this layer as the first layer in a model.
-
-
-    Output shape
-    ------------
-
-    Same shape as input.
-
-
-    Examples
-    --------
-
-    .. code-block:: python3
-
-        import tensorflow as tf
-        import tavolo as tvl
-
-        model = tf.keras.Sequential([SomeLayer(),
-                                     tvl.normalization.LayerNormalization()])  # Apply layer normalization on SomeLayer's output
-
-
-    References
-    ----------
-    `Layer Normalization`_
-
-
-    .. _Layer Normalization:
-        https://arxiv.org/pdf/1607.06450
-    """
-
-    def __init__(self, epsilon: float = 1e-8,
-                 name: str = 'layer_normalization',
-                 **kwargs):
-        """
-        :param epsilon: Small number to avoid division by zero
-        :param name: Layer name
-        """
-        super().__init__(name=name, **kwargs)
-
-        self.epsilon = epsilon
-        self.beta, self.gamma = None, None
-
-    def build(self, input_shape):
-        params_shape = input_shape[-1:]
-
-        # Initialize beta and gamma
-        self.beta = self.add_variable('beta',
-                                      shape=params_shape,
-                                      initializer=tf.keras.initializers.zeros,
-                                      dtype=self.dtype)
-        self.gamma = self.add_variable('gamma',
-                                       shape=params_shape,
-                                       initializer=tf.keras.initializers.ones,
-                                       dtype=self.dtype)
-
-        super().build(input_shape)
-
-    def compute_mask(self, inputs, mask=None):
-        return mask
-
-    def call(self, inputs,
-             **kwargs) -> tf.Tensor:
-        # Calculate mean and variance
-        mean, variance = tf.nn.moments(inputs, axes=-1, keepdims=True)  # shape=(batch_size, 1)
-
-        # Normalize
-        normalized = (inputs - mean) / ((variance + self.epsilon) ** .5)  # shape=(batch_size, channels)
-
-        return self.gamma * normalized + self.beta  # shape=(batch_size, channels)
-
-    def get_config(self):
-        base_config = super().get_config()
-        base_config['epsilon'] = self.epsilon
-
-        return base_config
-
-    @classmethod
-    def from_config(cls, config: dict):
-        return cls(**config)
diff --git a/tavolo/seq2seq.py b/tavolo/seq2seq.py
index 8f03261..3b7b54b 100644
--- a/tavolo/seq2seq.py
+++ b/tavolo/seq2seq.py
@@ -3,9 +3,9 @@
 import tensorflow as tf
 
 
-class MultiHeadedSelfAttention(tf.keras.layers.Layer):
+class MultiHeadedAttention(tf.keras.layers.Layer):
     """
-    Applies (multi headed) self attention, taken from the Transformer
+    Applies (multi headed) attention, as in the Transformer
 
 
     Arguments
@@ -14,7 +14,7 @@ class MultiHeadedSelfAttention(tf.keras.layers.Layer):
     - `n_heads` (``int``): Number of attention heads
     - `n_units` (``int``): Number of units (sum of units of all heads), defaults to the last dimension of the input
     - `dropout_rate` (``float``): Rate of outputs to drop in the range [0, 1]
-    - `causality` (``bool``): Use causality (make each time point in output dependent only on previous timepoints of input)
+    - `causal` (``bool``): Use causality (make each time point in output dependent only on previous timepoints of input)
     - `name` (``str``): Layer name
 
 
@@ -42,7 +42,7 @@ class MultiHeadedSelfAttention(tf.keras.layers.Layer):
 
         model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, 8, input_length=max_sequence_length),
-                                      tvl.seq2seq.MultiHeadedSelfAttention()])
+                                      tvl.seq2seq.MultiHeadedAttention()])
 
     Apply a single headed self attention
 
 
@@ -54,7 +54,21 @@ class MultiHeadedSelfAttention(tf.keras.layers.Layer):
 
         model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, 8, input_length=max_sequence_length),
-                                      tvl.seq2seq.MultiHeadedSelfAttention(n_heads=1)])
+                                      tvl.seq2seq.MultiHeadedAttention(n_heads=1)])
+
+    .. note::
+
+        When the intention is to apply attention using a query vector (not self attention), use the optional
+        ``query`` (and ``query_mask``) argument when calling. This means that to use non-self attention,
+        you must utilize the `functional API`_ or use `model subclassing`_.
+
+
+    .. _`functional API`:
+        https://www.tensorflow.org/guide/keras/functional
+
+    .. _`model subclassing`:
+        https://www.tensorflow.org/guide/keras/custom_layers_and_models#building_models
+
 
 
     References
     ----------
@@ -69,8 +83,8 @@ def __init__(self,
                  n_heads: int = 4,
                  n_units: Optional[int] = None,
                  dropout_rate: float = 0.,
-                 causality: bool = False,
-                 name: str = 'multi_headed_self_attention',
+                 causal: bool = False,
+                 name: str = 'multi_headed_attention',
                  **kwargs):
         """
         Apply multi-headed attention
@@ -83,7 +97,7 @@ def __init__(self,
         :param n_heads: Number of attention heads
         :param n_units: Number of units (sum of units of all heads), defaults to the last dimension of the input
         :param dropout_rate: Rate of outputs to drop in the range [0, 1]
-        :param causality: Use causality (make each time point in output dependent only on previous timepoints of input)
+        :param causal: Use causality (make each time point in output dependent only on previous timepoints of input)
         :param name: Layer name
         """
 
@@ -92,7 +106,7 @@ def __init__(self,
         self.n_heads = n_heads
         self.n_units = n_units
         self.dropout_rate = dropout_rate
-        self.causality = causality
+        self.causal = causal
         self.Q = None
         self.K = None
         self.V = None
@@ -128,6 +142,9 @@ def build(self, input_shape):
                                 name='V',
                                 dtype=self.dtype)
 
+        self.attention = tf.keras.layers.Attention(use_scale=True,
+                                                   causal=self.causal)
+
         self.output_projection = tf.keras.layers.Dense(units=channels,
                                                        activation=None,
                                                        use_bias=False,
@@ -156,69 +173,38 @@ def call(self, inputs,
 
         if query is None:
             query = inputs  # Self attention
-
-            if query_mask is None:
-                query_mask = mask
+            query_mask = mask
 
         # Linear projections
         Q = self.Q(query)  # shape=(batch_size, time_steps, n_units)
         K = self.K(inputs)  # shape=(batch_size, time_steps, n_units)
         V = self.V(inputs)  # shape=(batch_size, time_steps, n_units)
 
-        # Split and concat
+        # Split and concat, for parallel execution
         Q = tf.concat(tf.split(Q, self.n_heads, axis=2), axis=0)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
         K = tf.concat(tf.split(K, self.n_heads, axis=2), axis=0)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
         V = tf.concat(tf.split(V, self.n_heads, axis=2), axis=0)  # shape=(batch_size * n_heads, time_steps, n_units_input / n_heads)
 
+        attention_mask = list()
+        if query_mask is not None:
+            query_mask = tf.tile(query_mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
+            attention_mask.append(query_mask)
+        if mask is not None:
+            mask = tf.tile(mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
+            attention_mask.append(mask)
+
         # Attention query
-        QK = tf.matmul(Q, tf.transpose(K, perm=(0, 2, 1)))  # shape=(n_heads * batch_size, time_steps, time_steps)
-
-        # Scale
-        QK /= K.get_shape().as_list()[-1] ** 0.5  # shape=(n_heads * batch_size, time_steps, time_steps)
-
-        # Optional key masking
-        # If no mask will given a mask will be created to to represent the whole sequence minus the padding
-        input_mask = mask if mask is not None else tf.sign(
-            tf.abs(tf.reduce_sum(inputs, axis=-1)))  # shape=(batch_size, time_steps)
-        input_mask = tf.tile(input_mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
-        input_mask = tf.tile(tf.expand_dims(input_mask, axis=1),
-                             multiples=(
-                                 1, tf.shape(query)[1], 1))  # shape=(batch_size * n_heads, time_steps, time_steps)
-        padding = tf.ones_like(QK) * self.very_small_value  # This will make sure the padded part won't be attended
-        QK = tf.where(tf.equal(input_mask, False), padding, QK)  # shape=(batch_size * n_heads, time_steps, time_steps)
-
-        # Causality
-        if self.causality:
-            causality_mask = tf.ones_like(QK[0, :, :])  # shape=(time_steps, time_steps)
-            causality_mask = tf.linalg.LinearOperatorLowerTriangular(
-                causality_mask).to_dense()  # shape=(time_steps, time_steps)
-            causality_mask = tf.tile(tf.expand_dims(  # shape=(batch_size * n_heads, time_steps, time_steps)
-                causality_mask, axis=0), multiples=(tf.shape(QK)[0], 1, 1))
-
-            padding = tf.ones_like(QK) * self.very_small_value
-            QK = tf.where(tf.equal(causality_mask, False), padding,
-                          QK)  # shape=(batch_size * n_heads, time_steps, time_steps)
-
-        # Create attention weights
-        alphas = tf.nn.softmax(QK)  # shape=(batch_size * n_heads, time_steps, time_steps)
-
-        # Optional query masking
-        query_mask = query_mask if query_mask is not None else tf.sign(
-            tf.abs(tf.reduce_sum(query, axis=-1)))  # shape=(batch_size, time_steps)
-        query_mask = tf.tile(query_mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
-        query_mask = tf.tile(tf.expand_dims(query_mask, axis=-1), multiples=(
-            1, 1, tf.shape(inputs)[1]))  # shape=(batch_size * n_heads, time_steps, time_steps)
-        alphas *= tf.cast(query_mask, dtype=self.dtype)  # shape=(batch_size * n_heads, time_steps, time_steps)
+        attended = self.attention([Q, V, K],
+                                  mask=attention_mask)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
 
         # Dropout
-        alphas = self.dropout(alphas, training=training)  # shape=(batch_size * n_heads, time_steps, time_steps)
+        attended = self.dropout(attended,
+                                training=training)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
 
-        # Attend and restore shape
-        outputs = tf.matmul(alphas, V)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
-        outputs = tf.concat(tf.split(outputs, self.n_heads, axis=0),
+        # Restore original shape
+        outputs = tf.concat(tf.split(attended, self.n_heads, axis=0),
                             axis=2)  # shape=(batch_size, time_steps, n_units)
 
         # Project output
@@ -231,7 +217,7 @@ def get_config(self):
         base_config['n_heads'] = self.n_heads
         base_config['n_units'] = self.n_units
         base_config['dropout_rate'] = self.dropout_rate
-        base_config['causality'] = self.causality
+        base_config['causal'] = self.causal
 
         return base_config
 
diff --git a/tests/normalization/__init__.py b/tests/normalization/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/normalization/layer_normalization_test.py b/tests/normalization/layer_normalization_test.py
deleted file mode 100644
index 2ba72b9..0000000
--- a/tests/normalization/layer_normalization_test.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import tensorflow as tf
-
-from tavolo.normalization import LayerNormalization
-
-
-def test_shapes():
-    """ Test input-output shapes """
-
-    # Inputs shape
-    input_shape_2d = (56, 10)
-    input_shape_3d = (56, 10, 30)
-
-    inputs_2d = tf.random.normal(shape=input_shape_2d)
-    inputs_3d = tf.random.normal(shape=input_shape_3d)
-
-    layer_norm_2d = LayerNormalization(name='layer_norm_2d')
-    layer_norm_3d = LayerNormalization(name='layer_norm_3d')
-
-    output_2d, output_3d = layer_norm_2d(inputs_2d), layer_norm_3d(inputs_3d)
-
-    # Assert correctness of output shapes
-    assert output_2d.shape == input_shape_2d
-    assert output_3d.shape == input_shape_3d
-
-
-def test_masking():
-    """ Test masking support """
-
-    # Input
-    input_shape_3d = (56, 10, 30)
-    inputs_3d = tf.random.normal(shape=input_shape_3d)
-    mask = tf.less(tf.reduce_sum(tf.reduce_sum(inputs_3d, axis=-1, keepdims=True), axis=-1, keepdims=True), 0)
-    masked_input = tf.where(tf.broadcast_to(mask, input_shape_3d), tf.zeros_like(inputs_3d), inputs_3d)
-
-    # Layers
-    masking_layer = tf.keras.layers.Masking(mask_value=0., input_shape=input_shape_3d[1:])
-    layer_norm_3d = LayerNormalization(name='layer_norm_3d')
-
-    result = layer_norm_3d(masking_layer(masked_input))
-
-    assert result.shape == input_shape_3d
-
-
-def test_logic():
-    """ Test logic on known input """
-
-    # Input
-    input_shape_2d = (56, 10)
-    inputs_2d = tf.ones(shape=input_shape_2d)
-
-    layer_norm_2d = LayerNormalization(name='layer_norm_2d')
-
-    # Assert output correctness
-    assert tf.reduce_sum(layer_norm_2d(inputs_2d)).numpy() == 0
-
-
-def test_serialization():
-    """ Test layer serialization (get_config, from_config) """
-
-    simple = LayerNormalization()
-    restored = LayerNormalization.from_config(simple.get_config())
-
-    assert restored.get_config() == simple.get_config()
diff --git a/tests/seq2seq/multi_headed_self_attention_test.py b/tests/seq2seq/multi_headed_attention_test.py
similarity index 50%
rename from tests/seq2seq/multi_headed_self_attention_test.py
rename to tests/seq2seq/multi_headed_attention_test.py
index 8544e94..559918c 100644
--- a/tests/seq2seq/multi_headed_self_attention_test.py
+++ b/tests/seq2seq/multi_headed_attention_test.py
@@ -1,7 +1,7 @@
 import pytest
 import tensorflow as tf
 
-from tavolo.seq2seq import MultiHeadedSelfAttention
+from tavolo.seq2seq import MultiHeadedAttention
 
 
 def test_shapes():
@@ -13,11 +13,11 @@ def test_shapes():
 
     inputs_3d = tf.random.normal(shape=input_shape_3d)
 
-    single_self_attention = MultiHeadedSelfAttention(n_heads=1,
-                                                     name='self_attention')
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=4,
-                                                           n_units=n_units_mh,
-                                                           name='mh_self_attention')
+    single_self_attention = MultiHeadedAttention(n_heads=1,
+                                                 name='self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=4,
+                                                       n_units=n_units_mh,
+                                                       name='mh_self_attention')
 
     output_single, output_mh = single_self_attention(inputs_3d), multi_headed_self_attention(inputs_3d)
 
@@ -26,21 +26,37 @@ def test_shapes():
     assert output_mh.shape == input_shape_3d
 
 
+def test_query():
+    """ Test the ability to use query separately """
+    # Inputs shape
+    input_shape_3d = (56, 10, 30)
+    n_units_mh = 128
+
+    inputs_3d = tf.random.normal(shape=input_shape_3d)
+
+    multi_headed_attention = MultiHeadedAttention(n_heads=4,
+                                                  n_units=n_units_mh,
+                                                  name='mh_attention')
+
+    output_self, output_non_self = multi_headed_attention(inputs_3d), \
+                                   multi_headed_attention(inputs_3d, query=inputs_3d)
+
+    assert tf.reduce_all(tf.math.equal(output_self, output_non_self))
+
+
 def test_masking():
     """ Test masking support """
 
     # Input
     input_shape_3d = (56, 10, 30)
     inputs_3d = tf.random.normal(shape=input_shape_3d)
-    mask = tf.less(tf.reduce_sum(tf.reduce_sum(inputs_3d, axis=-1, keepdims=True), axis=-1, keepdims=True), 0)
-    masked_input = tf.where(tf.broadcast_to(mask, input_shape_3d), tf.zeros_like(inputs_3d), inputs_3d)
+    mask = tf.less(tf.reduce_sum(tf.reduce_sum(inputs_3d, axis=-1, keepdims=True), axis=-1), 0)
 
     # Layers
-    masking_layer = tf.keras.layers.Masking(mask_value=0., input_shape=input_shape_3d[1:])
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=3,
-                                                           name='mh_self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=3,
+                                                       name='mh_self_attention')
 
-    result = multi_headed_self_attention(masking_layer(masked_input))
+    result = multi_headed_self_attention(inputs_3d, mask=mask)
 
     assert result.shape == input_shape_3d
 
@@ -54,10 +70,10 @@ def test_causality():
 
     inputs_3d = tf.random.normal(shape=input_shape_3d)
 
     # Layers
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=4,
-                                                           n_units=n_units_mh,
-                                                           causality=True,
-                                                           name='mh_self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=4,
+                                                       n_units=n_units_mh,
+                                                       causal=True,
+                                                       name='mh_self_attention')
 
     result = multi_headed_self_attention(inputs_3d)
 
@@ -74,8 +90,8 @@ def test_causality():
 
 def test_serialization():
     """ Test layer serialization (get_config, from_config) """
 
-    simple = MultiHeadedSelfAttention()
-    restored = MultiHeadedSelfAttention.from_config(simple.get_config())
+    simple = MultiHeadedAttention()
+    restored = MultiHeadedAttention.from_config(simple.get_config())
 
     assert restored.get_config() == simple.get_config()
 
@@ -89,9 +105,9 @@ def test_exceptions():
 
     inputs_3d = tf.random.normal(shape=input_shape_3d)
 
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=n_heads,
-                                                           n_units=n_units_mh,
-                                                           name='mh_self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=n_heads,
+                                                       n_units=n_units_mh,
+                                                       name='mh_self_attention')
 
     # n_units % n_heads != 0, not divisible
     with pytest.raises(ValueError) as excinfo: