diff --git a/.circleci/config.yml b/.circleci/config.yml
index 812a421..4336605 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -56,6 +56,10 @@ jobs:
           source venv/bin/activate
           pytest -s --cov=tavolo tests/
           codecov
+  test-3.8:
+    <<: *test-template
+    docker:
+      - image: circleci/python:3.8
   test-3.6:
     <<: *test-template
     docker:
diff --git a/README.rst b/README.rst
index 69f308a..cbf28d5 100644
--- a/README.rst
+++ b/README.rst
@@ -6,10 +6,10 @@
 
 ------------
 
-.. image:: https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg
+.. image:: https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7%20%7C%203.8-blue.svg
     :alt: Supported Python versions
 
-.. image:: https://img.shields.io/badge/tensorflow-2.0.0--rc0-orange.svg
+.. image:: https://img.shields.io/badge/tensorflow-2.0-orange.svg
     :alt: Supported TensorFlow versions
 
 .. image:: https://codecov.io/gh/eliorc/tavolo/branch/master/graph/badge.svg
@@ -27,8 +27,7 @@ Tavolo
 | You see, the deep learning world is moving fast, and new ideas keep on coming.
 | tavolo gathers implementations of these useful ideas from the community (by contribution, from `Kaggle`_ etc.) and makes them accessible
   in a single PyPI hosted package that compliments the `tf.keras`_ module.
-|
-| *Notice: tavolo is developed for TensorFlow 2.0 (right now on pre-release), most modules will work with earlier versions but some won't (like LayerNormalization)*
+
 
 Documentation
 -------------
@@ -41,8 +40,8 @@ Showcase
 --------
 
 | tavolo's API is straightforward and adopting its modules is as easy as it gets.
-| In tavolo, you'll find implementations for basic layers like `LayerNormalization`_ to complex modules like the Transformer's
-  `MultiHeadedSelfAttention`_. You'll also find non-layer implementations that can ease development, like the `LearningRateFinder`_.
+| In tavolo, you'll find implementations for basic layers like `PositionalEncoding`_ to complex modules like the Transformer's
+  `MultiHeadedAttention`_. You'll also find non-layer implementations that can ease development, like the `LearningRateFinder`_.
 | For example, if we wanted to add head a multi-headed attention mechanism into our model and look for the optimal learning rate, it would look something like:
 
 .. code-block:: python3
@@ -52,7 +51,7 @@ Showcase
 
     model = tf.keras.Sequential([
         tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len),
-        tvl.seq2seq.MultiHeadedSelfAttention(n_heads=8),  # <--- Add self attention
+        tvl.seq2seq.MultiHeadedAttention(n_heads=8),  # <--- Add self attention
         tf.keras.layers.LSTM(n_lstm_units, return_sequences=True),
         tf.keras.layers.Dense(n_hidden_units, activation='relu'),
         tf.keras.layers.Dense(1, activation='sigmoid')])
@@ -70,8 +69,8 @@ Showcase
 
 .. _`TensorFlow`: https://www.tensorflow.org/
 .. _`Kaggle`: https://www.kaggle.com
 .. _`tf.keras`: https://www.tensorflow.org/guide/keras
-.. _`LayerNormalization`: https://tavolo.readthedocs.io/en/latest/normalization.html#layer-normalization
-.. _`MultiHeadedSelfAttention`: https://tavolo.readthedocs.io/en/latest/seq2seq.html#multi-headed-self-attention
+.. _`PositionalEncoding`: https://tavolo.readthedocs.io/en/latest/embeddings.html#module-embeddings.PositionalEncoding
+.. _`MultiHeadedAttention`: https://tavolo.readthedocs.io/en/latest/seq2seq.html#multi-headed-attention
 .. _`LearningRateFinder`: https://tavolo.readthedocs.io/en/latest/learning.html#learning-rate-finder
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f05ce5c..184baff 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -9,16 +9,13 @@ Welcome to tavolo's documentation!
 | tavolo gathers implementations of these useful ideas from the community (by contribution, from `Kaggle`_ etc.) and makes them accessible
   in a single PyPI hosted package that compliments the `tf.keras`_ module.
 
-.. warning::
-
-    tavolo is developed for TensorFlow 2.0 (right now on pre-release), most modules will work with earlier versions but some won't (like LayerNormalization)
 
 Showcase
 --------
 
 | tavolo's API is straightforward and adopting its modules is as easy as it gets.
-| In tavolo, you'll find implementations for basic layers like :ref:`layer_normalization` to complex modules like the Transformer's
-  :ref:`multi_headed_self_attention`. You'll also find non-layer implementations that can ease development, like the :ref:`learning_rate_finder`.
+| In tavolo, you'll find implementations for basic layers like :ref:`positional_encoding` to complex modules like the Transformer's
+  :ref:`multi_headed_attention`. You'll also find non-layer implementations that can ease development, like the :ref:`learning_rate_finder`.
 | For example, if we wanted to add head a multi-headed attention mechanism into our model and look for the optimal learning rate, it would look something like:
 
 .. code-block:: python3
@@ -28,7 +25,7 @@ Showcase
 
     model = tf.keras.Sequential([
         tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len),
-        tvl.seq2seq.MultiHeadedSelfAttention(n_heads=8),  # <--- Add self attention
+        tvl.seq2seq.MultiHeadedAttention(n_heads=8),  # <--- Add self attention
         tf.keras.layers.LSTM(n_lstm_units, return_sequences=True),
         tf.keras.layers.Dense(n_hidden_units, activation='relu'),
         tf.keras.layers.Dense(1, activation='sigmoid')])
@@ -59,7 +56,6 @@ Showcase
 
     embeddings
     learning
-    normalization
    seq2seq
    seq2vec
 
diff --git a/docs/source/normalization.rst b/docs/source/normalization.rst
deleted file mode 100644
index af7e234..0000000
--- a/docs/source/normalization.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Normalization
-=============
-
-Modules for applying normalization techniques
-
-.. contents:: Modules
-    :local:
-    :depth: 1
-
-
--------
-
-.. _`layer_normalization`:
-
-``LayerNormalization``
-++++++++++++++++++++++
-
-.. automodule:: normalization.LayerNormalization
diff --git a/docs/source/seq2seq.rst b/docs/source/seq2seq.rst
index 9756580..3cc17ba 100644
--- a/docs/source/seq2seq.rst
+++ b/docs/source/seq2seq.rst
@@ -10,9 +10,9 @@ Layers mapping sequences to sequences
 
 -------
 
-.. _`multi_headed_self_attention`:
+.. _`multi_headed_attention`:
 
-``MultiHeadedSelfAttention``
+``MultiHeadedAttention``
 ++++++++++++++++++++++++++++
 
-.. automodule:: seq2seq.MultiHeadedSelfAttention
+.. automodule:: seq2seq.MultiHeadedAttention
diff --git a/setup.py b/setup.py
index 713da07..7e4ca62 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup
 
-VERSION = '0.4.1'
+VERSION = '0.5.0'
 
 setup(name='tavolo',
       version=VERSION,
diff --git a/tavolo/__init__.py b/tavolo/__init__.py
index 7132553..3541648 100644
--- a/tavolo/__init__.py
+++ b/tavolo/__init__.py
@@ -1,8 +1,7 @@
 __name__ = 'tavolo'
-__version__ = '0.4.1'
+__version__ = '0.5.0'
 
 from . import embeddings
-from . import normalization
 from . import seq2vec
 from . import seq2seq
 from . import learning
diff --git a/tavolo/normalization.py b/tavolo/normalization.py
deleted file mode 100644
index 423dbc4..0000000
--- a/tavolo/normalization.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import tensorflow as tf
-
-
-class LayerNormalization(tf.keras.layers.Layer):
-    """
-    Apply layer normalization
-
-
-    Arguments
-    ---------
-
-    - `epsilon` (``float``): Small number to avoid division by zero
-    - `name` (``str``): Layer name
-
-
-    Input shape
-    -----------
-
-    Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when
-    using this layer as the first layer in a model.
-
-
-    Output shape
-    ------------
-
-    Same shape as input.
-
-
-    Examples
-    --------
-
-    .. code-block:: python3
-
-        import tensorflow as tf
-        import tavolo as tvl
-
-        model = tf.keras.Sequential([SomeLayer(),
-                                     tvl.normalization.LayerNormalization()])  # Apply layer normalization on SomeLayer's output
-
-
-    References
-    ----------
-    `Layer Normalization`_
-
-
-    .. _Layer Normalization:
-        https://arxiv.org/pdf/1607.06450
-    """
-
-    def __init__(self, epsilon: float = 1e-8,
-                 name: str = 'layer_normalization',
-                 **kwargs):
-        """
-        :param epsilon: Small number to avoid division by zero
-        :param name: Layer name
-        """
-        super().__init__(name=name, **kwargs)
-
-        self.epsilon = epsilon
-        self.beta, self.gamma = None, None
-
-    def build(self, input_shape):
-        params_shape = input_shape[-1:]
-
-        # Initialize beta and gamma
-        self.beta = self.add_variable('beta',
-                                      shape=params_shape,
-                                      initializer=tf.keras.initializers.zeros,
-                                      dtype=self.dtype)
-        self.gamma = self.add_variable('gamma',
-                                       shape=params_shape,
-                                       initializer=tf.keras.initializers.ones,
-                                       dtype=self.dtype)
-
-        super().build(input_shape)
-
-    def compute_mask(self, inputs, mask=None):
-        return mask
-
-    def call(self, inputs,
-             **kwargs) -> tf.Tensor:
-        # Calculate mean and variance
-        mean, variance = tf.nn.moments(inputs, axes=-1, keepdims=True)  # shape=(batch_size, 1)
-
-        # Normalize
-        normalized = (inputs - mean) / ((variance + self.epsilon) ** .5)  # shape=(batch_size, channels)
-
-        return self.gamma * normalized + self.beta  # shape=(batch_size, channels)
-
-    def get_config(self):
-        base_config = super().get_config()
-        base_config['epsilon'] = self.epsilon
-
-        return base_config
-
-    @classmethod
-    def from_config(cls, config: dict):
-        return cls(**config)
diff --git a/tavolo/seq2seq.py b/tavolo/seq2seq.py
index 8f03261..3b7b54b 100644
--- a/tavolo/seq2seq.py
+++ b/tavolo/seq2seq.py
@@ -3,9 +3,9 @@
 import tensorflow as tf
 
 
-class MultiHeadedSelfAttention(tf.keras.layers.Layer):
+class MultiHeadedAttention(tf.keras.layers.Layer):
     """
-    Applies (multi headed) self attention, taken from the Transformer
+    Applies (multi headed) attention, as in the Transformer
 
 
     Arguments
@@ -14,7 +14,7 @@ class MultiHeadedSelfAttention(tf.keras.layers.Layer):
     - `n_heads` (``int``): Number of attention heads
     - `n_units` (``int``): Number of units (sum of units of all heads), defaults to the last dimension of the input
     - `dropout_rate` (``float``): Rate of outputs to drop in the range [0, 1]
-    - `causality` (``bool``): Use causality (make each time point in output dependent only on previous timepoints of input)
+    - `causal` (``bool``): Use causality (make each time point in output dependent only on previous timepoints of input)
     - `name` (``str``): Layer name
 
 
@@ -42,7 +42,7 @@ class MultiHeadedSelfAttention(tf.keras.layers.Layer):
 
         model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, 8, input_length=max_sequence_length),
-                                      tvl.seq2seq.MultiHeadedSelfAttention()])
+                                      tvl.seq2seq.MultiHeadedAttention()])
 
     Apply a single headed self attention
 
 
@@ -54,7 +54,21 @@ class MultiHeadedSelfAttention(tf.keras.layers.Layer):
 
         model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, 8, input_length=max_sequence_length),
-                                      tvl.seq2seq.MultiHeadedSelfAttention(n_heads=1)])
+                                      tvl.seq2seq.MultiHeadedAttention(n_heads=1)])
+
+    .. note::
+
+        When the intention is to apply attention using a query vector (not self attention), use the optional
+        ``query`` (and ``query_mask``) argument when calling. This means that to use non-self attention,
+        you must utilize the `functional API`_ or use `model subclassing`_.
+
+
+    .. _`functional API`:
+        https://www.tensorflow.org/guide/keras/functional
+
+    .. _`model subclassing`:
+        https://www.tensorflow.org/guide/keras/custom_layers_and_models#building_models
+
 
 
     References
     ----------
@@ -69,8 +83,8 @@ def __init__(self,
                  n_heads: int = 4,
                  n_units: Optional[int] = None,
                  dropout_rate: float = 0.,
-                 causality: bool = False,
-                 name: str = 'multi_headed_self_attention',
+                 causal: bool = False,
+                 name: str = 'multi_headed_attention',
                  **kwargs):
         """
         Apply multi-headed attention
@@ -83,7 +97,7 @@ def __init__(self,
         :param n_heads: Number of attention heads
         :param n_units: Number of units (sum of units of all heads), defaults to the last dimension of the input
         :param dropout_rate: Rate of outputs to drop in the range [0, 1]
-        :param causality: Use causality (make each time point in output dependent only on previous timepoints of input)
+        :param causal: Use causality (make each time point in output dependent only on previous timepoints of input)
         :param name: Layer name
         """
 
@@ -92,7 +106,7 @@ def __init__(self,
         self.n_heads = n_heads
         self.n_units = n_units
         self.dropout_rate = dropout_rate
-        self.causality = causality
+        self.causal = causal
         self.Q = None
         self.K = None
         self.V = None
@@ -128,6 +142,9 @@ def build(self, input_shape):
                                 name='V',
                                 dtype=self.dtype)
 
+        self.attention = tf.keras.layers.Attention(use_scale=True,
+                                                   causal=self.causal)
+
         self.output_projection = tf.keras.layers.Dense(units=channels,
                                                        activation=None,
                                                        use_bias=False,
@@ -156,69 +173,38 @@ def call(self, inputs,
 
         if query is None:
             query = inputs  # Self attention
-
-            if query_mask is None:
-                query_mask = mask
+            query_mask = mask
 
         # Linear projections
         Q = self.Q(query)  # shape=(batch_size, time_steps, n_units)
         K = self.K(inputs)  # shape=(batch_size, time_steps, n_units)
         V = self.V(inputs)  # shape=(batch_size, time_steps, n_units)
 
-        # Split and concat
+        # Split and concat, for parallel execution
         Q = tf.concat(tf.split(Q, self.n_heads, axis=2), axis=0)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
         K = tf.concat(tf.split(K, self.n_heads, axis=2), axis=0)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
         V = tf.concat(tf.split(V, self.n_heads, axis=2), axis=0)  # shape=(batch_size * n_heads, time_steps, n_units_input / n_heads)
 
+        attention_mask = list()
+        if query_mask is not None:
+            query_mask = tf.tile(query_mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
+            attention_mask.append(query_mask)
+        if mask is not None:
+            mask = tf.tile(mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
+            attention_mask.append(mask)
+
         # Attention query
-        QK = tf.matmul(Q, tf.transpose(K, perm=(0, 2, 1)))  # shape=(n_heads * batch_size, time_steps, time_steps)
-
-        # Scale
-        QK /= K.get_shape().as_list()[-1] ** 0.5  # shape=(n_heads * batch_size, time_steps, time_steps)
-
-        # Optional key masking
-        # If no mask will given a mask will be created to to represent the whole sequence minus the padding
-        input_mask = mask if mask is not None else tf.sign(
-            tf.abs(tf.reduce_sum(inputs, axis=-1)))  # shape=(batch_size, time_steps)
-        input_mask = tf.tile(input_mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
-        input_mask = tf.tile(tf.expand_dims(input_mask, axis=1),
-                             multiples=(
-                                 1, tf.shape(query)[1], 1))  # shape=(batch_size * n_heads, time_steps, time_steps)
-        padding = tf.ones_like(QK) * self.very_small_value  # This will make sure the padded part won't be attended
-        QK = tf.where(tf.equal(input_mask, False), padding, QK)  # shape=(batch_size * n_heads, time_steps, time_steps)
-
-        # Causality
-        if self.causality:
-            causality_mask = tf.ones_like(QK[0, :, :])  # shape=(time_steps, time_steps)
-            causality_mask = tf.linalg.LinearOperatorLowerTriangular(
-                causality_mask).to_dense()  # shape=(time_steps, time_steps)
-            causality_mask = tf.tile(tf.expand_dims(  # shape=(batch_size * n_heads, time_steps, time_steps)
-                causality_mask, axis=0), multiples=(tf.shape(QK)[0], 1, 1))
-
-            padding = tf.ones_like(QK) * self.very_small_value
-            QK = tf.where(tf.equal(causality_mask, False), padding,
-                          QK)  # shape=(batch_size * n_heads, time_steps, time_steps)
-
-        # Create attention weights
-        alphas = tf.nn.softmax(QK)  # shape=(batch_size * n_heads, time_steps, time_steps)
-
-        # Optional query masking
-        query_mask = query_mask if query_mask is not None else tf.sign(
-            tf.abs(tf.reduce_sum(query, axis=-1)))  # shape=(batch_size, time_steps)
-        query_mask = tf.tile(query_mask, multiples=(self.n_heads, 1))  # shape=(batch_size * n_heads, time_steps)
-        query_mask = tf.tile(tf.expand_dims(query_mask, axis=-1), multiples=(
-            1, 1, tf.shape(inputs)[1]))  # shape=(batch_size * n_heads, time_steps, time_steps)
-        alphas *= tf.cast(query_mask, dtype=self.dtype)  # shape=(batch_size * n_heads, time_steps, time_steps)
+        attended = self.attention([Q, V, K],
+                                  mask=attention_mask)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
 
         # Dropout
-        alphas = self.dropout(alphas, training=training)  # shape=(batch_size * n_heads, time_steps, time_steps)
+        attended = self.dropout(attended,
+                                training=training)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
 
-        # Attend and restore shape
-        outputs = tf.matmul(alphas, V)  # shape=(batch_size * n_heads, time_steps, n_units / n_heads)
-        outputs = tf.concat(tf.split(outputs, self.n_heads, axis=0),
+        # Restore original shape
+        outputs = tf.concat(tf.split(attended, self.n_heads, axis=0),
                             axis=2)  # shape=(batch_size, time_steps, n_units)
 
         # Project output
@@ -231,7 +217,7 @@ def get_config(self):
         base_config['n_heads'] = self.n_heads
         base_config['n_units'] = self.n_units
         base_config['dropout_rate'] = self.dropout_rate
-        base_config['causality'] = self.causality
+        base_config['causal'] = self.causal
 
         return base_config
 
diff --git a/tests/normalization/__init__.py b/tests/normalization/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/normalization/layer_normalization_test.py b/tests/normalization/layer_normalization_test.py
deleted file mode 100644
index 2ba72b9..0000000
--- a/tests/normalization/layer_normalization_test.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import tensorflow as tf
-
-from tavolo.normalization import LayerNormalization
-
-
-def test_shapes():
-    """ Test input-output shapes """
-
-    # Inputs shape
-    input_shape_2d = (56, 10)
-    input_shape_3d = (56, 10, 30)
-
-    inputs_2d = tf.random.normal(shape=input_shape_2d)
-    inputs_3d = tf.random.normal(shape=input_shape_3d)
-
-    layer_norm_2d = LayerNormalization(name='layer_norm_2d')
-    layer_norm_3d = LayerNormalization(name='layer_norm_3d')
-
-    output_2d, output_3d = layer_norm_2d(inputs_2d), layer_norm_3d(inputs_3d)
-
-    # Assert correctness of output shapes
-    assert output_2d.shape == input_shape_2d
-    assert output_3d.shape == input_shape_3d
-
-
-def test_masking():
-    """ Test masking support """
-
-    # Input
-    input_shape_3d = (56, 10, 30)
-    inputs_3d = tf.random.normal(shape=input_shape_3d)
-    mask = tf.less(tf.reduce_sum(tf.reduce_sum(inputs_3d, axis=-1, keepdims=True), axis=-1, keepdims=True), 0)
-    masked_input = tf.where(tf.broadcast_to(mask, input_shape_3d), tf.zeros_like(inputs_3d), inputs_3d)
-
-    # Layers
-    masking_layer = tf.keras.layers.Masking(mask_value=0., input_shape=input_shape_3d[1:])
-    layer_norm_3d = LayerNormalization(name='layer_norm_3d')
-
-    result = layer_norm_3d(masking_layer(masked_input))
-
-    assert result.shape == input_shape_3d
-
-
-def test_logic():
-    """ Test logic on known input """
-
-    # Input
-    input_shape_2d = (56, 10)
-    inputs_2d = tf.ones(shape=input_shape_2d)
-
-    layer_norm_2d = LayerNormalization(name='layer_norm_2d')
-
-    # Assert output correctness
-    assert tf.reduce_sum(layer_norm_2d(inputs_2d)).numpy() == 0
-
-
-def test_serialization():
-    """ Test layer serialization (get_config, from_config) """
-
-    simple = LayerNormalization()
-    restored = LayerNormalization.from_config(simple.get_config())
-
-    assert restored.get_config() == simple.get_config()
diff --git a/tests/seq2seq/multi_headed_self_attention_test.py b/tests/seq2seq/multi_headed_attention_test.py
similarity index 50%
rename from tests/seq2seq/multi_headed_self_attention_test.py
rename to tests/seq2seq/multi_headed_attention_test.py
index 8544e94..559918c 100644
--- a/tests/seq2seq/multi_headed_self_attention_test.py
+++ b/tests/seq2seq/multi_headed_attention_test.py
@@ -1,7 +1,7 @@
 import pytest
 import tensorflow as tf
 
-from tavolo.seq2seq import MultiHeadedSelfAttention
+from tavolo.seq2seq import MultiHeadedAttention
 
 
 def test_shapes():
@@ -13,11 +13,11 @@ def test_shapes():
 
     inputs_3d = tf.random.normal(shape=input_shape_3d)
 
-    single_self_attention = MultiHeadedSelfAttention(n_heads=1,
-                                                     name='self_attention')
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=4,
-                                                           n_units=n_units_mh,
-                                                           name='mh_self_attention')
+    single_self_attention = MultiHeadedAttention(n_heads=1,
+                                                 name='self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=4,
+                                                       n_units=n_units_mh,
+                                                       name='mh_self_attention')
 
     output_single, output_mh = single_self_attention(inputs_3d), multi_headed_self_attention(inputs_3d)
 
@@ -26,21 +26,37 @@ def test_shapes():
     assert output_mh.shape == input_shape_3d
 
 
+def test_query():
+    """ Test the ability to use query separately """
+    # Inputs shape
+    input_shape_3d = (56, 10, 30)
+    n_units_mh = 128
+
+    inputs_3d = tf.random.normal(shape=input_shape_3d)
+
+    multi_headed_attention = MultiHeadedAttention(n_heads=4,
+                                                  n_units=n_units_mh,
+                                                  name='mh_attention')
+
+    output_self, output_non_self = multi_headed_attention(inputs_3d), \
+                                   multi_headed_attention(inputs_3d, query=inputs_3d)
+
+    assert tf.reduce_all(tf.math.equal(output_self, output_non_self))
+
+
 def test_masking():
     """ Test masking support """
 
     # Input
     input_shape_3d = (56, 10, 30)
     inputs_3d = tf.random.normal(shape=input_shape_3d)
-    mask = tf.less(tf.reduce_sum(tf.reduce_sum(inputs_3d, axis=-1, keepdims=True), axis=-1, keepdims=True), 0)
-    masked_input = tf.where(tf.broadcast_to(mask, input_shape_3d), tf.zeros_like(inputs_3d), inputs_3d)
+    mask = tf.less(tf.reduce_sum(tf.reduce_sum(inputs_3d, axis=-1, keepdims=True), axis=-1), 0)
 
     # Layers
-    masking_layer = tf.keras.layers.Masking(mask_value=0., input_shape=input_shape_3d[1:])
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=3,
-                                                           name='mh_self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=3,
+                                                       name='mh_self_attention')
 
-    result = multi_headed_self_attention(masking_layer(masked_input))
+    result = multi_headed_self_attention(inputs_3d, mask=mask)
 
     assert result.shape == input_shape_3d
 
@@ -54,10 +70,10 @@ def test_causality():
 
     inputs_3d = tf.random.normal(shape=input_shape_3d)
 
     # Layers
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=4,
-                                                           n_units=n_units_mh,
-                                                           causality=True,
-                                                           name='mh_self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=4,
+                                                       n_units=n_units_mh,
+                                                       causal=True,
+                                                       name='mh_self_attention')
 
     result = multi_headed_self_attention(inputs_3d)
 
@@ -74,8 +90,8 @@ def test_causality():
 
 def test_serialization():
     """ Test layer serialization (get_config, from_config) """
 
-    simple = MultiHeadedSelfAttention()
-    restored = MultiHeadedSelfAttention.from_config(simple.get_config())
+    simple = MultiHeadedAttention()
+    restored = MultiHeadedAttention.from_config(simple.get_config())
 
     assert restored.get_config() == simple.get_config()
 
@@ -89,9 +105,9 @@ def test_exceptions():
 
     inputs_3d = tf.random.normal(shape=input_shape_3d)
 
-    multi_headed_self_attention = MultiHeadedSelfAttention(n_heads=n_heads,
-                                                           n_units=n_units_mh,
-                                                           name='mh_self_attention')
+    multi_headed_self_attention = MultiHeadedAttention(n_heads=n_heads,
+                                                       n_units=n_units_mh,
+                                                       name='mh_self_attention')
 
     # n_units % n_heads != 0, not divisible
     with pytest.raises(ValueError) as excinfo: