From 8bb77a1cb5ef2152dde02169e9597f8bb551c9ed Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 18 Aug 2023 15:26:39 +0000 Subject: [PATCH 01/88] v1 --- src/transformers/modeling_utils.py | 58 ++++++++++++++++ .../models/llama/modeling_llama.py | 67 +++++++++++++------ src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 11 +++ 4 files changed, 115 insertions(+), 22 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 26502481562e7b..49b2d2df5af649 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -66,6 +66,7 @@ is_accelerate_available, is_auto_gptq_available, is_bitsandbytes_available, + is_flash_attn_available, is_offline_mode, is_optimum_available, is_remote_url, @@ -1090,6 +1091,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix is_parallelizable = False supports_gradient_checkpointing = False + # Flash Attention 2 support + _supports_flash_attn_2 = False + @property def dummy_inputs(self) -> Dict[str, torch.Tensor]: """ @@ -1223,6 +1227,60 @@ def make_inputs_require_grads(module, input, output): self._require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads) + def enable_flash_attn_2(self) -> None: + """ + Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference + and training. + If you don't know about Flash Attention, check out the official repository of flash attention: + https://github.com/Dao-AILab/flash-attention + + For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a + look at this specific section of the documentation to learn more about it: + https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models + """ + if not self._supports_flash_attn_2: + raise ValueError( + "The current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to " + "request support for this architecture." + ) + + if not is_flash_attn_available(): + raise ImportError( + "Flash Attention 2.0 is not available. Please refer to the documentation of https://github.com/Dao-AILab/flash-attention for" + " installing it." + ) + + _is_bettertransformer = getattr(self, "use_bettertransformer", False) + + if _is_bettertransformer: + raise ValueError( + "Flash Attention 2 and BetterTransformer API are not compatible. Please use one API or the other." + ) + + self._enable_flash_attn_2() + self._flash_attn_2_enabled = True + + def disable_flash_attn_2(self) -> None: + """ + Disables the Flash Attention 2.0 implementation for this model for more memory efficient inference + and training. + """ + if not self._supports_flash_attn_2: + raise ValueError( + "The current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to " + "request support for this architecture." + ) + + _flash_attn_2_enabled = self._flash_attn_2_enabled + + if not _flash_attn_2_enabled: + raise ValueError( + "Flash Attention 2.0 is not enabled. Please enable it with `model.enable_flash_attn_2()`." + ) + + self._disable_flash_attn_2() + self._flash_attn_2_enabled = False + def disable_input_require_grads(self): """ Removes the `_require_grads_hook`. 
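
A rough usage sketch of the API this first commit introduces (not taken from the patch itself): with the new `enable_flash_attn_2()` / `disable_flash_attn_2()` methods on `PreTrainedModel`, switching an architecture that sets `_supports_flash_attn_2 = True` (Llama, per the diff below) over to Flash Attention 2 would look roughly as follows. The checkpoint name is a placeholder, and a CUDA GPU plus an installed `flash_attn` package are assumed; `enable_flash_attn_2()` raises otherwise, or if BetterTransformer is active.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint for a supported (Llama) architecture
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

    model.enable_flash_attn_2()   # enables the Flash Attention 2 code path in the attention layers
    inputs = tokenizer("Hello", return_tensors="pt").to("cuda")
    out = model.generate(**inputs, max_new_tokens=20)

    model.disable_flash_attn_2()  # restores the default eager attention implementation
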
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 309c3ef1de92f8..ad5697bff2d9f8 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -30,7 +30,12 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast from ...modeling_utils import PreTrainedModel -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .configuration_llama import LlamaConfig @@ -256,6 +261,9 @@ def __init__(self, config: LlamaConfig): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self._is_using_flash_attn_2 = False + self._init_rope() def _init_rope(self): @@ -332,33 +340,37 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) + if not self._is_using_flash_attn_2: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" ) - attn_weights = attn_weights + attention_mask - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, 
q_len, self.hidden_size) + else: + # TODO: here + pass if self.config.pretraining_tp > 1: attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) @@ -464,6 +476,7 @@ class LlamaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True def _init_weights(self, module): std = self.config.initializer_range @@ -480,6 +493,16 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, LlamaModel): module.gradient_checkpointing = value + def _enable_flash_attn_2(self): + for module in self.modules(): + if isinstance(module, LlamaAttention): + module.is_using_flash_attn2 = True + + def _disable_flash_attn_2(self): + for module in self.modules(): + if isinstance(module, LlamaAttention): + module.is_using_flash_attn2 = False + LLAMA_INPUTS_DOCSTRING = r""" Args: diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 83b0128fbc58d5..e838628766f438 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -113,6 +113,7 @@ is_decord_available, is_detectron2_available, is_faiss_available, + is_flash_attn_available, is_flax_available, is_ftfy_available, is_in_notebook, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 54ed4030a2b6b0..535892f83e9394 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -71,6 +71,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True) _apex_available = _is_package_available("apex") _bitsandbytes_available = _is_package_available("bitsandbytes") +_flash_attn_available = _is_package_available("flash_attn") # `importlib.metadata.version` doesn't work with `bs4` but `beautifulsoup4`. For `importlib.util.find_spec`, reversed. _bs4_available = importlib.util.find_spec("bs4") is not None _coloredlogs_available = _is_package_available("coloredlogs") @@ -515,6 +516,16 @@ def is_bitsandbytes_available(): return _bitsandbytes_available and torch.cuda.is_available() +def is_flash_attn_available(): + if not is_torch_available(): + return False + + # Let's add an extra check to see if cuda is available + import torch + + return _flash_attn_available and torch.cuda.is_available() + + def is_torchdistx_available(): return _torchdistx_available From 2e18421a99fb89fe76d79ac38d4e2df4d07ed284 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 18 Aug 2023 15:28:20 +0000 Subject: [PATCH 02/88] oops --- src/transformers/modeling_utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 49b2d2df5af649..857595e1ef8793 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1229,13 +1229,12 @@ def make_inputs_require_grads(module, input, output): def enable_flash_attn_2(self) -> None: """ - Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference - and training. + Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference and training. 
If you don't know about Flash Attention, check out the official repository of flash attention: https://github.com/Dao-AILab/flash-attention - For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a - look at this specific section of the documentation to learn more about it: + For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this + specific section of the documentation to learn more about it: https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models """ if not self._supports_flash_attn_2: @@ -1262,8 +1261,8 @@ def enable_flash_attn_2(self) -> None: def disable_flash_attn_2(self) -> None: """ - Disables the Flash Attention 2.0 implementation for this model for more memory efficient inference - and training. + Disables the Flash Attention 2.0 implementation for this model for more memory efficient inference and + training. """ if not self._supports_flash_attn_2: raise ValueError( From fe5795ec8aefe9d7d373845f86819420a9ba57f3 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 18 Aug 2023 18:27:00 +0200 Subject: [PATCH 03/88] working v1 --- src/transformers/modeling_utils.py | 6 ++++ .../models/llama/modeling_llama.py | 36 +++++++++++++------ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 857595e1ef8793..d5cbf7f73fa47a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1248,6 +1248,12 @@ def enable_flash_attn_2(self) -> None: "Flash Attention 2.0 is not available. Please refer to the documentation of https://github.com/Dao-AILab/flash-attention for" " installing it." ) + else: + is_flash_greater_than_2 = version.parse(importlib.metadata.version("flash_attn")) > version.parse("2.0.0") + if not is_flash_greater_than_2: + raise ValueError( + "You need flash_attn package version to be greater than 2.0. Make sure to have that version installed." 
+ ) _is_bettertransformer = getattr(self, "use_bettertransformer", False) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index ad5697bff2d9f8..3bf4f6813fc009 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -35,9 +35,13 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + is_flash_attn_available, ) from .configuration_llama import LlamaConfig +if is_flash_attn_available(): + from flash_attn import flash_attn_func + logger = logging.get_logger(__name__) @@ -323,12 +327,21 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + if self._is_using_flash_attn_2: + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dime x hidden_dim + # therefore we need to transpose-back the qkv states to their original shape + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + if past_key_value is not None: # reuse k, v, self_attention key_states = torch.cat([past_key_value[0], key_states], dim=2) @@ -336,11 +349,11 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - if not self._is_using_flash_attn_2: + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): @@ -365,13 +378,16 @@ def forward( f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" f" {attn_output.size()}" ) - + attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) else: - # TODO: here - pass + attn_output = flash_attn_func( + query_states, key_states, value_states, 0.0, causal=True + ) + attn_weights = None + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + if self.config.pretraining_tp > 1: attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) @@ -496,12 +512,12 @@ def _set_gradient_checkpointing(self, module, value=False): def _enable_flash_attn_2(self): for module in self.modules(): if isinstance(module, LlamaAttention): - module.is_using_flash_attn2 = True + module._is_using_flash_attn_2 = True def _disable_flash_attn_2(self): for module in self.modules(): if isinstance(module, LlamaAttention): - module.is_using_flash_attn2 = False + module._is_using_flash_attn_2 = False LLAMA_INPUTS_DOCSTRING = r""" From 4bd15e2dadb3cb62da110f5cccd3a700cc077046 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 18 Aug 2023 
16:29:23 +0000 Subject: [PATCH 04/88] fixup --- src/transformers/models/llama/modeling_llama.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 3bf4f6813fc009..4eed537651bc29 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -33,12 +33,13 @@ from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + is_flash_attn_available, logging, replace_return_docstrings, - is_flash_attn_available, ) from .configuration_llama import LlamaConfig + if is_flash_attn_available(): from flash_attn import flash_attn_func @@ -327,7 +328,6 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] @@ -378,16 +378,14 @@ def forward( f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" f" {attn_output.size()}" ) - + attn_output = attn_output.transpose(1, 2).contiguous() else: - attn_output = flash_attn_func( - query_states, key_states, value_states, 0.0, causal=True - ) + attn_output = flash_attn_func(query_states, key_states, value_states, 0.0, causal=True) attn_weights = None attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - + if self.config.pretraining_tp > 1: attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) From 49fe3185e0994d04fbfd107205cdb9da96a54460 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 18 Aug 2023 18:59:49 +0200 Subject: [PATCH 05/88] add some TODOs --- src/transformers/models/llama/modeling_llama.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 3bf4f6813fc009..38dfc151d640b6 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -41,6 +41,7 @@ if is_flash_attn_available(): from flash_attn import flash_attn_func + from flash_attn.bert_padding import unpad_input, pad_input # noqa logger = logging.get_logger(__name__) @@ -381,9 +382,16 @@ def forward( attn_output = attn_output.transpose(1, 2).contiguous() else: + # TODO: llama does not have dropout in the config?? + # It is recommended to use dropout with FA according to the docs + # when training. 
+ dropout_rate = 0.0 # if not self.training else self.attn_dropout + + # TODO: support padding using `unpad_input` and `pad_input` attn_output = flash_attn_func( - query_states, key_states, value_states, 0.0, causal=True + query_states, key_states, value_states, dropout_rate, causal=True ) + attn_weights = None attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) From 50491e83456739f14105f2251dfc40093e65e95d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 18 Aug 2023 17:03:12 +0000 Subject: [PATCH 06/88] fixup --- src/transformers/models/llama/modeling_llama.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index c160b8ff63e80e..1fec5463839df4 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -42,7 +42,7 @@ if is_flash_attn_available(): from flash_attn import flash_attn_func - from flash_attn.bert_padding import unpad_input, pad_input # noqa + from flash_attn.bert_padding import pad_input, unpad_input # noqa logger = logging.get_logger(__name__) @@ -385,12 +385,10 @@ def forward( # TODO: llama does not have dropout in the config?? # It is recommended to use dropout with FA according to the docs # when training. - dropout_rate = 0.0 # if not self.training else self.attn_dropout + dropout_rate = 0.0 # if not self.training else self.attn_dropout # TODO: support padding using `unpad_input` and `pad_input` - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout_rate, causal=True - ) + attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) attn_weights = None From 0e30d13c903cd0b44545b59e6c7b8bfdd592f862 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 23 Aug 2023 14:57:47 +0000 Subject: [PATCH 07/88] padding support + try with module replacement --- .../models/llama/modeling_llama.py | 210 +++++++++++++----- src/transformers/pytorch_utils.py | 31 +++ 2 files changed, 191 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 1fec5463839df4..18656da1bf257a 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -30,6 +30,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import reset_and_attach_new_hooks from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -41,7 +42,7 @@ if is_flash_attn_available(): - from flash_attn import flash_attn_func + from flash_attn import flash_attn_func, flash_attn_varlen_func from flash_attn.bert_padding import pad_input, unpad_input # noqa @@ -68,6 +69,59 @@ def _make_causal_mask( return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +def _convert_to_padding_mask(attention_mask: torch.Tensor, mask_value: float = 0.0): + """ + Convert causal attention mask to key-padding mask + """ + if len(attention_mask.size()) != 4: + raise ValueError( + "Expecting attention_mask to have 4 dimensions, got tensor of shape: " f"{attention_mask.size()}" + ) + + batch_size = attention_mask.size(0) + key_length = attention_mask.size(-1) + + padding_mask = torch.ones((batch_size, key_length), 
device=attention_mask.device) + + for i in range(batch_size): + mask_slice = attention_mask[i, :, -1, :] + padding_mask[i, :] = torch.all(mask_slice == mask_value, dim=0) + + return padding_mask + + +def recursively_replace_module(model, old_class, target_class): + """ + Recursively replace all old_class instances of the model with a target class. The target class should have the same + sub-module names than the old class. + + Args: + model (`torch.nn.Module`): + The model or the child module used for recursion + old_class (`class`): + The target old class to replace + target_class (`class`): + The new class that is going to be used in the replaced module. + """ + for name, module in model.named_children(): + if isinstance(module, old_class): + torch_device = module.q_proj.weight.device + with torch.device(torch_device): + new_module = target_class(module.config) + + for inner_module_name, inner_module in module.named_modules(): + setattr(new_module, inner_module_name, inner_module) + + if hasattr(module, "_hf_hook"): + reset_and_attach_new_hooks(module, new_module, module._hf_hook) + + model._modules[name] = new_module + module = None + + if module is not None and len(list(module.children())) > 0: + recursively_replace_module(module, old_class, target_class) + + # Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -268,8 +322,6 @@ def __init__(self, config: LlamaConfig): self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._is_using_flash_attn_2 = False - self._init_rope() def _init_rope(self): @@ -335,14 +387,6 @@ def forward( cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - if self._is_using_flash_attn_2: - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dime x hidden_dim - # therefore we need to transpose-back the qkv states to their original shape - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - if past_key_value is not None: # reuse k, v, self_attention key_states = torch.cat([past_key_value[0], key_states], dim=2) @@ -350,47 +394,35 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None - if not self._is_using_flash_attn_2: - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + 
attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) + attn_weights = attn_weights + attention_mask - attn_output = attn_output.transpose(1, 2).contiguous() - else: - # TODO: llama does not have dropout in the config?? - # It is recommended to use dropout with FA according to the docs - # when training. - dropout_rate = 0.0 # if not self.training else self.attn_dropout + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) - # TODO: support padding using `unpad_input` and `pad_input` - attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) - attn_weights = None + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -407,6 +439,84 @@ def forward( return attn_output, attn_weights, past_key_value +class LlamaFlashAttention(LlamaAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # LlamaFlashAttention attention does not support output_attentions + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dime x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + past_key_value = None + + # TODO: llama does not have dropout in the 
config?? + # It is recommended to use dropout with FA according to the docs + # when training. + dropout_rate = 0.0 # if not self.training else self.attn_dropout + + padding_mask = _convert_to_padding_mask(attention_mask) + + # contains at least one padding token + if padding_mask.sum().item() != bsz * kv_seq_len: + query_states, indices, current_query_length, query_max_seqlen = unpad_input(query_states, padding_mask) + key_states, _, current_key_length, key_max_seqlen = unpad_input(key_states, padding_mask) + value_states, _, _, _ = unpad_input(value_states, padding_mask) + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=current_query_length, + cu_seqlens_k=current_key_length, + max_seqlen_q=query_max_seqlen, + max_seqlen_k=key_max_seqlen, + dropout_p=0.0, + softmax_scale=None, + causal=True, + ) + + attn_output = pad_input(attn_output_unpad, indices, bsz, kv_seq_len) + else: + attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + class LlamaDecoderLayer(nn.Module): def __init__(self, config: LlamaConfig): super().__init__() @@ -516,14 +626,14 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value def _enable_flash_attn_2(self): - for module in self.modules(): - if isinstance(module, LlamaAttention): - module._is_using_flash_attn_2 = True + for _, module in self.named_children(): + if len(list(module.children())) > 0: + recursively_replace_module(module, LlamaAttention, LlamaFlashAttention) def _disable_flash_attn_2(self): - for module in self.modules(): - if isinstance(module, LlamaAttention): - module._is_using_flash_attn_2 = False + for _, module in self.named_children(): + if len(list(module.children())) > 0: + recursively_replace_module(module, LlamaFlashAttention, LlamaAttention) LLAMA_INPUTS_DOCSTRING = r""" diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index fb509fc3eab092..d9d5b57dad85c0 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -286,3 +286,34 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: non-overlapping lifetimes may have the same id. 
""" return tensor.device, storage_ptr(tensor), storage_size(tensor) + + +def reset_and_attach_new_hooks(old_module, new_module) -> None: + """ + Attach new hooks in new_module that are similar to the hook of old_module + + Args: + old_module (`torch.nn.Module`): + The old module that contains the old hook + new_module (`torch.nn.Module`): + The new module that does not contain any hook + hook (`~accelerate.hooks.AlignDeviceHook`): + The + """ + import accelerate + from accelerate.hooks import add_hook_to_module, remove_hook_from_module + + hook = old_module._hf_hook + + hook_cls = getattr(accelerate.hooks, hook.__class__.__name__) + hook_attr = hook.__dict__ + filtered_old_hook_attr = {} + old_hook_init_signature = inspect.signature(hook_cls.__init__) + for k in hook_attr.keys(): + if k in old_hook_init_signature.parameters: + filtered_old_hook_attr[k] = hook_attr[k] + + new_hook = hook_cls(**filtered_old_hook_attr) + + remove_hook_from_module(old_module) + add_hook_to_module(new_module, new_hook) From ad8b9054c31d5b7b015e406c0359a62e60ad4bda Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 23 Aug 2023 15:07:44 +0000 Subject: [PATCH 08/88] nit --- .../models/llama/modeling_llama.py | 49 ++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 18656da1bf257a..40955fde902368 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -113,7 +113,7 @@ def recursively_replace_module(model, old_class, target_class): setattr(new_module, inner_module_name, inner_module) if hasattr(module, "_hf_hook"): - reset_and_attach_new_hooks(module, new_module, module._hf_hook) + reset_and_attach_new_hooks(module, new_module) model._modules[name] = new_module module = None @@ -439,9 +439,54 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaFlashAttention(LlamaAttention): +class LlamaFlashAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" + # Copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ + def __init__(self, config: LlamaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self._init_rope() + + # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + elif scaling_type == "dynamic": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + # Copied from transformers.models.llama.modeling_llama.LlamaAttention._shape + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def forward( self, hidden_states: torch.Tensor, From 3c31f10e93e1326cdbdff0811ca6be5d09cee1d7 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 10:22:42 +0000 Subject: [PATCH 09/88] alternative design --- src/transformers/modeling_utils.py | 47 ++++++------------ .../models/llama/modeling_llama.py | 49 ++----------------- src/transformers/pytorch_utils.py | 31 ------------ 3 files changed, 20 insertions(+), 107 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 5ab1f53d866253..16890a5b8146b0 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1224,18 +1224,8 @@ def can_generate(cls) -> bool: return False return True - def enable_input_require_grads(self): - """ - Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping - the model weights fixed. - """ - - def make_inputs_require_grads(module, input, output): - output.requires_grad_(True) - - self._require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads) - - def enable_flash_attn_2(self) -> None: + @classmethod + def _check_and_enable_flash_attn_2(cls, config) -> PretrainedConfig: """ Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference and training. If you don't know about Flash Attention, check out the official repository of flash attention: @@ -1245,7 +1235,7 @@ def enable_flash_attn_2(self) -> None: specific section of the documentation to learn more about it: https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models """ - if not self._supports_flash_attn_2: + if not cls._supports_flash_attn_2: raise ValueError( "The current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to " "request support for this architecture." @@ -1263,36 +1253,25 @@ def enable_flash_attn_2(self) -> None: "You need flash_attn package version to be greater than 2.0. Make sure to have that version installed." 
) - _is_bettertransformer = getattr(self, "use_bettertransformer", False) + _is_bettertransformer = getattr(cls, "use_bettertransformer", False) if _is_bettertransformer: raise ValueError( "Flash Attention 2 and BetterTransformer API are not compatible. Please use one API or the other." ) - self._enable_flash_attn_2() - self._flash_attn_2_enabled = True + config._flash_attn_2_enabled = True - def disable_flash_attn_2(self) -> None: + def enable_input_require_grads(self): """ - Disables the Flash Attention 2.0 implementation for this model for more memory efficient inference and - training. + Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping + the model weights fixed. """ - if not self._supports_flash_attn_2: - raise ValueError( - "The current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to " - "request support for this architecture." - ) - - _flash_attn_2_enabled = self._flash_attn_2_enabled - if not _flash_attn_2_enabled: - raise ValueError( - "Flash Attention 2.0 is not enabled. Please enable it with `model.enable_flash_attn_2()`." - ) + def make_inputs_require_grads(module, input, output): + output.requires_grad_(True) - self._disable_flash_attn_2() - self._flash_attn_2_enabled = False + self._require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads) def disable_input_require_grads(self): """ @@ -2403,6 +2382,7 @@ def from_pretrained( variant = kwargs.pop("variant", None) _adapter_model_path = kwargs.pop("_adapter_model_path", None) adapter_name = kwargs.pop("adapter_name", "default") + use_flash_attn_2 = kwargs.pop("use_flash_attn_2", False) if is_fsdp_enabled(): low_cpu_mem_usage = True @@ -2996,6 +2976,9 @@ def from_pretrained( elif load_in_8bit or load_in_4bit or low_cpu_mem_usage: init_contexts.append(init_empty_weights()) + if use_flash_attn_2: + config = cls._check_and_enable_flash_attn_2(config) + with ContextManagers(init_contexts): model = cls(config, *model_args, **model_kwargs) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 40955fde902368..734be31d2db47a 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -30,7 +30,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import reset_and_attach_new_hooks from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -90,38 +89,6 @@ def _convert_to_padding_mask(attention_mask: torch.Tensor, mask_value: float = 0 return padding_mask -def recursively_replace_module(model, old_class, target_class): - """ - Recursively replace all old_class instances of the model with a target class. The target class should have the same - sub-module names than the old class. - - Args: - model (`torch.nn.Module`): - The model or the child module used for recursion - old_class (`class`): - The target old class to replace - target_class (`class`): - The new class that is going to be used in the replaced module. 
- """ - for name, module in model.named_children(): - if isinstance(module, old_class): - torch_device = module.q_proj.weight.device - with torch.device(torch_device): - new_module = target_class(module.config) - - for inner_module_name, inner_module in module.named_modules(): - setattr(new_module, inner_module_name, inner_module) - - if hasattr(module, "_hf_hook"): - reset_and_attach_new_hooks(module, new_module) - - model._modules[name] = new_module - module = None - - if module is not None and len(list(module.children())) > 0: - recursively_replace_module(module, old_class, target_class) - - # Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ @@ -566,7 +533,11 @@ class LlamaDecoderLayer(nn.Module): def __init__(self, config: LlamaConfig): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) + self.self_attn = ( + LlamaAttention(config=config) + if not getattr(config, "_flash_attn_2_enabled", False) + else LlamaFlashAttention(config=config) + ) self.mlp = LlamaMLP(config) self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -670,16 +641,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, LlamaModel): module.gradient_checkpointing = value - def _enable_flash_attn_2(self): - for _, module in self.named_children(): - if len(list(module.children())) > 0: - recursively_replace_module(module, LlamaAttention, LlamaFlashAttention) - - def _disable_flash_attn_2(self): - for _, module in self.named_children(): - if len(list(module.children())) > 0: - recursively_replace_module(module, LlamaFlashAttention, LlamaAttention) - LLAMA_INPUTS_DOCSTRING = r""" Args: diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index d9d5b57dad85c0..fb509fc3eab092 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -286,34 +286,3 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: non-overlapping lifetimes may have the same id. 
""" return tensor.device, storage_ptr(tensor), storage_size(tensor) - - -def reset_and_attach_new_hooks(old_module, new_module) -> None: - """ - Attach new hooks in new_module that are similar to the hook of old_module - - Args: - old_module (`torch.nn.Module`): - The old module that contains the old hook - new_module (`torch.nn.Module`): - The new module that does not contain any hook - hook (`~accelerate.hooks.AlignDeviceHook`): - The - """ - import accelerate - from accelerate.hooks import add_hook_to_module, remove_hook_from_module - - hook = old_module._hf_hook - - hook_cls = getattr(accelerate.hooks, hook.__class__.__name__) - hook_attr = hook.__dict__ - filtered_old_hook_attr = {} - old_hook_init_signature = inspect.signature(hook_cls.__init__) - for k in hook_attr.keys(): - if k in old_hook_init_signature.parameters: - filtered_old_hook_attr[k] = hook_attr[k] - - new_hook = hook_cls(**filtered_old_hook_attr) - - remove_hook_from_module(old_module) - add_hook_to_module(new_module, new_hook) From 2628bf3c02dddf6b51b3dea257af7eba1f474f26 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 10:26:47 +0000 Subject: [PATCH 10/88] oops --- src/transformers/modeling_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 16890a5b8146b0..a39c6c8ccafd52 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1261,6 +1261,7 @@ def _check_and_enable_flash_attn_2(cls, config) -> PretrainedConfig: ) config._flash_attn_2_enabled = True + return config def enable_input_require_grads(self): """ From 20d1b370ad7731898a68a2753d6522118a984732 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 10:46:08 +0000 Subject: [PATCH 11/88] add `use_cache` support for llama --- src/transformers/models/llama/modeling_llama.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 44a2863bcf0626..af12564881fcbf 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -495,12 +495,17 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) - past_key_value = None - # TODO: llama does not have dropout in the config?? # It is recommended to use dropout with FA according to the docs # when training. 
From a82f1ca456424fa2369f4e6d005e1e6903bce793 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:10:07 +0000 Subject: [PATCH 12/88] v1 falcon --- .../models/falcon/modeling_falcon.py | 189 +++++++++++++++++- 1 file changed, 187 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index d0b0c40c8385b1..dfef3954bafd6c 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -31,10 +31,19 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_available, + logging, +) from .configuration_falcon import FalconConfig +if is_flash_attn_available(): + from flash_attn import flash_attn_func + logger = logging.get_logger(__name__) FALCON_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -139,6 +148,28 @@ def _expand_mask(mask: torch.Tensor, past_key_values_length: int) -> torch.BoolT return expanded_mask.expand(batch_size, 1, seq_length, total_length) +# Copied from transformers.models.llama.modeling_llama._convert_to_padding_mask +def _convert_to_padding_mask(attention_mask: torch.Tensor, mask_value: float = 0.0): + """ + Convert causal attention mask to key-padding mask + """ + if len(attention_mask.size()) != 4: + raise ValueError( + "Expecting attention_mask to have 4 dimensions, got tensor of shape: " f"{attention_mask.size()}" + ) + + batch_size = attention_mask.size(0) + key_length = attention_mask.size(-1) + + padding_mask = torch.ones((batch_size, key_length), device=attention_mask.device) + + for i in range(batch_size): + mask_slice = attention_mask[i, :, -1, :] + padding_mask[i, :] = torch.all(mask_slice == mask_value, dim=0) + + return padding_mask + + def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: batch_size, seq_length = attention_mask.shape closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) @@ -395,6 +426,159 @@ def forward( return output_tensor, present +class FalconFlashAttention(nn.Module): + # Copied from transformers.models.falcon.modeling_falcon.FalconAttention.__init__ + def __init__(self, config: FalconConfig): + super().__init__() + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." 
+ ) + + self.maybe_rotary = FalconRotaryEmbedding(config.head_dim) if config.rotary else lambda q, k, t: (q, k) + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = self.inv_norm_factor + if config.new_decoder_architecture: + qkv_out_dim = (config.num_kv_heads * 2 + config.num_attention_heads) * self.head_dim + elif config.multi_query: + qkv_out_dim = self.hidden_size + 2 * self.head_dim + else: + qkv_out_dim = 3 * self.hidden_size + self.query_key_value = FalconLinear(self.hidden_size, qkv_out_dim, bias=config.bias) + self.new_decoder_architecture = config.new_decoder_architecture + self.multi_query = config.multi_query + self.dense = FalconLinear(self.hidden_size, self.hidden_size, bias=config.bias) + self.attention_dropout = nn.Dropout(config.attention_dropout) + self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 + + # Copied from transformers.models.falcon.modeling_falcon.FalconAttention._split_heads + def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv` + + Args: + fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] + value: [batch_size, seq_length, num_heads, head_dim] + """ + if self.new_decoder_architecture: + batch, seq_len, _ = fused_qkv.shape + qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv_heads + 2, self.head_dim) + query = qkv[:, :, :, :-2] + key = qkv[:, :, :, [-2]] + value = qkv[:, :, :, [-1]] + key = torch.broadcast_to(key, query.shape) + value = torch.broadcast_to(value, query.shape) + + query, key, value = [x.flatten(2, 3) for x in (query, key, value)] + return query, key, value + elif not self.multi_query: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + else: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) + return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] + + # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads + def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: + """ + Merge heads together over the last dimension + + Args: + x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] + + Returns: + torch.tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // self.num_heads + + # First view to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.permute(0, 2, 1, 3) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape(batch_size, 
seq_length, self.num_heads * self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + ): + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, query_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape( + batch_size * num_kv_heads, + query_length, + self.head_dim, + ) + value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim) + + past_kv_length = 0 if layer_past is None else layer_past[0].shape[1] + query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) + + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension: + # - key: [batch_size * self.num_heads, kv_length, head_dim] + # - value: [batch_size * self.num_heads, kv_length, head_dim] + key_layer = torch.cat((past_key, key_layer), dim=1) + value_layer = torch.cat((past_value, value_layer), dim=1) + + _, kv_length, _ = key_layer.shape + if use_cache: + present = (key_layer, value_layer) + else: + present = None + (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(query_layer.dtype) + query_layer_ = ( + query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) + ) + key_layer_ = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) + value_layer_ = ( + value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) + ) + + if alibi is not None: + raise ValueError("`alibi` is not supported when `use_flash_attn` is True") + + # below output will have shape (batch_size, seqlen, nheads, headdim) + attn_output = flash_attn_func(query_layer_, key_layer_, value_layer_, causal=True) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + output_tensor = self.dense(attn_output) + return output_tensor, present + + class FalconMLP(nn.Module): def __init__(self, config: FalconConfig): super().__init__() @@ -416,7 +600,7 @@ def __init__(self, config: FalconConfig): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.self_attention = FalconAttention(config) + self.self_attention = FalconAttention(config) if config._use_flash_attn_2 else FalconFlashAttention(config) self.mlp = FalconMLP(config) self.hidden_dropout = config.hidden_dropout self.config = config @@ -569,6 +753,7 @@ class FalconPreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["FalconDecoderLayer"] + _supports_flash_attn_2 = True def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) From c72e8ff8c227bb346d19680047d53fcdc5fdff5d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:11:32 +0000 Subject: [PATCH 13/88] nit --- src/transformers/models/falcon/modeling_falcon.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index dfef3954bafd6c..dfc11b3b4cd389 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -600,7 +600,7 @@ def __init__(self, config: FalconConfig): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.self_attention = FalconAttention(config) if config._use_flash_attn_2 else FalconFlashAttention(config) + self.self_attention = FalconAttention(config) if getattr(config, "_use_flash_attn_2", False) else FalconFlashAttention(config) self.mlp = FalconMLP(config) self.hidden_dropout = config.hidden_dropout self.config = config From 66823f92765517e601474b7ab6bd5deb71d0e523 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:25:48 +0000 Subject: [PATCH 14/88] a bit of refactor --- .../models/falcon/modeling_falcon.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index dfc11b3b4cd389..66804676f81960 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -556,27 +556,33 @@ def forward( value_layer = torch.cat((past_value, value_layer), dim=1) _, kv_length, _ = key_layer.shape + torch_dtype = query_layer.dtype if use_cache: present = (key_layer, value_layer) else: present = None - (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(query_layer.dtype) - query_layer_ = ( - query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) + (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(torch_dtype) + query_layer = ( + query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) ) - key_layer_ = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) - value_layer_ = ( - value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) + key_layer = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) + value_layer = ( + value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) ) if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") # below output will have shape (batch_size, seqlen, nheads, headdim) - attn_output = flash_attn_func(query_layer_, key_layer_, value_layer_, causal=True) - attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) - output_tensor = self.dense(attn_output) - return output_tensor, present + attn_weights = flash_attn_func(query_layer, key_layer, value_layer, causal=True) + attn_weights = attn_weights.reshape(batch_size, query_length, self.num_heads * self.head_dim) + + attn_output = self.dense(attn_weights) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, present class FalconMLP(nn.Module): From 41f8f3d55ee466a5e97e985eafae5f9236ec9c64 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:27:51 +0000 Subject: [PATCH 15/88] nit --- src/transformers/models/falcon/modeling_falcon.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 66804676f81960..c608ce07a61385 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -556,11 +556,11 @@ def forward( value_layer = torch.cat((past_value, value_layer), dim=1) _, kv_length, _ = key_layer.shape + torch_dtype = query_layer.dtype - if use_cache: - present = (key_layer, value_layer) - else: - present = None + + past_key_value = (key_layer, value_layer) if use_cache else None + (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(torch_dtype) query_layer = ( query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) @@ -576,13 +576,13 @@ def forward( # below output will have shape (batch_size, seqlen, nheads, headdim) attn_weights = flash_attn_func(query_layer, key_layer, value_layer, causal=True) attn_weights = attn_weights.reshape(batch_size, query_length, self.num_heads * self.head_dim) - + attn_output = self.dense(attn_weights) if not output_attentions: attn_weights = None - return attn_output, attn_weights, present + return attn_output, attn_weights, past_key_value class FalconMLP(nn.Module): From a64a1a9320b60424b7608da01719d1cc045f0ddc Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:29:49 +0000 Subject: [PATCH 16/88] nits nits --- src/transformers/models/falcon/modeling_falcon.py | 2 +- src/transformers/models/llama/modeling_llama.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index c608ce07a61385..5b9c411f2969c0 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -582,7 +582,7 @@ def forward( if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return attn_output, past_key_value, attn_weights class FalconMLP(nn.Module): diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index af12564881fcbf..78d819ded8b455 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -491,6 +491,10 @@ def forward( value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) From 67e3fc296b1431da750a3ef3093b03e4549fec9f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:36:13 +0000 Subject: [PATCH 17/88] add v1 padding support falcon (even though it seemed to work before) --- .../models/falcon/modeling_falcon.py | 33 ++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 5b9c411f2969c0..a6f6422b00325b 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -42,7 +42,8 @@ if is_flash_attn_available(): - from flash_attn import flash_attn_func + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import pad_input, 
unpad_input # noqa logger = logging.get_logger(__name__) @@ -555,7 +556,7 @@ def forward( key_layer = torch.cat((past_key, key_layer), dim=1) value_layer = torch.cat((past_value, value_layer), dim=1) - _, kv_length, _ = key_layer.shape + bsz, kv_seq_length, _ = key_layer.shape torch_dtype = query_layer.dtype @@ -573,10 +574,32 @@ def forward( if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") - # below output will have shape (batch_size, seqlen, nheads, headdim) - attn_weights = flash_attn_func(query_layer, key_layer, value_layer, causal=True) - attn_weights = attn_weights.reshape(batch_size, query_length, self.num_heads * self.head_dim) + padding_mask = _convert_to_padding_mask(attention_mask, mask_value=float("-inf")) + + # contains at least one padding token + if padding_mask.sum().item() != bsz * kv_seq_length: + query_states, indices, current_query_length, query_max_seqlen = unpad_input(query_states, padding_mask) + key_states, _, current_key_length, key_max_seqlen = unpad_input(key_states, padding_mask) + value_states, _, _, _ = unpad_input(value_states, padding_mask) + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=current_query_length, + cu_seqlens_k=current_key_length, + max_seqlen_q=query_max_seqlen, + max_seqlen_k=key_max_seqlen, + dropout_p=0.0, + softmax_scale=None, + causal=True, + ) + attn_output = pad_input(attn_output_unpad, indices, bsz, kv_seq_length) + else: + attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) + + attn_weights = attn_weights.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) if not output_attentions: From 8444ab66b83f0df630f64bf6d0b35235029195e3 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:37:52 +0000 Subject: [PATCH 18/88] nit --- src/transformers/models/falcon/modeling_falcon.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index a6f6422b00325b..dea61da2587d68 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -578,14 +578,14 @@ def forward( # contains at least one padding token if padding_mask.sum().item() != bsz * kv_seq_length: - query_states, indices, current_query_length, query_max_seqlen = unpad_input(query_states, padding_mask) - key_states, _, current_key_length, key_max_seqlen = unpad_input(key_states, padding_mask) - value_states, _, _, _ = unpad_input(value_states, padding_mask) + query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, padding_mask) + key_layer, _, current_key_length, key_max_seqlen = unpad_input(key_layer, padding_mask) + value_layer, _, _, _ = unpad_input(value_layer, padding_mask) attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, + query_layer, + key_layer, + value_layer, cu_seqlens_q=current_query_length, cu_seqlens_k=current_key_length, max_seqlen_q=query_max_seqlen, @@ -597,7 +597,7 @@ def forward( attn_output = pad_input(attn_output_unpad, indices, bsz, kv_seq_length) else: - attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) + attn_output = flash_attn_func(query_layer, key_layer, value_layer, dropout_rate, causal=True) attn_weights = attn_weights.reshape(batch_size, 
query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) From 8b1c2df2b90ebf98f5ff8062527ba857fb181f3e Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 13:59:22 +0000 Subject: [PATCH 19/88] falcon works --- .../models/falcon/modeling_falcon.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index dea61da2587d68..32ba9daf568a69 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -548,7 +548,7 @@ def forward( past_kv_length = 0 if layer_past is None else layer_past[0].shape[1] query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) - if layer_past is not None: + if layer_past is not None and use_cache: past_key, past_value = layer_past # concatenate along seq_length dimension: # - key: [batch_size * self.num_heads, kv_length, head_dim] @@ -562,7 +562,7 @@ def forward( past_key_value = (key_layer, value_layer) if use_cache else None - (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(torch_dtype) + # (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(torch_dtype) query_layer = ( query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) ) @@ -574,11 +574,18 @@ def forward( if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") - padding_mask = _convert_to_padding_mask(attention_mask, mask_value=float("-inf")) + padding_mask = _convert_to_padding_mask(attention_mask * 1.0, mask_value=0) + + _, q_len, _, _ = query_layer.shape + + if use_cache: + query_padding_mask = padding_mask[:, -q_len:] + else: + query_padding_mask = padding_mask # contains at least one padding token if padding_mask.sum().item() != bsz * kv_seq_length: - query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, padding_mask) + query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, query_padding_mask) key_layer, _, current_key_length, key_max_seqlen = unpad_input(key_layer, padding_mask) value_layer, _, _, _ = unpad_input(value_layer, padding_mask) @@ -595,11 +602,11 @@ def forward( causal=True, ) - attn_output = pad_input(attn_output_unpad, indices, bsz, kv_seq_length) + attn_output = pad_input(attn_output_unpad, indices, bsz, q_len) else: - attn_output = flash_attn_func(query_layer, key_layer, value_layer, dropout_rate, causal=True) + attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, causal=True) - attn_weights = attn_weights.reshape(batch_size, query_length, self.num_heads * self.head_dim) + attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) if not output_attentions: From c3ebcd2ed3d834f4f44438a9f350c4e4d1e01d5a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 14:00:07 +0000 Subject: [PATCH 20/88] fixup --- src/transformers/models/falcon/modeling_falcon.py | 14 +++++++------- src/transformers/models/llama/modeling_llama.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 32ba9daf568a69..ef2886f190c0d3 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ 
-559,7 +559,7 @@ def forward( bsz, kv_seq_length, _ = key_layer.shape torch_dtype = query_layer.dtype - + past_key_value = (key_layer, value_layer) if use_cache else None # (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(torch_dtype) @@ -567,9 +567,7 @@ def forward( query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) ) key_layer = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) - value_layer = ( - value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) - ) + value_layer = value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") @@ -585,7 +583,7 @@ def forward( # contains at least one padding token if padding_mask.sum().item() != bsz * kv_seq_length: - query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, query_padding_mask) + query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, query_padding_mask) key_layer, _, current_key_length, key_max_seqlen = unpad_input(key_layer, padding_mask) value_layer, _, _, _ = unpad_input(value_layer, padding_mask) @@ -608,7 +606,7 @@ def forward( attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) - + if not output_attentions: attn_weights = None @@ -636,7 +634,9 @@ def __init__(self, config: FalconConfig): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.self_attention = FalconAttention(config) if getattr(config, "_use_flash_attn_2", False) else FalconFlashAttention(config) + self.self_attention = ( + FalconAttention(config) if getattr(config, "_use_flash_attn_2", False) else FalconFlashAttention(config) + ) self.mlp = FalconMLP(config) self.hidden_dropout = config.hidden_dropout self.config = config diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 78d819ded8b455..2488b205495cd6 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -493,7 +493,7 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) From 1c212d8d84cb526915d0ca4edcd493bb410f767a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 14:16:17 +0000 Subject: [PATCH 21/88] v1 tests --- src/transformers/testing_utils.py | 11 +++++++++++ tests/test_modeling_common.py | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 85b947d706aa4a..87f4918c71b08b 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -60,6 +60,7 @@ is_detectron2_available, is_essentia_available, is_faiss_available, + is_flash_attn_available, is_flax_available, is_ftfy_available, is_ipex_available, @@ -373,6 +374,16 @@ def require_torch(test_case): return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case) +def require_flash_attn(test_case): + """ + Decorator marking a test that requires Flash Attention. + + These tests are skipped when PyTorch isn't installed. 
+ + """ + return unittest.skipUnless(is_flash_attn_available(), "test requires Flash Attention")(test_case) + + def require_peft(test_case): """ Decorator marking a test that requires PEFT. diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index eed704d3bca287..1beae6aa46d7cb 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -64,6 +64,7 @@ is_pt_flax_cross_test, is_pt_tf_cross_test, require_accelerate, + require_flash_attn, require_safetensors, require_torch, require_torch_gpu, @@ -2735,6 +2736,29 @@ def test_model_is_small(self): num_params < 1000000 ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_conversion(self): + import torch + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True).to( + torch_device + ) + + for _, module in model.named_modules(): + if "FlashAttention" in module.__class__.__name__: + return + + self.assertTrue(False, "FlashAttention2 modules not found in model") + global_rng = random.Random() From 461870112a256eee15b62e03699b49b6d037a117 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 14:18:30 +0000 Subject: [PATCH 22/88] nit --- tests/test_modeling_common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 1beae6aa46d7cb..90f8bc3d223607 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2745,6 +2745,9 @@ def test_flash_attn_2_conversion(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + return + model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: From 85ec94675f041de53fc2e0ef1662b3f4fc5010b3 Mon Sep 17 00:00:00 2001 From: Felix Marty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 1 Sep 2023 14:19:17 +0000 Subject: [PATCH 23/88] fix generation llama flash --- src/transformers/models/llama/modeling_llama.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 2488b205495cd6..8095c58212f427 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -491,7 +491,6 @@ def forward( value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] - if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] @@ -519,10 +518,15 @@ def forward( # contains at least one padding token if padding_mask.sum().item() != bsz * kv_seq_len: - query_states, indices, current_query_length, query_max_seqlen = unpad_input(query_states, padding_mask) key_states, _, current_key_length, key_max_seqlen = unpad_input(key_states, padding_mask) value_states, _, _, _ = unpad_input(value_states, padding_mask) + # This assumes padding_side = "left" during generation with use_cache=True. 
+ if use_cache: + padding_mask = padding_mask[:, -q_len:] + + query_states, indices, current_query_length, query_max_seqlen = unpad_input(query_states, padding_mask) + attn_output_unpad = flash_attn_varlen_func( query_states, key_states, @@ -536,7 +540,7 @@ def forward( causal=True, ) - attn_output = pad_input(attn_output_unpad, indices, bsz, kv_seq_len) + attn_output = pad_input(attn_output_unpad, indices, bsz, q_len) else: attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) From 0881ced15a0174f5560c9a124b4237dd5fe4aca6 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 14:55:57 +0000 Subject: [PATCH 24/88] update tests --- tests/test_modeling_common.py | 43 +++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 90f8bc3d223607..907d06fa06e206 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2761,6 +2761,49 @@ def test_flash_attn_2_conversion(self): return self.assertTrue(False, "FlashAttention2 modules not found in model") + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_inference(self): + import torch + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + return + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True).to( + torch_device + ) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False).to( + torch_device + ) + + dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[0, 1, 1]]).to(torch_device) + + logits = model(dummy_input).last_hidden_state + logits_fa = model_fa(dummy_input).last_hidden_state + + + self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) + + logits_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask).last_hidden_state + logits = model(dummy_input, attention_mask=dummy_attention_mask).last_hidden_state + + self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) + + + + + + global_rng = random.Random() From 2be3e03feb28dec81765b651c58353156a95607d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 15:21:31 +0000 Subject: [PATCH 25/88] fix tests + nits --- .../models/falcon/modeling_falcon.py | 23 ++++++++------ tests/test_modeling_common.py | 30 ++++++++----------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index ef2886f190c0d3..383eee20410c0a 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -556,7 +556,7 @@ def forward( key_layer = torch.cat((past_key, key_layer), dim=1) value_layer = torch.cat((past_value, value_layer), dim=1) - bsz, kv_seq_length, _ = key_layer.shape + _, kv_seq_length, _ = key_layer.shape torch_dtype = query_layer.dtype @@ -574,15 +574,15 @@ def forward( padding_mask = _convert_to_padding_mask(attention_mask * 1.0, mask_value=0) - _, q_len, _, _ = query_layer.shape + # contains at least one padding token + if padding_mask.sum().item() != batch_size * kv_seq_length: + _, q_len, _, _ = 
query_layer.shape - if use_cache: - query_padding_mask = padding_mask[:, -q_len:] - else: - query_padding_mask = padding_mask + if use_cache: + query_padding_mask = padding_mask[:, -q_len:] + else: + query_padding_mask = padding_mask - # contains at least one padding token - if padding_mask.sum().item() != bsz * kv_seq_length: query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, query_padding_mask) key_layer, _, current_key_length, key_max_seqlen = unpad_input(key_layer, padding_mask) value_layer, _, _, _ = unpad_input(value_layer, padding_mask) @@ -600,10 +600,15 @@ def forward( causal=True, ) - attn_output = pad_input(attn_output_unpad, indices, bsz, q_len) + attn_output = pad_input(attn_output_unpad, indices, batch_size, q_len) else: attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, causal=True) + # print(batch_size, query_length, self.num_heads * self.head_dim) + # print(attn_output.shape) + # import pdb + + # pdb.set_trace() attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 907d06fa06e206..cb7c04ded89779 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2761,26 +2761,25 @@ def test_flash_attn_2_conversion(self): return self.assertTrue(False, "FlashAttention2 modules not found in model") - + @require_flash_attn @require_torch_gpu @mark.flash_attn_test def test_flash_attn_2_inference(self): import torch - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: return + config, _ = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True).to( - torch_device - ) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True + ).to(torch_device) model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False).to( torch_device ) @@ -2788,22 +2787,19 @@ def test_flash_attn_2_inference(self): dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) dummy_attention_mask = torch.LongTensor([[0, 1, 1]]).to(torch_device) - logits = model(dummy_input).last_hidden_state - logits_fa = model_fa(dummy_input).last_hidden_state - + logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] + logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) - logits_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask).last_hidden_state - logits = model(dummy_input, attention_mask=dummy_attention_mask).last_hidden_state + logits_fa = model_fa( + dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True + ).hidden_states[-1] + logits = model( + dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True + ).hidden_states[-1] self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) - - - - - - global_rng = random.Random() From b6d3e58282526f5b0890002743b6da26354a5922 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 15:22:29 +0000 Subject: [PATCH 26/88] fix copies --- 
src/transformers/models/llama/modeling_llama.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 8095c58212f427..ac3a7be4de6822 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -430,6 +430,7 @@ def __init__(self, config: LlamaConfig): self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -446,17 +447,27 @@ def __init__(self, config: LlamaConfig): # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope def _init_rope(self): if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + self.rotary_emb = LlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) else: scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling["factor"] if scaling_type == "linear": self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, ) elif scaling_type == "dynamic": self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, ) else: raise ValueError(f"Unknown RoPE scaling type {scaling_type}") From b47e85c70d44d42e265c735d39bc0825275cc7fd Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 15:31:54 +0000 Subject: [PATCH 27/88] fix nit --- src/transformers/models/falcon/modeling_falcon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 383eee20410c0a..af6a5db02d19d2 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -640,7 +640,7 @@ def __init__(self, config: FalconConfig): hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.self_attention = ( - FalconAttention(config) if getattr(config, "_use_flash_attn_2", False) else FalconFlashAttention(config) + FalconAttention(config) if not getattr(config, "_use_flash_attn_2", False) else FalconFlashAttention(config) ) self.mlp = FalconMLP(config) self.hidden_dropout = config.hidden_dropout From db8bd6407c122e59c285571c774aa8e14f6143fb Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 1 Sep 2023 16:24:54 +0000 Subject: [PATCH 28/88] test- padding mask --- .../models/llama/modeling_llama.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index ac3a7be4de6822..0d405aa0cdc820 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ 
-330,6 +330,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -484,6 +485,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # LlamaFlashAttention attention does not support output_attentions output_attentions = False @@ -525,10 +527,8 @@ def forward( # when training. dropout_rate = 0.0 # if not self.training else self.attn_dropout - padding_mask = _convert_to_padding_mask(attention_mask) - # contains at least one padding token - if padding_mask.sum().item() != bsz * kv_seq_len: + if padding_mask is not None: key_states, _, current_key_length, key_max_seqlen = unpad_input(key_states, padding_mask) value_states, _, _, _ = unpad_input(value_states, padding_mask) @@ -585,6 +585,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -612,6 +613,7 @@ def forward( past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, + padding_mask=padding_mask, ) hidden_states = residual + hidden_states @@ -850,6 +852,13 @@ def forward( attention_mask = torch.ones( (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device ) + padding_mask = None + else: + if 0 in attention_mask: + padding_mask = attention_mask + else: + padding_mask = None + attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) @@ -884,10 +893,7 @@ def custom_forward(*inputs): return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, + create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids, padding_mask ) else: layer_outputs = decoder_layer( @@ -897,6 +903,7 @@ def custom_forward(*inputs): past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, + padding_mask=padding_mask, ) hidden_states = layer_outputs[0] From 58848ab3cb47b172d096968f8f3244968dc2f161 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 10:01:09 +0000 Subject: [PATCH 29/88] stype --- src/transformers/models/falcon/modeling_falcon.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index af6a5db02d19d2..1a9a6f9dc5ed0d 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -640,7 +640,9 @@ def __init__(self, config: FalconConfig): hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.self_attention = ( - FalconAttention(config) if not getattr(config, "_use_flash_attn_2", False) else FalconFlashAttention(config) + FalconAttention(config) + if not getattr(config, "_use_flash_attn_2", False) + else FalconFlashAttention(config) ) self.mlp = FalconMLP(config) 
self.hidden_dropout = config.hidden_dropout From 3f73557df6cdd410f9f0d0059dc44827b49f31a3 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 14:53:15 +0000 Subject: [PATCH 30/88] add more mem efficient support --- .../models/falcon/modeling_falcon.py | 45 +++++-------------- .../models/llama/modeling_llama.py | 21 --------- 2 files changed, 10 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 1a9a6f9dc5ed0d..e7c973111ae3f2 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -149,28 +149,6 @@ def _expand_mask(mask: torch.Tensor, past_key_values_length: int) -> torch.BoolT return expanded_mask.expand(batch_size, 1, seq_length, total_length) -# Copied from transformers.models.llama.modeling_llama._convert_to_padding_mask -def _convert_to_padding_mask(attention_mask: torch.Tensor, mask_value: float = 0.0): - """ - Convert causal attention mask to key-padding mask - """ - if len(attention_mask.size()) != 4: - raise ValueError( - "Expecting attention_mask to have 4 dimensions, got tensor of shape: " f"{attention_mask.size()}" - ) - - batch_size = attention_mask.size(0) - key_length = attention_mask.size(-1) - - padding_mask = torch.ones((batch_size, key_length), device=attention_mask.device) - - for i in range(batch_size): - mask_slice = attention_mask[i, :, -1, :] - padding_mask[i, :] = torch.all(mask_slice == mask_value, dim=0) - - return padding_mask - - def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: batch_size, seq_length = attention_mask.shape closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) @@ -319,6 +297,7 @@ def forward( head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, + padding_mask: Optional[torch.LongTensor] = None, ): fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads @@ -529,6 +508,7 @@ def forward( head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, + padding_mask: Optional[torch.LongTensor] = None, ): fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads @@ -572,12 +552,11 @@ def forward( if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") - padding_mask = _convert_to_padding_mask(attention_mask * 1.0, mask_value=0) - # contains at least one padding token - if padding_mask.sum().item() != batch_size * kv_seq_length: + if padding_mask is not None: _, q_len, _, _ = query_layer.shape + # This assumes it uses tokenizer.padding_side == 'left' if use_cache: query_padding_mask = padding_mask[:, -q_len:] else: @@ -604,11 +583,6 @@ def forward( else: attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, causal=True) - # print(batch_size, query_length, self.num_heads * self.head_dim) - # print(attn_output.shape) - # import pdb - - # pdb.set_trace() attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) @@ -667,6 +641,7 @@ def forward( head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, + padding_mask: 
Optional[torch.LongTensor] = None, ): residual = hidden_states @@ -685,6 +660,7 @@ def forward( head_mask=head_mask, use_cache=use_cache, output_attentions=output_attentions, + padding_mask=padding_mask, ) attention_output = attn_outputs[0] @@ -982,8 +958,10 @@ def forward( past_key_values_length = past_key_values[0][0].shape[1] # 1 because RW-cache, not standard format if attention_mask is None: attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=hidden_states.device) + padding_mask = None else: attention_mask = attention_mask.to(hidden_states.device) + padding_mask = attention_mask if self.use_alibi: alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) @@ -1015,11 +993,7 @@ def custom_forward(*inputs): return custom_forward outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - alibi, - causal_mask, - head_mask[i], + create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i], padding_mask ) else: outputs = block( @@ -1030,6 +1004,7 @@ def custom_forward(*inputs): use_cache=use_cache, output_attentions=output_attentions, alibi=alibi, + padding_mask=padding_mask, ) hidden_states = outputs[0] diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 0d405aa0cdc820..413e9d85940b4e 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -68,27 +68,6 @@ def _make_causal_mask( return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) -def _convert_to_padding_mask(attention_mask: torch.Tensor, mask_value: float = 0.0): - """ - Convert causal attention mask to key-padding mask - """ - if len(attention_mask.size()) != 4: - raise ValueError( - "Expecting attention_mask to have 4 dimensions, got tensor of shape: " f"{attention_mask.size()}" - ) - - batch_size = attention_mask.size(0) - key_length = attention_mask.size(-1) - - padding_mask = torch.ones((batch_size, key_length), device=attention_mask.device) - - for i in range(batch_size): - mask_slice = attention_mask[i, :, -1, :] - padding_mask[i, :] = torch.all(mask_slice == mask_value, dim=0) - - return padding_mask - - # Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ From baae73636b81181098e7655fcd800ede1e3c1437 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 4 Sep 2023 17:10:29 +0200 Subject: [PATCH 31/88] Update src/transformers/modeling_utils.py Co-authored-by: Patrick von Platen --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 597d663f080dda..34fead16c8be36 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1260,7 +1260,7 @@ def _check_and_enable_flash_attn_2(cls, config) -> PretrainedConfig: if _is_bettertransformer: raise ValueError( - "Flash Attention 2 and BetterTransformer API are not compatible. Please use one API or the other." + "Flash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing ..." 
) config._flash_attn_2_enabled = True From 55f61406634a6db2df8e129a2e5d05a2bef33886 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 15:22:37 +0000 Subject: [PATCH 32/88] fixup --- .../models/deprecated/open_llama/modeling_open_llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 8469b86eb9d53e..cc6aec611ecd42 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -373,6 +373,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -400,6 +401,7 @@ def forward( past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, + padding_mask=padding_mask, ) hidden_states = residual + hidden_states From 3fb221a2c559c986b9819acfeb639cefa32462de Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 15:31:46 +0000 Subject: [PATCH 33/88] nit --- src/transformers/models/falcon/modeling_falcon.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 521096a9e670c4..0461dfbc4a2fae 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1054,7 +1054,11 @@ def forward( padding_mask = None else: attention_mask = attention_mask.to(hidden_states.device) - padding_mask = attention_mask + + if 0 in attention_mask: + padding_mask = attention_mask + else: + padding_mask = None if self.use_alibi: alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) From a931aeb9a4315b54a00c1833b5ebccae9a1547a6 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 15:35:48 +0000 Subject: [PATCH 34/88] fixup --- src/transformers/models/falcon/modeling_falcon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 0461dfbc4a2fae..0861cb70afe560 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -504,6 +504,7 @@ class FalconFlashAttention(nn.Module): def __init__(self, config: FalconConfig): super().__init__() + self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads @@ -516,7 +517,7 @@ def __init__(self, config: FalconConfig): f" {self.num_heads})." 
) - self.maybe_rotary = FalconRotaryEmbedding(config.head_dim) if config.rotary else lambda q, k, t: (q, k) + self.maybe_rotary = self._init_rope() if config.rotary else lambda q, k, t: (q, k) # Layer-wise attention scaling self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) From 68a12040f7a7dee7149b7f74284624ebec62e0c3 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 15:49:21 +0000 Subject: [PATCH 35/88] remove it from config when saving --- src/transformers/configuration_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 00f9b5610e6bab..7522ce362ba87b 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -156,7 +156,7 @@ class PretrainedConfig(PushToHubMixin): means no penalty. length_penalty (`float`, *optional*, defaults to 1): Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to - the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + the sequence length, which in turn is used to divide the of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences. no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the @@ -855,6 +855,9 @@ def to_diff_dict(self) -> Dict[str, Any]: self.dict_torch_dtype_to_str(serializable_config_dict) + if "_use_flash_attn_2" in serializable_config_dict: + del serializable_config_dict["_use_flash_attn_2"] + return serializable_config_dict def to_dict(self) -> Dict[str, Any]: @@ -871,6 +874,8 @@ def to_dict(self) -> Dict[str, Any]: del output["_auto_class"] if "_commit_hash" in output: del output["_commit_hash"] + if "_use_flash_attn_2" in output: + del output["_use_flash_attn_2"] # Transformers version when serializing the model output["transformers_version"] = __version__ From 36e0d6e0e3c7bcfd2f704dee5d4b17c9f83141c2 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 15:55:33 +0000 Subject: [PATCH 36/88] fixup --- src/transformers/configuration_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 7522ce362ba87b..ed8851d4a77190 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -156,7 +156,7 @@ class PretrainedConfig(PushToHubMixin): means no penalty. length_penalty (`float`, *optional*, defaults to 1): Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to - the sequence length, which in turn is used to divide the of the sequence. Since the score is the log + the sequence length, which in turn is used to divide the of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences. 
no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the From 2beeb6809be8b898ca0e66bbc0eccb55204cc9df Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 17:13:42 +0000 Subject: [PATCH 37/88] revert docstring --- src/transformers/configuration_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index ed8851d4a77190..c565982cff656e 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -156,7 +156,7 @@ class PretrainedConfig(PushToHubMixin): means no penalty. length_penalty (`float`, *optional*, defaults to 1): Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to - the sequence length, which in turn is used to divide the of the sequence. Since the score is the log + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences. no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the From 7b5da2c54ae5a0b67809b426cd4f826e2125e512 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 17:23:00 +0000 Subject: [PATCH 38/88] add more checks --- src/transformers/modeling_utils.py | 37 ++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 34fead16c8be36..4e07d0b044ae91 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1228,7 +1228,9 @@ def can_generate(cls) -> bool: return True @classmethod - def _check_and_enable_flash_attn_2(cls, config) -> PretrainedConfig: + def _check_and_enable_flash_attn_2( + cls, config, torch_dtype: Optional[torch.dtype] = None, device_map: Optional[Union[str, dict[str, int]]] = None + ) -> PretrainedConfig: """ Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference and training. If you don't know about Flash Attention, check out the official repository of flash attention: @@ -1263,6 +1265,37 @@ def _check_and_enable_flash_attn_2(cls, config) -> PretrainedConfig: "Flash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing ..." ) + if torch_dtype is None: + warnings.warn( + "You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour" + ) + elif torch_dtype is not None and torch_dtype not in [torch.float16, torch.bfloat16]: + raise ValueError( + f"Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes. You passed {torch_dtype}, this might lead to" + " unexpected behaviour." + ) + + if device_map is None: + if torch.cuda.is_available(): + warnings.warn( + "You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU" + " after initializing it on CPU with `model.to('cuda')`." + ) + else: + raise ValueError( + "You are attempting to use Flash Attention 2.0 with a model initialized on CPU and with no GPU available. " + "This is not supported. 
Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map " + "or initialising the model on CPU and then moving it to GPU." + ) + elif ( + device_map is not None + and isinstance(device_map, dict) + and ("cpu" in device_map.keys() or "disk" in device_map.keys()) + ): + raise ValueError( + "You are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to " + "initialise the model on a GPU by passing a device_map that contains only GPU devices as keys." + ) config._flash_attn_2_enabled = True return config @@ -2995,7 +3028,7 @@ def from_pretrained( init_contexts.append(init_empty_weights()) if use_flash_attn_2: - config = cls._check_and_enable_flash_attn_2(config) + config = cls._check_and_enable_flash_attn_2(config, torch_dtype=torch_dtype, device_map=device_map) with ContextManagers(init_contexts): model = cls(config, *model_args, **model_kwargs) From b99a582ec902939755d44db5acc556959d28a53e Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 17:26:39 +0000 Subject: [PATCH 39/88] use values --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 4e07d0b044ae91..209f285a5f9f24 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1290,7 +1290,7 @@ def _check_and_enable_flash_attn_2( elif ( device_map is not None and isinstance(device_map, dict) - and ("cpu" in device_map.keys() or "disk" in device_map.keys()) + and ("cpu" in device_map.values() or "disk" in device_map.values()) ): raise ValueError( "You are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to " From adaed45911a3ceaafd713f044a7a0b2a09c19ee0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 4 Sep 2023 17:50:37 +0000 Subject: [PATCH 40/88] oops --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 209f285a5f9f24..17d5678280729e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1229,7 +1229,7 @@ def can_generate(cls) -> bool: @classmethod def _check_and_enable_flash_attn_2( - cls, config, torch_dtype: Optional[torch.dtype] = None, device_map: Optional[Union[str, dict[str, int]]] = None + cls, config, torch_dtype: Optional[torch.dtype] = None, device_map: Optional[Union[str, Dict[str, int]]] = None ) -> PretrainedConfig: """ Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference and training. 
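At this point in the series, the user-facing entry point is the `use_flash_attn_2` argument of `from_pretrained`, guarded by the dtype and device checks introduced above. A minimal usage sketch (assuming a CUDA GPU, an installed `flash-attn` package, and a supported checkpoint; the checkpoint name below is only an illustrative placeholder, not something mandated by these patches):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Illustrative checkpoint; any architecture that sets _supports_flash_attn_2 = True
    # (LLaMA and Falcon in this series) is expected to follow the same path.
    checkpoint = "tiiuae/falcon-7b"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # The checks added above expect fp16/bf16 weights and a GPU-resident model.
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
        use_flash_attn_2=True,
    ).to("cuda")

    inputs = tokenizer("Hello, my name is", return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Under the hood the flag switches the model's attention modules over to their Flash Attention implementations, so the rest of the API (forward, generate, caching) is unchanged.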
From 7f06af6226ad6e50164c59ce7a7201d12cd3acfe Mon Sep 17 00:00:00 2001 From: Felix Marty <9808326+fxmarty@users.noreply.github.com> Date: Tue, 5 Sep 2023 09:00:14 +0000 Subject: [PATCH 41/88] new version --- .../models/llama/modeling_llama.py | 49 ++++++++++++++----- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 413e9d85940b4e..53d946fae0c2a1 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -42,7 +42,8 @@ if is_flash_attn_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input, unpad_input # noqa + from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis # noqa + from einops import rearrange logger = logging.get_logger(__name__) @@ -50,6 +51,17 @@ _CONFIG_FOR_DOC = "LlamaConfig" +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + # Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 @@ -508,29 +520,40 @@ def forward( # contains at least one padding token if padding_mask is not None: - key_states, _, current_key_length, key_max_seqlen = unpad_input(key_states, padding_mask) - value_states, _, _, _ = unpad_input(value_states, padding_mask) - - # This assumes padding_side = "left" during generation with use_cache=True. - if use_cache: + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + key_states = index_first_axis(rearrange(key_states, "b s ... -> (b s) ..."), indices_k) + value_states = index_first_axis(rearrange(value_states, "b s ... -> (b s) ..."), indices_k) + + # In an ideal world, at least for the path q_len == kv_seq_len and q_len == 1, we should collect the + if q_len == kv_seq_len: + query_states = index_first_axis(rearrange(query_states, "b s ... -> (b s) ..."), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif q_len == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange(bsz + 1, dtype=torch.int32, device=query_states.device) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_states = query_states.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
padding_mask = padding_mask[:, -q_len:] - - query_states, indices, current_query_length, query_max_seqlen = unpad_input(query_states, padding_mask) + query_states, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_states, padding_mask) attn_output_unpad = flash_attn_varlen_func( query_states, key_states, value_states, - cu_seqlens_q=current_query_length, - cu_seqlens_k=current_key_length, - max_seqlen_q=query_max_seqlen, - max_seqlen_k=key_max_seqlen, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, dropout_p=0.0, softmax_scale=None, causal=True, ) - attn_output = pad_input(attn_output_unpad, indices, bsz, q_len) + attn_output = pad_input(attn_output_unpad, indices_q, bsz, q_len) else: attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) From 2d36c6f1f072b5dd7ca68234d2cae1f7885a7e23 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 5 Sep 2023 10:48:57 +0000 Subject: [PATCH 42/88] fixup --- .../models/deprecated/open_llama/modeling_open_llama.py | 3 --- src/transformers/models/llama/modeling_llama.py | 9 ++++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index cc6aec611ecd42..f9ebbff30eca6c 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -364,7 +364,6 @@ def __init__(self, config: OpenLlamaConfig): self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - # Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward def forward( self, hidden_states: torch.Tensor, @@ -373,7 +372,6 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, - padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -401,7 +399,6 @@ def forward( past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, - padding_mask=padding_mask, ) hidden_states = residual + hidden_states diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 53d946fae0c2a1..a3adf99d4cb8cc 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -41,9 +41,9 @@ if is_flash_attn_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis # noqa from einops import rearrange + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa logger = logging.get_logger(__name__) @@ -62,6 +62,7 @@ def _get_unpad_data(attention_mask): max_seqlen_in_batch, ) + # Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 @@ -532,7 +533,9 @@ def forward( indices_q = indices_k elif q_len == 1: max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange(bsz + 1, 
dtype=torch.int32, device=query_states.device) # There is a memcpy here, that is very bad. + cu_seqlens_q = torch.arange( + bsz + 1, dtype=torch.int32, device=query_states.device + ) # There is a memcpy here, that is very bad. indices_q = cu_seqlens_q[:-1] query_states = query_states.squeeze(1) else: From 9d3693f88c1f593a66eee103f1e81fd7787f2464 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 5 Sep 2023 10:57:33 +0000 Subject: [PATCH 43/88] add same trick for falcon --- .../models/falcon/modeling_falcon.py | 56 +++++++++++++------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 1d1e5488770a9c..f3c950caff5e34 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -44,7 +44,8 @@ if is_flash_attn_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func + from einops import rearrange + from flash_attn import flash_attn_func, flash_attn_varlen_func, index_first_axis from flash_attn.bert_padding import pad_input, unpad_input # noqa logger = logging.get_logger(__name__) @@ -77,6 +78,19 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + class FalconRotaryEmbedding(nn.Module): """Implementation of RotaryEmbedding from GPT-NeoX. This implementation is designed to operate on queries and keys that are compatible with `[batch_size, @@ -655,32 +669,42 @@ def forward( # contains at least one padding token if padding_mask is not None: - _, q_len, _, _ = query_layer.shape - - # This assumes it uses tokenizer.padding_side == 'left' - if use_cache: - query_padding_mask = padding_mask[:, -q_len:] + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) + value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) + + # In an ideal world, at least for the path q_len == kv_seq_len and q_len == 1, we should collect the + if query_length == kv_seq_length: + query_layer = index_first_axis(rearrange(query_layer, "b s ... -> (b s) ..."), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) else: - query_padding_mask = padding_mask - - query_layer, indices, current_query_length, query_max_seqlen = unpad_input(query_layer, query_padding_mask) - key_layer, _, current_key_length, key_max_seqlen = unpad_input(key_layer, padding_mask) - value_layer, _, _, _ = unpad_input(value_layer, padding_mask) + # The -q_len: slice assumes left padding. 
+ padding_mask = padding_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) attn_output_unpad = flash_attn_varlen_func( query_layer, key_layer, value_layer, - cu_seqlens_q=current_query_length, - cu_seqlens_k=current_key_length, - max_seqlen_q=query_max_seqlen, - max_seqlen_k=key_max_seqlen, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, dropout_p=0.0, softmax_scale=None, causal=True, ) - attn_output = pad_input(attn_output_unpad, indices, batch_size, q_len) + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, causal=True) From 65ae59c825fbbb382ebd20bf1e6447f64bd7846a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 5 Sep 2023 10:59:49 +0000 Subject: [PATCH 44/88] nit --- src/transformers/models/falcon/modeling_falcon.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index f3c950caff5e34..15bc4482917dda 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -657,7 +657,6 @@ def forward( past_key_value = (key_layer, value_layer) if use_cache else None - # (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(torch_dtype) query_layer = ( query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch_dtype) ) From 43185b5e8690ea1d380ec74a3dcc7f08fec8a36b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 11 Sep 2023 13:37:22 +0200 Subject: [PATCH 45/88] add another test --- tests/test_modeling_common.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index cb7c04ded89779..61765c847696b9 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2801,6 +2801,36 @@ def test_flash_attn_2_inference(self): self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_generate(self): + import torch + + for model_class in self.all_generative_model_classes: + if not model_class._supports_flash_attn_2: + return + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True + ).to(torch_device) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False).to( + torch_device + ) + + dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[0, 1, 1]]).to(torch_device) + + out = model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1) + out_fa = model_fa.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1) + + self.assertTrue(torch.equal(out, out_fa)) + global_rng = random.Random() From c61157e767f8ab1b0e99cdd29f29b7bd371bd33c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 11 Sep 2023 15:05:51 +0200 Subject: [PATCH 46/88] change tests --- tests/test_modeling_common.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 
deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 61765c847696b9..703eb7d0290d8f 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2778,19 +2778,19 @@ def test_flash_attn_2_inference(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True + tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True ).to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False).to( + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False).to( torch_device ) - dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[0, 1, 1]]).to(torch_device) + dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[0, 1, 1, 1, 1]]).to(torch_device) logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] - self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) + self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)) logits_fa = model_fa( dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True @@ -2799,7 +2799,7 @@ def test_flash_attn_2_inference(self): dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True ).hidden_states[-1] - self.assertTrue(torch.allclose(logits_fa, logits, atol=1e-3, rtol=1e-3)) + self.assertTrue(torch.allclose(logits_fa[1:, :], logits[1:, :], atol=4e-2, rtol=4e-2)) @require_flash_attn @require_torch_gpu From 2f177925cfca4ea6d33e8e38fc3e5033885c49d1 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 11 Sep 2023 15:08:48 +0200 Subject: [PATCH 47/88] fix issues with GC and also falcon --- src/transformers/models/falcon/modeling_falcon.py | 4 ++-- src/transformers/models/llama/modeling_llama.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 15bc4482917dda..85a1aa80e340fb 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -45,8 +45,8 @@ if is_flash_attn_available(): from einops import rearrange - from flash_attn import flash_attn_func, flash_attn_varlen_func, index_first_axis - from flash_attn.bert_padding import pad_input, unpad_input # noqa + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis # noqa logger = logging.get_logger(__name__) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index a3adf99d4cb8cc..34b0652d1c7674 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -893,12 +893,12 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value - return module(*inputs, past_key_value, output_attentions) + return module(*inputs, past_key_value, output_attentions, padding_mask=padding_mask) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids, padding_mask 
+ create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids ) else: layer_outputs = decoder_layer( From 65c386179a4040add3caab6227be89250d6a1fb3 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 11 Sep 2023 15:09:12 +0200 Subject: [PATCH 48/88] fixup --- src/transformers/models/falcon/modeling_falcon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 85a1aa80e340fb..4907d13e75ad37 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -46,7 +46,7 @@ if is_flash_attn_available(): from einops import rearrange from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis # noqa + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa logger = logging.get_logger(__name__) From 165a5030b8c91fd12e24d783b5e4ef31be91fda8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 11 Sep 2023 15:27:03 +0200 Subject: [PATCH 49/88] oops --- src/transformers/configuration_utils.py | 8 ++++---- src/transformers/models/falcon/modeling_falcon.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index c565982cff656e..74086ca2d7fccb 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -855,8 +855,8 @@ def to_diff_dict(self) -> Dict[str, Any]: self.dict_torch_dtype_to_str(serializable_config_dict) - if "_use_flash_attn_2" in serializable_config_dict: - del serializable_config_dict["_use_flash_attn_2"] + if "_flash_attn_2_enabled" in serializable_config_dict: + del serializable_config_dict["_flash_attn_2_enabled"] return serializable_config_dict @@ -874,8 +874,8 @@ def to_dict(self) -> Dict[str, Any]: del output["_auto_class"] if "_commit_hash" in output: del output["_commit_hash"] - if "_use_flash_attn_2" in output: - del output["_use_flash_attn_2"] + if "_flash_attn_2_enabled" in output: + del output["_flash_attn_2_enabled"] # Transformers version when serializing the model output["transformers_version"] = __version__ diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 4907d13e75ad37..fe3c22ee025e93 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -705,7 +705,7 @@ def forward( attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, causal=True) + attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, softmax_scale=None, causal=True) attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) @@ -737,9 +737,10 @@ def __init__(self, config: FalconConfig): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads + self.self_attention = ( FalconAttention(config) - if not getattr(config, "_use_flash_attn_2", False) + if not getattr(config, "_flash_attn_2_enabled", False) else FalconFlashAttention(config) ) self.mlp = FalconMLP(config) From 5abc702f4a87de205e7d82ce36205109d483580d Mon Sep 17 00:00:00 2001 From: Younes Belkada 
<49240599+younesbelkada@users.noreply.github.com> Date: Wed, 13 Sep 2023 13:56:27 +0200 Subject: [PATCH 50/88] Update src/transformers/models/falcon/modeling_falcon.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/falcon/modeling_falcon.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index fe3c22ee025e93..c5ead08000da3c 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -79,9 +79,9 @@ def rotate_half(x): # Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() +def _get_unpad_data(padding_mask): + seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) return ( From 5069e4a582c94289d079841e6e04b1be64bbd525 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 13:59:09 +0200 Subject: [PATCH 51/88] add init_rope --- .../models/falcon/modeling_falcon.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index fe3c22ee025e93..4c829d298cf662 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -556,6 +556,36 @@ def __init__(self, config: FalconConfig): self.attention_dropout = nn.Dropout(config.attention_dropout) self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 + + # Copied from transformers.models.falcon.modeling_falcon.FalconAttention._init_rope + def _init_rope(self): + if self.config.rope_scaling is None: + rotary_emb = FalconRotaryEmbedding( + self.head_dim, + base=self.config.rope_theta, + max_position_embeddings=self.config.max_position_embeddings, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + rotary_emb = FalconLinearScalingRotaryEmbedding( + self.head_dim, + base=self.config.rope_theta, + max_position_embeddings=self.config.max_position_embeddings, + scaling_factor=scaling_factor, + ) + elif scaling_type == "dynamic": + rotary_emb = FalconDynamicNTKScalingRotaryEmbedding( + self.head_dim, + base=self.config.rope_theta, + max_position_embeddings=self.config.max_position_embeddings, + scaling_factor=scaling_factor, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + return rotary_emb + # Copied from transformers.models.falcon.modeling_falcon.FalconAttention._split_heads def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ From ace7939a7cf9c37ed88ec29c8c043f954d9e6425 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 17:40:25 +0200 Subject: [PATCH 52/88] updates --- .../models/falcon/modeling_falcon.py | 4 +--- .../models/llama/modeling_llama.py | 4 ++-- tests/test_modeling_common.py | 23 ++++++++++++------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git 
a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index aa97ccfe3a2c80..9d68dba3f8df4c 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -556,7 +556,6 @@ def __init__(self, config: FalconConfig): self.attention_dropout = nn.Dropout(config.attention_dropout) self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 - # Copied from transformers.models.falcon.modeling_falcon.FalconAttention._init_rope def _init_rope(self): if self.config.rope_scaling is None: @@ -697,12 +696,11 @@ def forward( raise ValueError("`alibi` is not supported when `use_flash_attn` is True") # contains at least one padding token + if padding_mask is not None: indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) - - # In an ideal world, at least for the path q_len == kv_seq_len and q_len == 1, we should collect the if query_length == kv_seq_length: query_layer = index_first_axis(rearrange(query_layer, "b s ... -> (b s) ..."), indices_k) cu_seqlens_q = cu_seqlens_k diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 34b0652d1c7674..2ab21a8113ff67 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -522,10 +522,10 @@ def forward( # contains at least one padding token if padding_mask is not None: indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + key_states = index_first_axis(rearrange(key_states, "b s ... -> (b s) ..."), indices_k) value_states = index_first_axis(rearrange(value_states, "b s ... -> (b s) ..."), indices_k) - # In an ideal world, at least for the path q_len == kv_seq_len and q_len == 1, we should collect the if q_len == kv_seq_len: query_states = index_first_axis(rearrange(query_states, "b s ... 
-> (b s) ..."), indices_k) cu_seqlens_q = cu_seqlens_k @@ -560,7 +560,7 @@ def forward( else: attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 703eb7d0290d8f..28faa32fcf7f68 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2765,6 +2765,7 @@ def test_flash_attn_2_conversion(self): @require_flash_attn @require_torch_gpu @mark.flash_attn_test + @slow def test_flash_attn_2_inference(self): import torch @@ -2816,18 +2817,24 @@ def test_flash_attn_2_generate(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False, low_cpu_mem_usage=True ).to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False).to( - torch_device + + dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [0, 1, 1, 1]]).to(torch_device) + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False ) - dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[0, 1, 1]]).to(torch_device) + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True, low_cpu_mem_usage=True + ).to(torch_device) - out = model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1) - out_fa = model_fa.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1) + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False + ) self.assertTrue(torch.equal(out, out_fa)) From fe9b16d814ad627dfc822f177fb6de869e0c3bf0 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 17:41:22 +0200 Subject: [PATCH 53/88] fix copies --- src/transformers/models/llama/modeling_llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 2ab21a8113ff67..cfc89a2d3f3e1a 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -51,9 +51,9 @@ _CONFIG_FOR_DOC = "LlamaConfig" -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() +def _get_unpad_data(padding_mask): + seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) return ( From acfc954fd633f48f15b51b55771d1ac380440072 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:10:11 +0200 Subject: [PATCH 54/88] fixup --- src/transformers/models/falcon/modeling_falcon.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 3d1d364c4f8ea0..b9bbcf9d99ea2d 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1227,7 +1227,7 @@ def custom_forward(*inputs): causal_mask, position_ids, head_mask[i], - padding_mask + padding_mask, ) else: outputs = block( From 33a0f629b55f2ea724e759450bdd2bbafa6486de Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:15:44 +0200 Subject: [PATCH 55/88] fixup --- src/transformers/models/falcon/modeling_falcon.py | 5 +++-- src/transformers/models/persimmon/modeling_persimmon.py | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index b9bbcf9d99ea2d..61d376d221b576 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -562,7 +562,7 @@ def __init__(self, config: FalconConfig): f" {self.num_heads})." ) - self.maybe_rotary = self._init_rope() if config.rotary else lambda q, k, t: (q, k) + self.maybe_rotary = self._init_rope() if config.rotary else lambda q, k, t, p: (q, k) # Layer-wise attention scaling self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) @@ -672,6 +672,7 @@ def forward( hidden_states: torch.Tensor, alibi: Optional[torch.Tensor], attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, @@ -694,7 +695,7 @@ def forward( value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim) past_kv_length = 0 if layer_past is None else layer_past[0].shape[1] - query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) + query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length, position_ids) if layer_past is not None and use_cache: past_key, past_value = layer_past diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 5c6cde7f8a6d44..7b04d38f22332c 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -452,7 +452,6 @@ def forward( "The bare Persimmon Model outputting raw hidden-states without any specific head on top.", PERSIMMON_START_DOCSTRING, ) -# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Persimmon class PersimmonPreTrainedModel(PreTrainedModel): config_class = PersimmonConfig base_model_prefix = "model" @@ -544,7 +543,6 @@ def _set_gradient_checkpointing(self, module, value=False): "The bare Persimmon Model outputting raw hidden-states without any specific head on top.", PERSIMMON_START_DOCSTRING, ) -# Copied from transformers.models.llama.modeling_llama.LlamaModel with LLAMA->PERSIMMON,Llama->Persimmon,PersimmonRMSNorm->nn.LayerNorm,norm->final_layernorm,rms_final_layernorm_eps->layer_norm_eps class PersimmonModel(PersimmonPreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`PersimmonDecoderLayer`] @@ -553,6 +551,7 @@ class PersimmonModel(PersimmonPreTrainedModel): config: PersimmonConfig """ + # Copied from transformers.models.llama.modeling_llama.LlamaModel.__init__ with LLAMA->PERSIMMON,Llama->Persimmon,PersimmonRMSNorm->nn.LayerNorm,norm->final_layernorm,rms_final_layernorm_eps->layer_norm_eps def __init__(self, config: PersimmonConfig): super().__init__(config) self.padding_idx = config.pad_token_id From ee8ba206c7327ff79952d23763f50d5461cb6eff Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:19:13 +0200 Subject: [PATCH 56/88] more clarification --- src/transformers/modeling_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 3b348e96247f3b..cf6245b9dd62fd 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1248,13 +1248,18 @@ def _check_and_enable_flash_attn_2( cls, config, torch_dtype: Optional[torch.dtype] = None, device_map: Optional[Union[str, Dict[str, int]]] = None ) -> PretrainedConfig: """ - Enable the Flash Attention 2.0 implementation for this model for more memory efficient inference and training. If you don't know about Flash Attention, check out the official repository of flash attention: https://github.com/Dao-AILab/flash-attention For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this specific section of the documentation to learn more about it: https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models + + The method checks if the current setup is compatible with Flash Attention as it requires the model + to be in half precision and not ran on CPU. + + If all checks pass, the method will create an attribute in the config `_flash_attn_2_enabled` so that + the model can initialize the correct attention module """ if not cls._supports_flash_attn_2: raise ValueError( From e28fb0bf784c96eed8c28e7a730d429d8eb417b8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:19:29 +0200 Subject: [PATCH 57/88] fixup --- src/transformers/modeling_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index cf6245b9dd62fd..c588c83b53a05e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1255,11 +1255,11 @@ def _check_and_enable_flash_attn_2( specific section of the documentation to learn more about it: https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models - The method checks if the current setup is compatible with Flash Attention as it requires the model - to be in half precision and not ran on CPU. + The method checks if the current setup is compatible with Flash Attention as it requires the model to be in + half precision and not ran on CPU. 
- If all checks pass, the method will create an attribute in the config `_flash_attn_2_enabled` so that - the model can initialize the correct attention module + If all checks pass, the method will create an attribute in the config `_flash_attn_2_enabled` so that the model + can initialize the correct attention module """ if not cls._supports_flash_attn_2: raise ValueError( From 025727c088f5fbf35cdf0d1d59478b2767c334d1 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:29:18 +0200 Subject: [PATCH 58/88] right padding tests --- tests/test_modeling_common.py | 78 ++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 28faa32fcf7f68..bfc2af96f2ca67 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2800,7 +2800,47 @@ def test_flash_attn_2_inference(self): dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True ).hidden_states[-1] - self.assertTrue(torch.allclose(logits_fa[1:, :], logits[1:, :], atol=4e-2, rtol=4e-2)) + self.assertTrue(torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attn_2_inference_padding_right(self): + import torch + + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + return + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True + ).to(torch_device) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False).to( + torch_device + ) + + dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1, 0]]).to(torch_device) + + logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] + logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] + + self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)) + + logits_fa = model_fa( + dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True + ).hidden_states[-1] + logits = model( + dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True + ).hidden_states[-1] + + self.assertTrue(torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2)) @require_flash_attn @require_torch_gpu @@ -2838,6 +2878,42 @@ def test_flash_attn_2_generate(self): self.assertTrue(torch.equal(out, out_fa)) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_generate_padding_right(self): + import torch + + for model_class in self.all_generative_model_classes: + if not model_class._supports_flash_attn_2: + return + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False, low_cpu_mem_usage=True + ).to(torch_device) + + dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) + dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) + + out = model.generate( + dummy_input, 
attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True, low_cpu_mem_usage=True + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False + ) + + self.assertTrue(torch.equal(out, out_fa)) + global_rng = random.Random() From 8f7e4008c2e480e8275730c5aa73ec17e7f5e587 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:46:49 +0200 Subject: [PATCH 59/88] add docs --- docs/source/en/perf_infer_gpu_many.md | 4 + docs/source/en/perf_infer_gpu_one.md | 120 ++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/docs/source/en/perf_infer_gpu_many.md b/docs/source/en/perf_infer_gpu_many.md index 756d2b3ef57b0b..2118b5ddb40431 100644 --- a/docs/source/en/perf_infer_gpu_many.md +++ b/docs/source/en/perf_infer_gpu_many.md @@ -22,6 +22,10 @@ Note: A multi GPU setup can use the majority of the strategies described in the +## Flash Attention 2 + +Flash Attention 2 integration also works in a multi-GPU setup, check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2) + ## BetterTransformer [BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood. diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 855c52ffd98c62..908ce8383168e2 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -17,6 +17,126 @@ rendered properly in your Markdown viewer. In addition to this guide, relevant information can be found as well in [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu). +## Flash Attention 2 + + + +Note that this feature is experimental and might considerably change in future versions. For instance, the Flash Attention 2 API might migrate to `BetterTransformer` API in the near future. + + + +Flash Attention 2 can considerably speedup the training and inference speed of transformer based models. Flash Attention 2 has been introduced in the [official Flash Attention repository](https://github.com/Dao-AILab/flash-attention) from Tri Dao et al. The scientific paper of Flash attention can be found [here](https://arxiv.org/abs/2205.14135). + +Make sure to follow the installation guide on the repository mentioned above to properly install Flash Attention 2. Once that package is installed, you can benefit from this feature. + +We natively support Flash Attention 2 for some models, currently supported architectures are: + +- Llama +- Falcon + +And they can be used for inference and training, including training with padding tokens - which is currently not supported for `BetterTransformer` API below. + + + +Flash Attention 2 can only be used for models using fp16 or bf16 dtype, and can be run only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature. 
+ + + +### Quick usage + +To enable Flash Attention 2 in your model, simply add `use_flash_attn_2` in `from_pretrained` arguments + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + use_flash_attn_2=True, +) +``` + +And use it for generation or fine-tuning. + +### Expected speedups + +You can benefit from considerable speedup for fine-tuning and inference, especially for long sequence length. +However, note that due to the fact that Flash Attention does not support computing attention scores with padd tokens under the hood, we need to manually pad / unpad the attention scores for batched inference when the sequence contains padd tokens. This leads to an important slowdown for batched `generate` with padd tokens. To overcome this, one should use Flash Attention without padd tokens in the sequence for training (e.g. by packing a dataset, i.e. concatenating sequences until reaching the maximum sequence length) + +TODO: @younesbelkada add figures here + +### Advanced usage + +You can combine this feature with many exisiting feature for model optimization. Check out few examples below: + +### Combining Flash Attention 2 and 8-bit models + +You can combine this feature together with 8-bit quantization: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_8bit=True, + use_flash_attn_2=True, +) +``` + +### Combining Flash Attention 2 and 4-bit models + +You can combine this feature together with 4-bit quantization: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + use_flash_attn_2=True, +) +``` + +### Combining Flash Attention 2 and PEFT + +You can combine this feature together with PEFT for training adapters using Flash Attention 2 under the hood: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +from peft import LoraConfig + +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + use_flash_attn_2=True, +) + +lora_config = LoraConfig( + r=8, + task_type="CAUSAL_LM" +) + +model.add_adapter(lora_config) + +... # train your model +``` + ## BetterTransformer [BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood. 
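For readers following the unpadding logic these patches add (the `_get_unpad_data` helper introduced for Llama in PATCH 41 and copied to Falcon in PATCH 43), the sketch below shows how a 0/1 padding mask becomes the `indices`, `cu_seqlens`, and `max_seqlen` values that `flash_attn_varlen_func` consumes. This is a minimal standalone illustration, not the transformers implementation: the helper mirrors the diff above, the final gather stands in for `index_first_axis`, and no flash-attn kernel is called.

```python
import torch
import torch.nn.functional as F


def get_unpad_data(padding_mask: torch.Tensor):
    # padding_mask: (batch, seq_len), 1 for real tokens, 0 for padding
    seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # cumulative sequence lengths with a leading zero, shape (batch + 1,)
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch


batch, seq_len, dim = 2, 5, 8
hidden_states = torch.randn(batch, seq_len, dim)
padding_mask = torch.tensor([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1]])  # second row is left-padded by one token

indices, cu_seqlens, max_seqlen = get_unpad_data(padding_mask)

# "Unpad": keep only the real tokens, flattened to (total_tokens, dim). This is the
# layout flash_attn_varlen_func expects for q/k/v; pad_input restores the padded shape afterwards.
unpadded = hidden_states.reshape(batch * seq_len, dim)[indices]

print(cu_seqlens.tolist())  # [0, 5, 9]
print(max_seqlen)           # 5
print(unpadded.shape)       # torch.Size([9, 8])
```

In the `q_len == 1` decode path shown in the diffs, the query skips this gather entirely: `cu_seqlens_q` is built with `torch.arange(bsz + 1)`, which is the step the "There is a memcpy here, that is very bad" comment refers to.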
From 3259392d523dace424d1f35f9f8ca65bf4ac962a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:49:41 +0200 Subject: [PATCH 60/88] add FA in docker image --- docker/transformers-all-latest-gpu/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index a6c672e1a9df64..76440ffedc9926 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -52,6 +52,11 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes # Add auto-gptq for gtpq quantization testing RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ +# Add flash attention +# commands copied from https://github.com/Dao-AILab/flash-attention#installation-and-features +RUN python3 -m pip uninstall -y ninja && python3 -m pip install ninja +RUN python3 -m pip install flash-attn --no-build-isolation + # Add einops for additional model testing RUN python3 -m pip install --no-cache-dir einops From 57a077b7350c4231025d69cd9928b2421060c318 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 18:54:01 +0200 Subject: [PATCH 61/88] more clarifications --- docs/source/en/perf_infer_gpu_one.md | 2 ++ docs/source/en/perf_train_gpu_one.md | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 908ce8383168e2..a9364fda9ba67b 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -34,6 +34,8 @@ We natively support Flash Attention 2 for some models, currently supported archi - Llama - Falcon +You can request to add Flash Attention 2 support for more models by opening an issue on GitHub! + And they can be used for inference and training, including training with padding tokens - which is currently not supported for `BetterTransformer` API below. diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index f1b0f3976df0f8..17b62c3a1379ca 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -228,6 +228,10 @@ For additional information on tf32 vs other precisions, please refer to the foll [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189). +## Flash Attention 2 + +You can speedup the training throughput by using Flash Attention 2 integration in transformers. Check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2) to learn more about how to load a model with Flash Attention 2 modules. + ## Optimizer choice The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves From e62b0b868e1479cb699288d9a8db6dcb60def784 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 19:22:02 +0200 Subject: [PATCH 62/88] add some figures --- docs/source/en/perf_infer_gpu_one.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index a9364fda9ba67b..22de495bc5c6a0 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -69,7 +69,20 @@ And use it for generation or fine-tuning. 
You can benefit from considerable speedup for fine-tuning and inference, especially for long sequence length. However, note that due to the fact that Flash Attention does not support computing attention scores with padd tokens under the hood, we need to manually pad / unpad the attention scores for batched inference when the sequence contains padd tokens. This leads to an important slowdown for batched `generate` with padd tokens. To overcome this, one should use Flash Attention without padd tokens in the sequence for training (e.g. by packing a dataset, i.e. concatenating sequences until reaching the maximum sequence length) -TODO: @younesbelkada add figures here +Below is the expected speedup you can get for a simple forward pass on `tiiuae/falcon-7b` with a sequence length of 4096 and various batch sizes, without padd tokens: + +
+[figure: expected forward-pass speedup for tiiuae/falcon-7b, sequence length 4096, various batch sizes]
+ +Below is the expected speedup you can get for a simple forward pass on `meta-llama/Llama-7b-hf` with a sequence length of 4096 and various batch sizes, without padd tokens: + +
+[figure: expected forward-pass speedup for meta-llama/Llama-7b-hf, sequence length 4096, various batch sizes]
+ + +Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequenc lengths without facing CUDA OOM issues. ### Advanced usage From 74194383cf3496803245739fee0713a71e95dc89 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 13 Sep 2023 19:22:19 +0200 Subject: [PATCH 63/88] add todo --- docs/source/en/perf_infer_gpu_one.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 22de495bc5c6a0..2933767d4f7df5 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -81,6 +81,7 @@ Below is the expected speedup you can get for a simple forward pass on `meta-lla +TODO: @younesbelkada add more figures and cases where FA fails. Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequenc lengths without facing CUDA OOM issues. From 3ba5e98325493a8911dab358eb127bc1fa17eaf7 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 14 Sep 2023 10:28:34 +0200 Subject: [PATCH 64/88] rectify comment --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index c588c83b53a05e..57e21ff004ac68 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1283,7 +1283,7 @@ def _check_and_enable_flash_attn_2( if _is_bettertransformer: raise ValueError( - "Flash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing ..." + "Flash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing model.reverse_bettertransformer()" ) if torch_dtype is None: From 585e4634ac75abcb9aa701cb3f9180b9a7ba992d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 14 Sep 2023 13:05:21 +0200 Subject: [PATCH 65/88] Change to FA2 --- src/transformers/models/falcon/modeling_falcon.py | 4 ++-- src/transformers/models/llama/modeling_llama.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 61d376d221b576..c30a69c0ee4cbb 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -544,7 +544,7 @@ def forward( return output_tensor, present -class FalconFlashAttention(nn.Module): +class FalconFlashAttention2(nn.Module): # Copied from transformers.models.falcon.modeling_falcon.FalconAttention.__init__ def __init__(self, config: FalconConfig): super().__init__() @@ -794,7 +794,7 @@ def __init__(self, config: FalconConfig): self.self_attention = ( FalconAttention(config) if not getattr(config, "_flash_attn_2_enabled", False) - else FalconFlashAttention(config) + else FalconFlashAttention2(config) ) self.mlp = FalconMLP(config) self.hidden_dropout = config.hidden_dropout diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index cfc89a2d3f3e1a..ba60a5110f14af 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -410,7 +410,7 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaFlashAttention(nn.Module): +class LlamaFlashAttention2(nn.Module): """Multi-headed attention from 'Attention Is All You 
Need' paper""" # Copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ @@ -479,7 +479,7 @@ def forward( use_cache: bool = False, padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # LlamaFlashAttention attention does not support output_attentions + # LlamaFlashAttention2 attention does not support output_attentions output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -576,7 +576,7 @@ def __init__(self, config: LlamaConfig): self.self_attn = ( LlamaAttention(config=config) if not getattr(config, "_flash_attn_2_enabled", False) - else LlamaFlashAttention(config=config) + else LlamaFlashAttention2(config=config) ) self.mlp = LlamaMLP(config) self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) From ec0f8b9fb58b0be03c9cf997a7a89901afbd95b6 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 19 Sep 2023 09:40:18 +0200 Subject: [PATCH 66/88] Update docs/source/en/perf_infer_gpu_one.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/perf_infer_gpu_one.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 2933767d4f7df5..ed59095d8be7d3 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -83,7 +83,7 @@ Below is the expected speedup you can get for a simple forward pass on `meta-lla TODO: @younesbelkada add more figures and cases where FA fails. -Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequenc lengths without facing CUDA OOM issues. +Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues. 
### Advanced usage From 3e5ea353f72e6d3d7568223abbe3d680cab9d338 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 17:47:21 +0200 Subject: [PATCH 67/88] split in two lines --- tests/test_modeling_common.py | 44 ++++++++++++++++------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index bfc2af96f2ca67..25403c3fa40791 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2778,12 +2778,11 @@ def test_flash_attn_2_inference(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True - ).to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False).to( - torch_device - ) + model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False) + model.to(torch_device) dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) dummy_attention_mask = torch.LongTensor([[0, 1, 1, 1, 1]]).to(torch_device) @@ -2793,12 +2792,11 @@ def test_flash_attn_2_inference(self): self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)) - logits_fa = model_fa( - dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True - ).hidden_states[-1] - logits = model( - dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True - ).hidden_states[-1] + output_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits_fa = output_fa.hidden_states[-1] + + output = model(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits = output.hidden_states[-1] self.assertTrue(torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2)) @@ -2818,12 +2816,11 @@ def test_flash_attn_2_inference_padding_right(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True - ).to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False).to( - torch_device - ) + model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False) + model.to(torch_device) dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1, 0]]).to(torch_device) @@ -2833,12 +2830,11 @@ def test_flash_attn_2_inference_padding_right(self): self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)) - logits_fa = model_fa( - dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True - ).hidden_states[-1] - logits = model( - dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True - ).hidden_states[-1] + output_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits_fa = output_fa.hidden_states[-1] + + output = model(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits = output.hidden_states[-1] 
self.assertTrue(torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2)) From 4bb1bc53bc105033340bbb3d0a262d818716593a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 17:48:49 +0200 Subject: [PATCH 68/88] change test name --- tests/test_modeling_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 25403c3fa40791..d4c370b26f29b5 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2841,7 +2841,7 @@ def test_flash_attn_2_inference_padding_right(self): @require_flash_attn @require_torch_gpu @mark.flash_attn_test - def test_flash_attn_2_generate(self): + def test_flash_attn_2_generate_left_padding(self): import torch for model_class in self.all_generative_model_classes: From b67c21e699568591e537579229a01ef1c9b8a64f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 18:02:38 +0200 Subject: [PATCH 69/88] add more tests --- tests/models/llama/test_modeling_llama.py | 38 ++++++++++++++++++++++- tests/test_modeling_common.py | 3 ++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 35a8a2fd3ebe46..b35b02dabd71ac 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -18,9 +18,10 @@ import unittest from parameterized import parameterized +from pytest import mark from transformers import LlamaConfig, is_torch_available, set_seed -from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device +from transformers.testing_utils import require_flash_attn, require_torch, require_torch_gpu, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -375,6 +376,41 @@ def test_model_rope_scaling(self, scaling_type): # The output should be different for long inputs self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attn_2_generate_padding_right(self): + """ + Overwritting the common test as the test is flaky on tiny models + """ + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + load_in_4bit=True, + device_map={"": 0}, + ) + + tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + + texts = ["hi", "Hello this is a very long sentence"] + + tokenizer.padding_side = "right" + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) + + output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_native = tokenizer.batch_decode(output_native) + + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, use_flash_attn_2=True + ) + + output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_fa_2 = tokenizer.batch_decode(output_fa_2) + + self.assertListEqual(output_native, output_fa_2) + @require_torch class LlamaIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 61536555adb2bd..a62bb2dd2063e2 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2721,6 +2721,7 @@ def test_model_is_small(self): @require_flash_attn @require_torch_gpu @mark.flash_attn_test + @slow def 
test_flash_attn_2_conversion(self): import torch @@ -2823,6 +2824,7 @@ def test_flash_attn_2_inference_padding_right(self): @require_flash_attn @require_torch_gpu @mark.flash_attn_test + @slow def test_flash_attn_2_generate_left_padding(self): import torch @@ -2859,6 +2861,7 @@ def test_flash_attn_2_generate_left_padding(self): @require_flash_attn @require_torch_gpu @mark.flash_attn_test + @slow def test_flash_attn_2_generate_padding_right(self): import torch From 5b735574bba4caa7879cec65cc93ca2e928528a5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 18:17:06 +0200 Subject: [PATCH 70/88] some clean up --- .../models/falcon/modeling_falcon.py | 56 +++++++++++------- .../models/llama/modeling_llama.py | 57 ++++++++++++------- 2 files changed, 73 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index c30a69c0ee4cbb..552e10948df427 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -723,25 +723,12 @@ def forward( # contains at least one padding token if padding_mask is not None: - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) - value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) - if query_length == kv_seq_length: - query_layer = index_first_axis(rearrange(query_layer, "b s ... -> (b s) ..."), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - padding_mask = padding_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) + query_layer, key_layer, value_layer, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_layer, key_layer, value_layer, padding_mask, query_length, kv_seq_length, batch_size + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens attn_output_unpad = flash_attn_varlen_func( query_layer, @@ -768,6 +755,37 @@ def forward( return attn_output, past_key_value, attn_weights + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length, kv_seq_length, batch_size): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) + value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) + if query_length == kv_seq_length: + query_layer = index_first_axis(rearrange(query_layer, "b s ... -> (b s) ..."), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. 
+ indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + padding_mask = padding_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + class FalconMLP(nn.Module): def __init__(self, config: FalconConfig): diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 3e9d2cb0f5deed..e346fce4a323c3 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -521,27 +521,12 @@ def forward( # contains at least one padding token if padding_mask is not None: - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - - key_states = index_first_axis(rearrange(key_states, "b s ... -> (b s) ..."), indices_k) - value_states = index_first_axis(rearrange(value_states, "b s ... -> (b s) ..."), indices_k) - - if q_len == kv_seq_len: - query_states = index_first_axis(rearrange(query_states, "b s ... -> (b s) ..."), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif q_len == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - bsz + 1, dtype=torch.int32, device=query_states.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_states = query_states.squeeze(1) - else: - # The -q_len: slice assumes left padding. - padding_mask = padding_mask[:, -q_len:] - query_states, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_states, padding_mask) + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, padding_mask, q_len, kv_seq_len, bsz + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens attn_output_unpad = flash_attn_varlen_func( query_states, @@ -568,6 +553,36 @@ def forward( return attn_output, attn_weights, past_key_value + def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length, kv_seq_length, batch_size): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) + key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) + value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) + if query_length == kv_seq_length: + query_layer = index_first_axis(rearrange(query_layer, "b s ... -> (b s) ..."), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ padding_mask = padding_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + class LlamaDecoderLayer(nn.Module): def __init__(self, config: LlamaConfig): From 48e3bcffaca06ca24111478e0b2c2a8a62937029 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 18:26:40 +0200 Subject: [PATCH 71/88] remove `rearrange` deps --- .../models/falcon/modeling_falcon.py | 17 ++++++++++------- src/transformers/models/llama/modeling_llama.py | 17 ++++++++++------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 552e10948df427..bc083e05d5dde7 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -44,7 +44,6 @@ if is_flash_attn_available(): - from einops import rearrange from flash_attn import flash_attn_func, flash_attn_varlen_func from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa @@ -724,7 +723,7 @@ def forward( if padding_mask is not None: query_layer, key_layer, value_layer, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_layer, key_layer, value_layer, padding_mask, query_length, kv_seq_length, batch_size + query_layer, key_layer, value_layer, padding_mask, query_length ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens @@ -756,12 +755,16 @@ def forward( return attn_output, past_key_value, attn_weights # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length, kv_seq_length, batch_size): + def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) - value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) - if query_length == kv_seq_length: - query_layer = index_first_axis(rearrange(query_layer, "b s ... 
-> (b s) ..."), indices_k) + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index e346fce4a323c3..4ac39c070dbbfc 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -41,7 +41,6 @@ if is_flash_attn_available(): - from einops import rearrange from flash_attn import flash_attn_func, flash_attn_varlen_func from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa @@ -522,7 +521,7 @@ def forward( # contains at least one padding token if padding_mask is not None: query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, padding_mask, q_len, kv_seq_len, bsz + query_states, key_states, value_states, padding_mask, q_len ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens @@ -553,12 +552,16 @@ def forward( return attn_output, attn_weights, past_key_value - def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length, kv_seq_length, batch_size): + def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) - key_layer = index_first_axis(rearrange(key_layer, "b s ... -> (b s) ..."), indices_k) - value_layer = index_first_axis(rearrange(value_layer, "b s ... -> (b s) ..."), indices_k) - if query_length == kv_seq_length: - query_layer = index_first_axis(rearrange(query_layer, "b s ... -> (b s) ..."), indices_k) + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k From 046138407c2ceede900b5af163e10e23468aacaa Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 18:57:11 +0200 Subject: [PATCH 72/88] add more docs --- docs/source/en/perf_infer_gpu_one.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index ed59095d8be7d3..a11e8e6b440a96 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -81,10 +81,21 @@ Below is the expected speedup you can get for a simple forward pass on `meta-lla -TODO: @younesbelkada add more figures and cases where FA fails. +For sequences with padd tokens (training with padd tokens or generating with padd tokens), we need to unpad / pad the input sequences to compute correctly the attention scores. 
For relatively small sequence lengths, a pure forward pass therefore sees only a small speedup because of this unpadding overhead (the benchmark below uses a padding rate of 0.3).
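For illustration, here is a minimal sketch (reusing the inputs from the integration test added earlier in this PR; it is not part of the patch itself) of the kind of padded batch that takes this unpad/pad code path: the zeros in the attention mask are exactly what `_upad_input` strips out before calling `flash_attn_varlen_func`.

```python
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default

# One short and one long sequence, so the short one gets padded
inputs = tokenizer(
    ["hi", "Hello this is a very long sentence"], return_tensors="pt", padding=True
)
print(inputs["attention_mask"])  # rows containing 0s are the ones the Flash Attention 2 path unpads
```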
+ +
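To make the unpadding step above concrete, here is a small sketch, assuming a helper equivalent to the `_get_unpad_data` function that the `_upad_input` methods refactored in the patches above rely on; the return order follows the way it is unpacked there (`indices_k, cu_seqlens_k, max_seqlen_in_batch_k`).

```python
import torch
import torch.nn.functional as F

def get_unpad_data(padding_mask: torch.Tensor):
    # padding_mask: (batch_size, kv_seq_len) with 1 for real tokens and 0 for padding
    seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
    # flat indices of the non-padding tokens, as consumed by index_first_axis / pad_input
    indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    # cumulative sequence lengths, in the format expected by flash_attn_varlen_func
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 1]])
indices, cu_seqlens, max_len = get_unpad_data(mask)
# indices -> tensor([0, 1, 4, 5, 6, 7]), cu_seqlens -> tensor([0, 2, 6]), max_len -> 4
```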
+ +But for larger sequence lengths you can get a significant speedup for pure inference (and for training as well). Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues. +
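As a rough sketch of the long-sequence inference case described above (an assumed setup, using the `use_flash_attn_2` flag exercised by the tests in this PR; Flash Attention 2 requires the model to be in fp16 or bf16):

```python
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,  # Flash Attention 2 only supports fp16 / bf16
    device_map={"": 0},
    use_flash_attn_2=True,
)
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# A single long, unpadded prompt: the case where the kernel helps the most
long_prompt = "Tell me a story about a llama. " * 400
inputs = tokenizer(long_prompt, return_tensors="pt").to(0)

output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```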
+ +
+
+
 ### Advanced usage

 You can combine this feature with many existing features for model optimization. Check out a few examples below:
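As one illustration of such a combination (a sketch mirroring the 4-bit integration test added earlier in this PR, not a snippet taken from the documentation page), Flash Attention 2 can be enabled together with bitsandbytes 4-bit quantization:

```python
from transformers import LlamaForCausalLM, LlamaTokenizer

model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_4bit=True,       # bitsandbytes 4-bit quantization
    device_map={"": 0},
    use_flash_attn_2=True,   # the Flash Attention 2 flag used by the tests in this PR
)

tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # left padding is generally recommended for batched generation

inputs = tokenizer(
    ["hi", "Hello this is a very long sentence"], return_tensors="pt", padding=True
).to(0)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.batch_decode(output, skip_special_tokens=True))
```

Since the test added in this PR exercises exactly this combination, it should be a reasonable starting point; adjust `device_map` and the generation arguments to your own setup.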
From 8d72a66b4b9b771abc3f15a9b9506b4246d62d8e Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Tue, 19 Sep 2023 19:10:20 +0200
Subject: [PATCH 73/88] revert changes on dockerfile

---
 .../Llama-2-7b-hf/2/speedup_plot.jpg          | Bin 0 -> 112666 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 flash-attn-2-benchmarks/meta-llama/Llama-2-7b-hf/2/speedup_plot.jpg

diff --git a/flash-attn-2-benchmarks/meta-llama/Llama-2-7b-hf/2/speedup_plot.jpg b/flash-attn-2-benchmarks/meta-llama/Llama-2-7b-hf/2/speedup_plot.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a641bb78add4ee4612d49fa6cd48e4bc5aef3156
GIT binary patch
literal 112666
[112666 bytes of base85-encoded image data (speedup_plot.jpg) omitted]

literal 0
HcmV?d00001

From 73b2f0733ff6f8ad61673cc8574ea8ece1b9a639 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Tue, 19 Sep 2023 19:17:08 +0200
Subject: [PATCH 74/88] Revert "revert changes on dockerfile"

This reverts commit 8d72a66b4b9b771abc3f15a9b9506b4246d62d8e.

---
 .../Llama-2-7b-hf/2/speedup_plot.jpg          | Bin 112666 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 flash-attn-2-benchmarks/meta-llama/Llama-2-7b-hf/2/speedup_plot.jpg

diff --git a/flash-attn-2-benchmarks/meta-llama/Llama-2-7b-hf/2/speedup_plot.jpg b/flash-attn-2-benchmarks/meta-llama/Llama-2-7b-hf/2/speedup_plot.jpg
deleted file mode 100644
index a641bb78add4ee4612d49fa6cd48e4bc5aef3156..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 112666
z$DKjB?^d`%`I7hAR!=7ru{4WYEXdl+r|_BOP7~>tx|G`8UyQQm8vNXKr{|= zyn1goGdawaCVo&I$IK}3;~aNQ_k&DH5%yN}ncmmA=B75?XB77mCmY?!7F3G-wYBPN z>>P4O6>KZW2)OiAxS{|L91&U+>V^oq0!aKbZ9GDpTI`x9@2APiDmC3u*&gRi6Nqh6 zEpuA9o7k4iRHrQ;^TjK6_@rhNbpN?4pBeepIq~v{m>HFIR*Q zSJaC|f7dZ146?*=270`k)cE#^Gyl)8(jRwqGx*r!Kv<)2IZDHJD{<69j&wyn1|KT) zawjM%%@(fp2}Yy?jNIP6iMo}@mfY_;jLzQ?J=1s6t<9nr|uWIWx5kis@ z^gaXcs7P1bpLkhwx-KWSGfKxHhtiKN4HeR!KG$9AKf?o{i|dTvto+h52A{g}1722a zqYBnYH%gPmdCb>CE$EG(zUWz6yDWzpyX1rG!YZ;?z^(na$Cp7W=njlfXD!4{qT8;b z+2&BTO;)L%pkd|B_m^l(nU1Vpu@)|ea@XGvef!BHrpH#FhtZtFxex-D+Q{`7mr~6D z;nGtJ1{@W~udr+2{V&R#6&AW(i)Z}jJ(s_(fRPcn%m4cSFzDWL2w|6%FsdNSOW?hz zw>ERB?VHh=-Wvl&UxIs{tDL?2qbPUE-ONx z6$V~`*exY4E0vHXN-Aw%qo>sgj6L=$(3mw!kIdP!{0kKa_n8~}cvU{!k7d`K0!NBn z9xJmc-9w93iM8P4H|XJuyt$wh7=#LY?1P9}kHFgHPxlo$=H4H76I(VE znBG$n*G}*aH;Krd3w<_(`KD4c-g5sk@mW)2W$n0#-t3?Ycl-w;Kva`A<)`>|>=tD)ce7p}m{u5RsP=;Hw=pXQvL5b7;izD6 zteDh%c`VdLP^!^?fH9@=3(wQNR+}MNEOFpa6qes+X_pH5Wb9Ra<{f^=_jPu!D4vRv z49lKrIIZA_l7g>$-W$_P1)p|$1B+}HTUIm^O-#NG(4BPs2Gd2R&ox8w--62AP2$%D zM$51!N6WAgX75rryHu8{}3R@-cfy98UlyWC#M6+oH;NobR1`k==200x!Z$X`5%!Q0 zwX&hSjTc4(5~+b zx9`o)lCRJE-K&P##lK7nhoXtre zm<#O>7R8rR))7pXZ)0%rx#9PqXhc}FOkn7*1^%W(bInY8R*5#nOnMqPmA{kE<+IL%Up-vyfkDrlsUuq`Yb#X) zocWI${j@wp`pb}W*q;EyP%!%ifcE>4p{>{8pG;gwVG>|7Cnms$-d1MrTPELB5}R}A z=wdCNBiEI=P=2)F!3=Wsvu8g)Dc7kSb_*6?4ter@(cT?RV(!3@ouE_I2 z()1Ah{@aGjy&uaL%`(q&O??zaSHFBN@Z3}3i|;%Lpa3sP!brd?*=G3j3J5zA?0ccW zh3*wU6IM{AR{W@qlaIkY%p(uG0e$k|qMMT)m75&m!uP7W{E3YmX!0{bpO^u5wCJ3Hv7jbNvSao8xYv zq_dWKU@l5qC(GrJBn!6Gywc|1-2GOD%w7wZ3p_}CH?z=K5UhsUZHZUtT7OqOb2jj+ zcc)9m2Sh~i#%s#B1fGuRfy9SWUjbz<*qLx%G$=~JKHuR~?np_tr49y-o6~Y-dM!+E z_omFF6;!m&@#9=%KUPV=F?m6)xG&Ffk3eo>JMhS0#cl9t^mtd_?Q_TE`h!G!+C)T6OSiHUubcE@yvqx=bgP*L!a(mUrEtL z$+ijPb>B0yrU^DJcFZudj|o;_y0CxxS~!m2z)*Hb!P9F`MwWkwB1&OYL(BD!&GmEdJ`xvMl?AWI_?)a;_i=&9?ND7Tqx9wYijT- z3u}z=LYy{(36KPceR9z+?mrmDDPA<_w?_jU9aC?p*W``$Jv`VI{A|OE zNwlO^V5GYO&8Up`>+i`lLOgCNQ?u`Yv3hvTETdhI26u57oSzI?){jV%n?AlvZtH;oro?qA_>@XCPX|be^;$pUCL`Tu2b^j)LF4~=R?gi!Mh_ci_0aIq4MSFV5)}q zSSF8`q~=-9J%_dmcjrj1b474P6mzk=J-^|=DF1OzYB8s_PFWn%3BmWNypW0Xg^^fs zr2<>pKPGB!EE@;xNTt?mx}){x)+aJNnaVOe6`8)N=*W##eoiHi#^ge| zddyAjP*G{^Ld>Q6Py1_s3AS<#5u$$xbw2+(<50nbP9d z335?=TGv2R9kyx6`=)Rs90!d8VY8nMT}Eo)-c%E8Nwr~3Sxal*W*Cj^mzCO&t((5Q z@u-(CF8f9+*QaYw4`-zop1C~e=Frutuc)&ep6eXJ%RIYi>cc!zZ$%>!H`R&>o}LUJ zvi`)ql>1x}y;z}0RP>J`|8EFiSW)RE{6rLjizjXvXcq`;Y;<)G-R6XJ3T2~Uu?47% zjw^uP0q*HCF6n5+=CSqiMX^Q^b^1B$UJJv9^^_H1g(;Iu*ZR3d{dPw8BGy}MTbaGN zdf_JKJ)+rnhMHFQ>*ma}wg{eTghrc9U6RVWYv9F;1_-skcBdR8l>N}ZVgKn^?5~uC zU9H#1H_WMvn|j-e6q1AHMyV!a`}^MUj|}VAt5OMuesIDb+%qxrEk`7R{4buMCaX$B zt89wO_X@l_e~e!XI%Z$Vr33<@9uR!lWaYCnc(3aU_-4>3hg1@}0{SE-!{$HWC2~IC z8A^+RF2}RA2U=zM!rvff=5147vaoQ*NNwaYqm8laCjI40KjU$VUzi$8wnv(+;%&@S<^p^z zDujhz7W*BrvpX&FS3o&F;rP;QuT>z>367na%qgAMz0CSvbi=gg)^Ja_`7vK^Q-X&w z%~ow7yaP>D?ufMByghKNMvZmxjlP`-*uD7`7vyfK_94)@b*Q^wb_D#`jpIth~hDBwj$NgNOL%V*niK{f%nML5+pM-~oW{VWT z=3$6tl2xdhZq4+c=P8=qj5-7$!notjE+7Ejej!HV9H{dxe={(7)cdH>(N&Dsalt3_(|flyvhA`Mm!og42QQf{#=CtWI*^kjq|`8pyS|5ZzYOL6FKCwMFDjKN3u zuck`)$KdBzK;k}W*pz0NWFED_*Vxh`Yt;frxC~sRhTI|3l=?zYA}kFE2#cl0;^hZ( zbr^N|kllB&nX(v{Ice(JbLbTi!74I@@q7q8zpkUZ-gFoLLF`%+)q-N? 
zwjtUm=G&BIj)Bp<$m~*_IN$Asrk6!w6NpU0~+?>?*9 zSzaDpopFM>;1{Fz&8XL>BU{EsBhx$*<_|mh|Ewt!#vR7J^w~DGqCnls_sDa|%)}3% z#WF_UiOPK3&Mhks%%28%mnO0}jci{5#N|?&;x${N_er_h{mPax?1}BQP1jUVB$1Jj zqrY?zD{!#L=>H1ca(B}=vQsX$uP7k3OEzi0+Tf);@BL+U@oqVIKE|uVSc&n%ZUWz8 zfQlI0O60TZ9jGAg<9nIA_`BYX0gWDNe~6-N{Hq!LdmpReEVUmk<*e=gcx-x*{kh+( z2cRtn!avnLXyZ!j~@hEYOy#ibNoEuEFe zZ!zxj_x@9n>%#DYoQl$Gk+SLfH%v)8eykeVwI4spzP zmacUNHvy$7>SQZkh~NyfU+^+-5e9zoSyjiR{X_#&#cmzl`>hNw z;3jCM@L>DGjm(px^P0pTHT}-@(R}d-Ac55JGEZG1ngliHVCLr9dW?^=8{M7sNk`i> z4L$927A&)oM%O0eX(uP`s7S9fyy;Lnp*a8JwUGBimj@2l7P}Sg^+#SG-9Nf0DKc5$ z*mS$6u@+7+<1x;fj%>@7oD})<#SbZ8La71|G@ZGY^e~90BY5HM5>Cn(DFa5~IT`P& zkB$X_H8OJzlJ>?HaILU&2D_hSCk8XcnRM}cnEy~@q zP7Qc=mVMamu4{JCw+6$sm1UW~N{6?dNZ?Ma8{n?;)En|tx_o2+6cu`X9}1@P&+2m! zJ{GEU8IH4G_XVdfDb*j98x;xvEUn(Y{d#9v(`8vNxjT|T`Nsxf45l_Lpa4U%w${BD zWZF4GAWTdDoZPE-t)hC7+8@?+&QVbKR~3@SE`kyYE%0O_q~>)c{@eg5Up51?;Nw=d zWfjws-ir_-ZY^!DmsupyW7qpFszI?y#<9bRiaW7F&yk z;k3~Bs8k0#!H-ZWSR(xK`G|Iov%|;Yh--8mjqcwz|LFdb>e#?BpbKLq>!78&-G1lZ zr<(lHGs6s;;|Go5&*f|r2i6KPP#2M&_7A4KL_cZG`)ltsd4|ql>8lnD@rky(nd474+c>-*{s!{M0*$f!VRQI|k}1pzX|U^QD%zLu`XToh#2=IUZ)*ZlyVX1eBsna0mzX zJ=>C6U1WtJdpZ_RXsuVRz9|zknCsPM?mua$sjYW5U5i2{EUQ0$tU?gp`-d7F<(`Zo z@9Go07p&f5M??#Ixupo6)B}Ar0)M!bKHJu?f_j90!U>W8{;tcES-tay#sliWsh_$m zVZHGP>I;&&D`2A_k6IF?SRSX~vTW{~)maPXB9$QGvin_tQ4^}AqVU_09s0pCl(c&_de39Ek*Z5O3CwHh z*0D(6huFW~QDkv5f6jH-f@rhZ-SZz~CQ?1^Fks{SDcrBYt4SLDnJ?YEUZs^qVD&lp z(H{(j*V3g1A>oC}FGaBLM}pLE)|gsz^l9voqP$5asnX+@vR750B^$l9u#*x%gii5DqUL3m-Oe1r;E9W$y5|ZwJ_jGYgt{Ta zu&GGAg7gU687J5%+U9fG@cc;t8aFI1`+Nm(mw?ZgAm^k%R#o{#!-P<~uMIj?q92woSQPTp=Ryx&9rtn{#vQj3xj&K7YeCDj?^?CHXqZ9FQO9#FPfI{NO3F;>JiHEiG+ZHC~?yVg^ z)=w~rGU_ATY7Lbn$VW#SG>rw5pXA7$1ewN5ow#KK|D-f>p3p|Uy zD&5uY7{(vdoY~_kN|EeW0^l}oy=O3`mgzhy4P+$ANcAlPvMiJe2hPy4FyV*`qou%7 zwD~)$Q2Z<@@a)5(*m_*3)N&^L-K%j0y6+Bw35HKcpkh^z5>I1+2apcpFoJDsZ9Nbr zD#Hpli{dEOoUZ$IYxMDP`{fv(fRez37VGa}h)T@h_YI-Lsg2E(56R=M%kPwWPMtBC zr3KaHOW*jR(8t3im}nLDI1@mG5<5xE`%7>AQ5O~w?M)Q;`inAg z$OG#{{T6&6<-d)C;U7vhvpkq~^wwC@jZ?2L>aehUe*U5e%kvKzWqpi= zoU@M;l%PMDg@D^Sp49X;PS`Q_mW-Dh^=!aUf@@g6H_#a=O{a@`EA+XYSRm~^+6iJ369&8cAeL@KPyo772Rj}fvqy#k_slx@HRE%#$2 zRo%Q;;(zx?bH06L*elDssmz&g6GW9?Xl`vTy&-zj)K;mugZFGA_RuX-uBn!jrG9wt zpf>a2E;+9csTMC-oeE4~m8#5aB>~rn#_|?zWre5Teix&-;+7U>02Vx$b-VVtg=kPn zUFVxnnA4kp*95Jn0|8*85`5~hS1#3Tu8#8knCmmX<7y=`^@o7$`fjUd7}>}ZRm6br zwreiyFp*%yHMgbP%suCOpYo_mTJbG)EMG3!So!>^4(~zX@gdzCT{wV~8imJWd2=s1 za&J#6@RU5L8LMN`e^8oc)Y@y9n*GBNj(HNq4+MZa!p;Ph?&HILI=I8EB?jqLx7XsfGD;enidOvESn)NSMs zV_FkEN$eULljjo+<8I?L4y81Z;B(^jC}>fO`3Lhk5)RGZzql})*gC*- zt^k>E6K88C1@m%cznN~Bofb%4{^K=8{}15!z`=jM;qlj79{9!Zxom^g?Y(!;*s-=< zF3Ek4&umq9cARx$fkTrPCZ`$C5QsW@vc&$S(h5b(tC5Ynzd=hv&K>x9SS+}>roy9W z(J8ZpN7c)RM;25D!lYV8(G&DyYFimO?=(c}X_v&yKdWVqJ(*e#=O8%lk!=GlRD8u4 zoWh575n*)J+sWoRBeTDyZ-sOtA;Gp%8%KI1uJzwVtCnYXdF&Xsy$F@rZ!Lw#*}JNH zY$oer%pc&pCe9s3X5g97v6E*)Ydh18!n%*MeU3|GZxg1cw$0*hb|Hppk#@Dyxc-&n zrMUh1{V|3T#1l3gEj87mqf*#3Zh|ReZD+A|k%a}jZ4e8reH(!-b78!sQ`axQ?KA{e zoHa_jo|$LAL>`vQU;>sBcOk^4TMe(u1HL(Bt-jwANqAVqz9hF|&ry#{+c{0|3$pWY zZf^NdDqa6$ zD?{#2X&ir7Z>x@Yl0-lchd+mqVYd>+7}mZ}&CT~fMS-$q0U_4Rq8OUD^fCxPA)QC9 zScS3m68Y==?e8T>5j=NmYf@qA2H5RF?8I(p zM*2)1yYLh#AFZ@+6*yV=?kGphe0Wb-DjZ^)3RK;-jE8)D6UdI_N^_dvu+^%DduPQ< zT(a!CGC3Dl7>?8Qo?Z)Az=5Ngj2NFddmu}nNYWd&%qp4CoD!ct7gelyild&H)k07+ z;NOK(vifLm9`u-s?1wG327+O5yaL+OP}4KzK|dA9Jh`2}R2|GhCL2qIw0@09Zd6}N zPmAon;1YH`PYF!jbGWwag_A_}VEJ)u8JZXN(+WKrhr#9d*mOMN64xz>=?ScLp%+wH z&i1gdWHL1v^L#Zyw(q9Jt47ox@?DSPW;)}>;p01A>sl_qdvy(rc~cj&xgNv))ByKP zrVd$QsZO|5snx#4I6iMQk7>28z%Vtb>;ME4Lb?3mUpQGiqes>}?@d6{ba}ByNzqvMEm~Z=V)h8tSybL&Z1U 
zMAF(f%@j)>S1CNc8IV<3b64{>C%D0RxG3*hQ@<`okuk?bXm%T}=LB(Zw1Ghrf4?*W zn5SIjGvbaJHmmhO1mCcCwL=0$oUP%a$FozUcSFATet+pL+qV(G9+B%|DHE$8UWlnx zOMN`9O4<+7y0QGtVomAYq-M9?6TeA!*sL4IH>vV9e1V9d(3tm3|St%Cq1j? zEtDt)E9#cXg6Lhl5Bk}Ul71?gM5uVu2lpsQ%X^nq#sVt}rl<=YwAzYXip8AcOf@&@ zzKaf9!ei{^lBvs@n3Ne&|9rlpKQ%s|wc5m%t}&hH=^oPVj= zZNgj>KEPs}wOC!&sxAvDOc-}gaOT~xc>#|(MLFBeI2$Kx3K1yRyClrdEw*dg>_d(# z_Ca|7$86mOwa&z%zB2o1yn+x{T=4x{DW~99wl_6X1AcYb=XV znmeF}P9kYu{USA3(;s~5RuS^8 z{|YDwl5rVA#^EFvK^QL_4Vc1gW6}A-ILYE{!D?{!Q!PHi^q9?m4Hs81T(*?yno)I} zNs%6&MW#`4-Z}=I8wsEq1Fvj0Tl-Ud41k>kI zCK@EjkGN-KaC#QS^wUu9T_Pc)4Gp*RsKA{#;#A8?I^zm7$+z!|!Z&Fr2M;9PX!a7( zohAnMaFG>`yV|(X`IqZYx_oune%;#r_5wS2{`>;|U0<2g3;l&wEP9{aiGdNp@Qe@u z0-qY;w{lMhLbSkzR{!=@)rU+=s`|eFX&Q^5jOZNP4v|PP?O&_rB8oGAw`6ZhuG}iE zp_7+53Kc0G9UWzJ=%;_rC?zr;EbC$ME(l7^kLWNW_Nj*@`e6XOli@Ng=h5sA;>n8VdMpZ#W zG)n!+Jrx2R6-GyHF&T`36J;eOm~Vy?Wg;fkcZ6Kx@ng%5B2&3{4x{JWYM6Nzc9kMX@5Tx;@CfgFDYCvtpn zdh6h|33Iz}%kqSC+NZ7}1xw;w!Sma>$2kbG=)PkA@a-8TsL$6e*Tz@r#B-G%ammR; zgbGW@e7~o9pwCV)(3tV)Z!7=kz4d1S&$9Fy5(jzcW%fp7DDzTt3-ezGKedFALw)}E zepdybOb9`L4&B)BZQ&m-tNze;D`8>jw(r|;Bn9*HBn>@zJy5O#7qxvoswK6^L5UYO zyxLe1wPzc-mY^{uG3$A^2QD??4{kN$qz(G;#X|~L=!I)JKDc`+4HQXK7b96Qn?@c_%>C9QmX!l7McqeR8+lY=U7=K?O*Su*oNjLlKdfe=;i!BVl31ui>4{vf zeOE(D-TB0N=UaRPX>g$1;-%2&?L3NdDhFvC>&VwJm%?0cb$f}qyY{baQ?vj-scx>Q z+RTvS@fJ3;%U+_37f*6&x|q0%5(MASVD~H6{2G2Q`qL=J-4Wj+@g8zKaa^}+`}lEY zK;3^xfMwqTj&A_QBxC5UYfoZk^BbpR&N!{QEYPQk7Pz!oJUdaxwlJIff9pCiJKW-Y z(9TO|e#VAfiql<~6Ho3@9c|pgdn(8XW9=qgAzhm1%+x@+vwklm=%4W@^~nVH;qy>+Di&sM zJH>Zpx5T35=0Coe)j^+>y1Z!O6V8ei%X2|;g)?m8T|#kLmrQNhIVg=aYW1>f?7cwl z?&-Z7cg1&)ZeUYg@Z=#N4VkJ$ho7I>*L`h%DzO)(ncpf`} zCxyw`-aHV>w(SERsf21;C=ydqWJ!vYFeD^tk}MG_l~5$xSTB;4Z9=6eBTGa`_H``T zLiT;%v(MOOn3=2ZMBUGQ-{14R@BO{sbKmdtJg>j5Yi6!F=bH06&g1wUzhgN=G#yiJ zs6@G&JZ2ZFx2(Q0$6gQa92Aw)!~jb5eAHJb+d5AW>7;cFRebG#ZN$IOD9VH75#`dk zTr^*AWN9Y(iaX!Ca?jSX66K3*!!mL4;jV{s_V&JX;_c0L-p83U!+9&p67&WaGpU$8N)h+ka)#f-ngDmLa~$K+KVM=d z)UwUdq#@Ejqu0ZJ+DRy{p0kfg_L);VQvT%H9qmfZ>gY;Bg~Zab;qSFr3n0EsOx1gR zGlxfUkhp=Ws`UcH{mqC0Ke;IkuAp6ZUj^_Mhn<(b#o3;XQ=Jp_>_Ebox+(D7?ZZ8^ z?b=V$Gefue-eoH}YI*0Q`OMiKRLH#&Ic}VEbEIIbfF?qK17W@91@M;!+0y@bOw6XUyxeVoZ9SkBE5DlDZ$hsekeIk@VQ^#0EwgIIH$}LZfuop z!vzMNE@9wY2@s1XfpeAY94P>tE9$-8tQ*Gfq?)SO7+&>G@KTwIns#a9hv@-ZcZMXJ zkK2`;M@!X%I`_exlRZ=8$n9or)$O$Jh>6RmLotErUk3e-W=vAM+@p*sA-y@eUvnfT zgtW72Q!l=~5!zwC4Ix%mb|RlKVYAed;k1t;7w&JgZhzj8qhRkG`-T1D;)vwqONXy- zm9%_qlsH$d9*b4t)+;=gwXr#UE9;HN`>XP{l9dpfzFO8tkLd#oI#flh|9qt6ZWP5M zPUbDg?JwNL&XN}J^A?J6k(AxTS?hZ1hHD(xCnxVupBSxl#-g6zQ3u+@ZAqJK(0XoT zwB$J2lMnPmfrArYVoD@;)4TF!u|+_|y}(4Of=@xK5#uZL2%hub5l8(?yEsOggTXRb z^D)P;cLC%sWc%^}2IQmngK!n6#KX^-_FHDy&yg> zF6c>8J3l`uZkO*vsdukzuH8GjsZwn_?iL67Ydp9s&qp~dS(5IDa>QaxnbebCXRK8i zWK-VlHY4fu)LHF@35W#e?oh?10Lx>j94)`+W{RV8k@}K1c_HR&P>`lHHq)|Dx0DY?v)PE=1pwoY2?Ni=?yoCB&1ezm}Z5ypH1dN zjrI#iZe21o^NDidt|u&ZQBK0!H;8`&W&xAb;q9{^dyBRik2M+temL+0dNIHc5Q`ma z?p|U7Qh_6Ge^NT})z>9?p<_ob<){#6s79ZK5=V49n|xzAKHpbmncT*ZUsg8Z*1UVO zP#X9RGYre9Lc!uY&sgju{j-jZ{yV01j=g+dbLiOQiGuP`sV2{dvq*{}P3&fbbMyrI z)<)?m(Ocu0=C6*A`K2ZUft!CzOx>x2zeMcHwGXSlZ*@dYQmHhtv#eCQpU>fX0>!Ae z@tsFpUMduGWpuG(qn;n0;BtBssx55!rWEa%$St&V%@EC^*Tm^s?{~y_ z%-n&G8b`_|lcu`DP|tjee4aYYCuo)S;+}|C2#k-r?9tb2S32ykeA5H9uq{^#EIgWM zZ?4tp?9jW~#oD_*-&aoE_hOUVW~=OzxVkLRp=2{u63PL18A8$h5!4+j|I2T%#q%K} zldod_v@xTG(5nRkp9WNZZzpUPYc)u9B{-aXOe=@AL6C&yhy) z3TBSe~rvNXorLmhmIW zGCjk7=u<;H)LlR@;bk#rH+{pe<$R!;`2 z?sV84{;Iq$i?D&>2yZoHZT&_trE0gP!_5VkK_^;>-^$I=y8?l{;tTwUjkYOkY}d= z!qh9tTOz&+H%W=60})t?r~wgRjPBxCY}WYEu}72J7y~!g%x#jan$ByzyPxnX-L`5} z`*_*cqxBNN(0(T302F2?=7s_&oa!6`0Vo`jeiGeTUfNY^(8%N?%^gjtIH#&w62N)w 
zF7b*YIl^KTX*r?K4!DOG_>G6&VU0*gZwZlGB|A0>4}4J?u ziVaZALq6V zo&qCS{NrD3qNkv5r99fyo|AH^)5n;R^Zjk`1eAfaVtj3ITj_)B6)pqOJn&Q|O-;wY{#8VFW% zF=4~q9MwLn`1Vo!HTz?)!gq9CwKMGd+Uw(b(ze=RHWu6|Zy!_9ODF55i-65I+V;rs zXxWV9`_i@M+#Bd>){8uWn?2sPn%EAD=@uqEDv{iB;*LXLAf*XV)K%EwLPKKkZPqAy z_Bi&8TotAp)jF*r!Lm&tTiO85vBo^v%Cidr&c#>kql9kC?qo;UjpU%r9d(EB|U`Q(Q zS4t}JdK@Mg>w{ndKs$6BQ5CDL(VVEQzEdQ&azD$}mc4sl_fK(mL+#PIHYpdFczbY+ zxDDKA?Nb?J3`V!x2Xt^aC3p05;cSgwljjqFeqr>lmqyb4`zMq?rv>}9h}Jy$kmD!0 zds{o%2DA|AXx3K*bi4`7L>2waIt6Nq(5AuGVns~;ALWiS4s}>f`&e_U*~|iS=aihdik#E5H*e+c4f zhkv#-7lA5p3v;olX7Cksb-ZSzD`=8-b*1gkO5knv{C_?V>|j$zegz zoiG1q7dk;bi@7mMgQiIO;vCK7uxPC!JLAu%@@F3pc78|1qb3_2lI-9VC|h&1sSy0+ z%b(5pb`S=OXp4=yATjq4YPc!3PVUzt^Jm}OfBg7Ar?oM0;of(IY|2Y^&`#}_MaRA@ z5T=4J+UX9k_{*K&!oA!)e(9R{vwPy7b7Y(Rj%dP>O>=d!zawg zby<*2?o(A%Ej7+#F?Y>A156q*auZe?TTnfrC}$%&U4Z0jqC}^ZTc9uCN%NP7&y-F?Rl;f z2J2}|6W2OsM}@v>^c#9`#hNn1vy6)pIv5`pq0{_O^;%1rV_zt$}9z+4H_C~3zx&g_-^pD`QI zY7R9}&Kdan2Yx?kA%4u3kqG*j2dc8kV%-zH`k;=|%IuI?ty9>eI(BPnDBCbm(?&)D;MS z`g3)y9{-^5f~vV7@7 zKl8|92aPFHQw=`o|4^f;A0K!Ly!+V~l}RX;c98fYQMk=#ew(t@(VhNpUp+gzy}ZN{ zA9bBjV8NV@Nsp!`<3@$*e<<3?=-~daOd~BVgF8VR=B2A(ChWDmiJ@46O2l_WpI|nP zr3Tm0D8c?6fqe_|-4z|8*t4pSuKSJ<)yaK*lIINQ;Wpb(mdK9!4id7_4VrcoW@c*W zJ{$0|5=DQlom8uPXm4c@Etj@)fwfg3sSiioc8x53+83UCg`%E_og$*tQC51X7Ub}2 zbT!8}?BRV^%Atzt0_bV3&q*E@&}A^SmMo!mg3xscJ+GnJ-DCHpd9uqyZ8jJK@5_4- zo^nO~<)OGqq!Ht=Hbtbbcx0XlfDJ3UqBi8-C1&u#b_$;O0%;hksOugQDoyaG;upMI zPA|NIw__WnZub&8vp~L}R}5D#-gn;#`15Pj{7nT~a9$V~Sl?%C1HGD7JigB<7tn9U zes|%?=dp*-87i<6ZrtMW4)8-2pM`Lf!let^4dNO=agCw9o*8!W6xhicbdxrBQ*zW% z{r3dmF47#KE3dhkK64PZncKt$eW-rM9!DQsf+q10KrM1(Sjwbr6fEu3`Cu=pVk@GxbbpMqvk;?c!~U5Waa4e4V#gz(0P zl+ylnV>WlbBc}8cXA|FcpHB*$YSl}p)*ghP>(kKT<>Xm!kP7I~ikyw0T8ThAwxK;2 zn3C}`JX`R)Xj-}27>FE(u8p;(4EEimHc4qf?FX6QYJ(bhE3p9&6}738k6D6@I&EkT zMSYP1okvaPM=Yq*X{CYDrFZW*(}ge=WL4a#X`z!n0R<_fS&)sd(Kk7$ z(vA}y=W6aMQ?os%CG_DPSdqnVx6z%U>a<|o#KtEy9XmSIUQn)`S;SwO$<$5k!;>vQ zIO5Z#5z=S}{W}7AmN41L2;T4-#rN)f(EcuHACzswKWCZ%8g%6B>D-fzA)6!TBF3a; zL>H{~lfNUTX48!^s0=pu$} zY&e>#53i%#qgN?t8h-G{(*yj>d4g0#3A7fM+2^vyvHDqUqfh2M&=CzIq=-Vb#zYPJ z7(|qK0VGx2iY(zc_#I(?0mNm2s7kz|2G5Sca~Wvr;fPk^Y+grDH!SZEeFo1Vy{pq= z7(*0S&9|w8|E3N2FhnU4`;@*5gx=G@qGGj`9U`zoxp$P|%G-<4 z$O_YGLkRR2+ui8M^9a9_a>zANKqa#hC4q`iI85OV&~`Nx76D_6N*?bCxux%IA06tvfo{;U z*aLKR7Ub|9ad-(Z<+2Ykld6l1r$!cI>AXIG{++ufdyJx)RU8vdtr49yEg#4PIS4`L zT(ccvg0^E0O@`l6L%t+cz*j0JjfQbn8Q~eT zl&GHck%T?qQ)cBE-H-UKIeX|5W9smHFRl32G)=eb8G*PHmapa+R#&td=}%-q1B6sW zA8CN3-bm~9X37pf7LSQR~nTgs zegKj);2-^hXMW;ligxX8d`;%EYEqdRq z#}L9z;^DbS<3&w7qOr#SwTrusztE96dOW0QCI&bXCrJ zdV3K3E;~y|G1%xuBLGlt=$XxD=A}t6D@7$sWE#pwQ@ns#W27DGwjH~Ki7cmI%su2) zYkV9y9->iX@7AKE!9=2v9NJ2wA&5S-v^>Lsig!LtuQHv0x}IPt+VSeT!c;NC={XoR zAbN9isUIO<)ru^wAP7{^Z6y2^0KBzNovXLL2QyKPqRKdGvd8ubS3wosh3EmE5^mgP z$hEnD((FYQJS&WT58xWV9?D9yzMVeNzqpGWETs6k)r)F2d20T!tPA9pcfc^)IZBeA z`V<;}t`DrNMd zyYmU&OsFDI_kg@9IwWh0@i7oHNPjT}&w-#V(8MK)J46jXqNz2gZ)RQiQ?&!sMNMxg z*W7bS7V(~Z)W!ABy7i7%1MFrjd{m!h^Ob|u$^jBQ*p7r?$W>!Tpm z`Ve|&Kd6O~7_;bsSO|tllL&gd4XO;l37cM~v9LBISsqnZOOFo7B~~vkcD!?~MaRh*imK{L-j#$`}YB0!08GDN&M2!l;-xNUO29`In z$0^tH=0aofIz})~4RQg1hdXMKK3%Z{Ns>~zOMXyFS;uY`X&?#xi;}nnxzZDvn)^y5b{mP4iXoV{2>|qUhVjC z2gB)kRUlXcr{)~Fy__x+HRhUF@bmFEfr|v5eMnF9NJ1gH%eQn!cZN)1y$>wYJ_~X@ zaE@AiwE+SPqXt)nK)GB=f|xu@ z+VKM2l_gKBvS*~RkPy`17caY~#KD8T3Gng;Y?Q;55es;{ntl;Cp7Uzr35`&8ViYA>sA6`(i{y$EQGYad$NttiQe&cUFPOo~y&0LnaAeY@Jg>brsdiks82=#AgO(`ma5 z?Le&WJEElNJHk;X_X^}J4_^<(v(aw9kw`~R$!~?t6nb|Xw!Vf?(?|=R()tGNs)HoE z67OaW&}5}64Yw!)NSr)TgXA_ewW)tm%iwb>ew&kWx<&N12_~Vt($O_d7W}1&3s#~TE_E{1^5GdbD7if)WAD!b zgr-7Yr-ShdWpN2^!;#|$muV`aU(-~+>=pcE 
z3CZt!F*`^;JcP3f4^Pr4;o%INq&tvwf;Fv!amOk}an zU6(X3HOwx^4$dV**@E~njmV^OGN0CWg!NFeWkBafXEyaE0PyF*vQ?P$7s0;&7P6`5 zY*ium0SIO<|6btjNe1=Vcqt6wnrZjcsputxH{`+JU28oAOll(%d?ci(8{USQV)uUx zZzg`mI~KKR_Zo97(HiS|Pe@T*@}cZ4s4rRII`7XSvFKb+!SVZv#O8wP=2PjY$zEyRWx>AQ1WC>jO9=59lG!Ci7sYa^Rohm;6()-W?;{ zqfoXbyf2rWk0q$j!lk8BodH0lD(PA)}1zi7X>3aM_sLTMsG$zPRs*D<(>JH`X zh7NfWSGRn}m;2KKetTYtKHOVsPQi z5hoo|EL3R>q8_GgKtq25O4z+bwP_KMg$}Lgu*3+SFHIQ2T|lvLNa)Y19l#KxS|zDX z-=HW|d!`lTqa_W!uw`BYHp;K>SN7J9(;-l6AU6QDii%mu@z&{!0LIpiwHp?8@dQqN zlhT87jX9Tub9=9e6vgozYO;L~dYU$H42h3>qF_6HJERDf$%cnoC=y91Nkc7luYacT)gxCbO! zxnXctI708Yc6$3K!0Kpi@9!|4qF8!T(_aAuB9sU{%wsVRRsbIV#qS7`raruxB9o)>CfgO-#lT=9`X!DOIm7s-uFD7Cxq4!1x((TIAl0O{9PN!5T-5FMJ7b(+=M)qKCXBP-4*^r-zCC zycB3!e!`d*UBBlM=i03ks)nMdM|t882W|%edM9ZgYM=os>Wr@dXGiWKn%+Wm$&u<4757P#nN_j|pQ9GU@&+ z=#Gy^Hl5nM2-Mj!LZh6gSuk}q&aDG{nH8-VHTuSm*ys&_(h6C|Oj0iKpKay?4z2tBBM^_G~B{G<^Y}ozf4M2!@)z^mK)OMU%+@09LPvopy3aLqkQ= ziPUspFvi2d*v5riR$TqVK+1O#DC7!H`eXyL=Tr@m2vg9;oCE@9>0bxxj ztZHh%1V9<)+gwQbQqbhkOM2wV(C3QwqfW-eK%0O$fYE@152gh4!qrQ7dj!h8%RkE= z^2Ks&i7C4_!P_wzlOQ20ggv-uCWo4jk3v!RpT}>@0eo+}B|M!0Ur)BT6|*>NNMQwr zW2Q8-@keMkwVWT9bAT=_e-+*J3^}YjFDpcDK zgX{ConW|>q_Y(M^^fT$=-ys&P3k={=l+I+&uTuzCgPi|$kaH6@B5*1^z2R0|-m7c; z-AN!c6RtrQY-DSQOuKrE;F+Q#3*(~GsZC`NVEH#-O)69QDaADS2+E2Q}NLs|&6 z<%6F+jj308td}kgWp6-zi6az2Oa%S)El`#Y#H+8y9$tF=MdN<*$3dKtALQY};*zxw zM>gNT%o^+?8d=xh1aX=haJ+sMdu$~s&a@7+0RWM?02v$_ayQs&{qzIR`N$uwSKr$DN88^zvOECsSezq=^Xy6jQua* zq_SV>))j^o^yLnGl}_H%xo}~f*WcN4cr}~r?@Y=s>-W$848$jwljQ!|Y%VsNl-xl1 z2BUhNc&hFfk;i+kUyQckcl#<-WP^LuC7DqxI)~YUbyeYx&k0(;Gc(}2?ZG=nlYVb} zof&jOj{x$U>oaP8Qzmei)Z}>Vh)l4YLn+-U8g@4z7|8$T( z2)%c)NUpwL5sJN1LU_wQCC&iA)@CQQIEr1#Q7qTzqbXrxs>Ze=Nv>sxP}@(_N9%FA z^TwTnY2CT*xOl8`F@f@ocBxk4jElBdfzZ~@00lZ?X_Q_JvA%CdmhXY=zOB1;_=@+} zm*XfbG(x;&4-?J9CULPB1wSjoVJhUso0j=>xIOhH;x9*qx zx64^dCJdVVeuZzRBjJ7*&D#nB(F+$LOM;;Edvr$kbHouZ=s7=`YdYdfAk}x_0^{#K z#{3kDs`z=4?DJFS92+eJF_V&mrx>1T(3XebxrFbq_~Oc`h+ZB-eNcs7LW^1bcy(Hq zBx+S0R#(Y?w02m@G-z1Sf*&Rm|I(qS7+%09D z2ovrH;SVY?SB2b2n9oQS1B$Y4YFe|%jZib@eqXlN$s8?+<8$H+Zojdk2HT0o#ZoBG zS>N*_dKVwnsTgZ12BHtQH|65bG^KPp)F#YkXH14^iv{7rt}^g~E=De1eJ*>QP60Z1 zJM|>IA2J8sJ^=O2qg54xZoM-MrJo%Wuc)5j7W_)Bn1<`us3o6h&y+85oC`xDi8~i} zmO9_+I?p+ntw>%VBWtvCwLP3pCub2cpW9I!ZH3h)(q2>PZaIgV&&>;G@{T$S)rC_6 zdzq~ZAbPgxlXS}`?b~p7@s4PDIUXZYjB{e&`X@J0N4T&hOmS!PAwhC55wi)vIj+Az0Z)= zoLf|*X`3;xQQ&d8XoVXvyZFnd=0epcPxlCLAFWcJ&qQaX=>N#ejeMv>y@K!5m2KuH zYsJ-`FJncS1ybXb+Xmp#MhAgUYf6{SYL~u~YTza8nmd%- zQUZQ^p{?9+hE;sJWBCVgOJ9kdRll)r`7C1MK4fE>%XXQ;XzA?s@|`3q?pXcu>V&Nd z$?C%B)cx022Jds4sFJVT@NlmBk0Lg4MwcaJ`a{=g{J&Ia(y{_}PQXmYYlnO95neu% z74i&_i#&vky0l`hRl=6a-FW{QK02qci8zMMU0_u6k%N0vMm6PLt_*4}5~I)YzV<=Z+F&^!|K zmhd@oH+jA1TvhH^J(3?1^5s6V|dU}e+CYwGmf8tO#^1XkaZ6iZqwfx0e zIrga!A`&N02v5a_XQV&9*8g1LV8Z^qO%#y?>F)7`Mk~C)mAvj@(6@0^U`^<&D4pAB z2XDS>Q#8BthS&Gr(NSkiX52GS-+ZH+MS8oXbf9YZWPsA4(g}@?0rw$45J6<|&hMUO z1ly_rpPiC55fir20!h(0O-!+R=X-Nr^Y!~@_6gL-Uu8+hM;)}P{+1~Euafxr-)18G zt7Js{Gw!` zjpsz>$NhTMtq6qrrL^v`W>RE6;h*K-{t@3e`bU{~G3X_*q%`z4fN%X$vHUXv{G(1Y zPG#vy{US7fWc<}@yb%9r4g`Nwno+kDp8VUATakvC*yUaVbyp3;y#&|)Odj~Nj~dxa zgFOcK%H>%>-6u~NO@%QKD*K_ozov+dse-R7wZ$+W}Z_ySS zd5tL(Vs?h3JJP(zq$@b+YKSZ0??)EXLgJLBO|m?q`t}y3rc~_F_+}gEmARa*w2rOW z#+S46LAcKgBUh$)tBL+@=BU!AU3=@(hf!lC-PEz@F*$B_wz6AZeQd#;+jL&v)aN+z zac-?mbPa=|MnkIeN#*qIquNdL{@Y#a%eHL)cC?{!lVk}GchjMDxsOtBN)Jb>3T*6_ zvqyU%?$-Y3-U__lRR-mMm8JRLVNfFCTdQNfN%=lc2^aGAzI@H=)R~sbNJjs%0-3nt zROAEYb3`~g&UDupMkX~fea-Q=`L`|!ANd>r_PUvzd(tnuur)^>>`>p1r?WLM3_}QL zfawF&`~~_ZF*Y}V`t{4%rP*cZgV<8z6+VZGB@+)lKX|4hETf>eGtVM~TT`2fCv%p) 
zUXh3?Jot{)WM5Xmu=%F$P=(hTQa2;3X>F(dHC>DkRn>J7FXhc|Fnr#jxGk4xF&t(@<*Urm1!^>k6t2L$!G zNH`c&Uu?6MQ4)RrSxgpcQ_*p!5d>#(Ug;uc#QMwD$8VkKr|nYISllkhbFEi@^Kf6c zSL!AUgV1ZydLK2PaB<()l8|xT9oR73Em4bY?}7a-!mgOg?)Ed&YZpIi?_zfRnJE8z zzGLf^km)WG@CLmPXe2HFMChcl(JUzR8c+Q;{@lAOv0?SY^PZMN zR|^l`Fsgj4Tvpos0Nrz9N}rBJ7py-PdaPM|5WOee2uJ2)LC^9UkbbBO(2*MSu32t< zrma-s@>4SIm?(%Js9L!u1v06qpw?URO&64Gyf5Ecu>bJwNS@dtriB_NJZ($HB?-`Wj4FD_mtIfceIuJEqrzWLX7MT;V35>m>4~>Zgk2bw;m3HLu z{V$H~E^&48a$ewv*ZDPe10Q9D9av}f#NY%%!wq-Va{?4~IL@+Be*|uyK5?lRB@^V8 z+it?-Z-RmXAGgup8BV!8z@=2sr)n-;Qop*NdXCUVUxzXYqIQSH#DwST&!EObW@a2< z3B#4q*hQBcug0k`h%OH!u<{K>#go5(W5*i9<#E;LUqQKxEM8v1YYEHaM*J9wolS@N z|N9-Ot_&x;{8!Y~6HnCT0Om;hQ^_|_rtvHJM~__pX$dAun$|aLHhwrP z&$(7hl(si-$9KeA3uRDg&Ox9Aw}Z`U(Y>39>2y{HG>jYLwx9Q8ELQgQcI zKJ|U2y96$s=3rq$FtliV3tOKh{J3nCljGX+NAf(+tJ7Vb_{cntSjY3-ufHQ=xS**} ze3bR23ygao31KgC7@Y~1_c(naHll{14iDHC%*8tKiR&Hm?{(TcKW{!&&2bMIQ>}1m zc?(|UjGa4ni9Flx7w|#u6P<}@@$qB!D19`9~-37DhgEhccG`kPr*u=Z_ElqkCP^Ioi|4~(_e)t*zny`cBeLI z?bW-AJ|@Q`tZZ;R!CjehqE|$WtwT>W=A2Y+r6=cigv~v%xhll{W>m^r!2vz2t+GMY z{uYgewdk)=$&~Mi>yzsy-7HzihdYElr}si{9B%Ni6X!m8xSy)$MUA)ZfuD0XyXPEr z@GuwOB$3>qx5uJ#W6iovjD31(# zb4LAvi6oz-;ZB|5>3u;3yA1uL#roFP&Y2X$k7~ud2DZVMxz}SV2@0iA2(UNXf8Ffj zXC(Ufyp0de3dA{Mp9nnJptlkr#Wg{~Ge)j#N z_2|R-Q@ZS*om7Zi*mkwFW6cJNb1!BPPzE(^K+=+|gdCCM&SmfI+Cd45qbPvp?GNor zK4JzxpNcQ?Dc>@VAvbs2pqoJ@1P)>GjJ)FBQrrcCztRE2V1eX%`q2Dths6s_c5#%D zv$coD?@S$%%g%eV=fS$cmgIFz+|Ra`!#P^H*lIVJ%XINYDwDUgp4IWJ%UXRLyJ6C+ zGlGWHu&dXUN9uBe|8OGFYTJcByS2i9HU3|e>7<>Pms&3zm6My{*VVQ+bHM*2B91-Z z=?u{zs$Z{jTU}d^UdyRlj+v}CkeTVMG&OS4NVRMjk4?WJ8-}ECIlWf;L?m;BN4aVc zj{HGtdmwrHShLTf*4;dOoD&~OKX*PpQmy47Nz+$Sh!QLo7qPi?Rwh|@u3xz~A--pdYs4Y(QG#)x>WgWCNh7U=z9+BOZ$$fwb;pc4 z+{%5iD)s*sIe&kR)bAd8)&GX^Qsd<8vs@h;R=p=8ZOpKet(M@~`Ky-g_-p#y_Y6cS z>TfCGmo4^weG`)@Twj=ZnX9orlguHTnXpZbw~z23Pfd2zS*cuh>X~AGX_yk1XO945 zmp~!|nuBI>)hVuXSIzOUVcoN!CGcD1;3=9G*ozsSQmgIT=)k4N$%lTw&sApbmAa}J zgIqZuah+9D=Y#$Dly6j+jBM?ZcP_6a&Nml#?aqCscsJ4HQyF$cx=IFpy;;eusC#hplYP6=AKPfB z1`Lx!xX&dYmT3t%hOZ{Z>vg*xURc{Q_Voaxn>6A*`T~{)+7cS}Gy8lzO*+6_uG5tsd3o{BLhRjP%iW;YXUJ@c0W`@_>Oqi3YECZ40*kM+tJZQ zfz+~daTVF2O$QICMcHQW5k73cJFwXB4@v*3u3oINpOsB+^*sW6whKQ5no?mp-kXqTex4iy$sqCh7KW&%-L zWG`u|_;P6wkLP_Kha)BzZ@66bLex_tq(-{goP->2o$9_Rou7NzsS@E5ma(Xv-0VdW z?Ik~+vtTdo!*Pgv)nq*9edQLQjI!HvFBW;c|DBlEeC$80BK>`J>1svqKd!j_{apRC zU;iZ)y@x9}jvv{>+l7-7UHFD4*CcW^!5a)l+u(@z2U0C|b#|&AS76=twk7Q--eD+5 zh4>V!ryQu!kfUgxbg(TZZCqDLhFxh=4dJ|fSn)eTW;?vdW?u_cp9bCLJ)CNL@ef`0 zk+`4Pm6m@@d1~%EsnRjl;=vf*%%18<!eJ5V>GVRsbZOn*1uVgmz^WkI?wou4LE;~cY z8s^ybUK^MBp7UeM;70Ghz@(yhUkdfPng#n{LPSJjPtc)z{#W+;B+afRV$!ViZC1i? 
zn%vqt^(iM0(5;<$UQV2GEJ3R$6iHoFAe1)R(2^xzu$MIU-q==4a+%}&FI~l zBB(yxLz}qNRCkSOW2=X%k1HK#&59)&dgK zBawu!;`H%p_Zr$aF0?#ttWC&|;@jijueR=*N0ZE{MxPg7J2xu}Iy2-CH?Sba*#s1s zeKY#G#Z(Ptbe!Z#=|z-!l_u+Y@hvoe*3_#7%5tAJb3#&?mZWyQ97A5UT9Jqh7q%BB z=bfJ!1tV%G(MNP?7l(3Z{mYY2y)<$^GK8RG48i+uNBP5Up`-nsrM=M#@veAc)MFUoOn_Zp@XJf6gFFgqpKa69%8 z=b^|W2D&HGUX>3L8MM$Blx4%3F$t&Ac~A{u*PaJ7UE*zd%RYcl9Z-$}f;1$3S z)(H7L=)Ay1)YaeC%<@8!~QJ}bt8~kq1?&`PuE(na}X?DK3lpvMISM>0JaH|P1Q=YSDUvG zQa?H^X{|3Ds&X{{Zfj3VksA8_TTb@>#^vmv3jS$z?ZMK3l$% zL#tn2ov>9QSzQ=QYUy8C8T{C#L~==UnOYpos~Q|I2ATXS9|eA`?4p0KLm3Rbl9sip za{y*aWT%|=N)IW_vhH)zn%eF5B(Oq>k)xm$WNoQXJ}H`1JG@g&Y|{X+&$W2~8kdmGG!d_ zF_?$cgl^cg(dWToDXXVQGm=c34pv($>psHks}{|6H}Y#v(L9uLqdv8wZgcrfuRYE+ zV(3MV4~aXR6!IwEYp@oDi)!6L_rBaRW^_aVTfSHOIA36UI&_(+{m6(I1orLHcV6TGn}@m*oNqP5Fm3g}0@!T51q;mny}$Afz>@8~;Fuu*00RUaPr zj1OQhEWfnCqJi5|`^U+3PDxMl5t7tc6^m##hO)8HUG?!UBDX`iY9nMdlhRdQzNxIr zC>t-$Y=ib1Dm~q>$H%=bhtM13HYX&bG+gYme-k~?E~Au-{;kM)$U;d*ti}1B%!zWH zR@ZXGQvt>biRuS6sq$~M6HbtY&KfMf-{F$9(DI#PO)pE8TGOHUO&mv zc5S0i@!<%5D=|JlDND5!3hCOem#Mxii*18gn}=~5=^XFb{Szqt`jik(upO^Q%9Gxd zmu{BQo&lNPKHu{mE9Z|+LIn0~dgyBtZ+b3r&zT+#KW4rp+DF10M-ZJx7afDQ5eM_0 zGiVrHt3aljZ+Np@(8$Nn5p#P zbK$P23J#n*8+Owz_L{bQObo|Fh4+-A3Tmrc{sEGJpkMF%Ji2}9WYtra}840TQl**H*2Dvge~ zeE50%e%}K^Uz-kbdiI2Erz~oxH-|VKqulD+@qEtN;B<(V{~1k<%xg@EGA(4ZdX%1ohj-+s*6q z7p2{~Gk0AbfDVn%dZs(kZFU>R=$hR<$3c7cT5WHqbMlc+ zI>fw0)^?!EV&z3;NtwXSt7nS%M!oX^?M z^WNxMyBlC!l20TII}b0xq{;7+pYI}~pB^*%WQc}jSqV^bm2EcCvlB(_H?>MUak`l6 z$zp;aaSFH#I`U9)b$^kw(;5C4&s?~FLjv4?tOG8*mp5)od9_3G&aOU#V@pqQp7%LV z>B>@->z4R-lgg&s6K0n_d}v@7{~#{fbvz@uRLIj66)IcUGpcH7dXXqrSbKP6#>DID zwWJx3!D=Y|BqZ=T)5)?&=Tu9rc)5pcm8J>v`xyf_u|6xAzgFYyPeppRU;C*3?d|>; zX*W&k^0Hy)XM=GuY3I7~7g-#AQZfn{(6e5y4g!J5eX|SPGu+FGL=XHWy#iDh7bQvZ zJl%^QyM)U17|jyAOPW&U0;l-gl1TI76d0V|p86;;`1&1ccO!#CevZuDEHb6xDxv=@D|cEROW zoLs1|Pd02#!CxY#N4;Zvpgz{a7qNoqoK!=LmBst67o#Zdx@6hBz3dUj5Qf=@B7W#E z{3`~~pUj{Ce&$cXb@cmW@$GjC25W=> z91j_wAnN4l1JKkW1PM<8(lVkBKXg!41n>VN_Q*x zPI|Csx1|jOli8^4=VZr-q4$PAP_BQ*r2QTu%>7=dt+ep*&uU=&nN;!59^U`>e}4!a znTUT1!Duml%38CZbaqXKX5j3oeZ>c{Fz5UbTF(XMIMeiPvZ7|%hawcqM%Sh@14>b! 
z8Zi$Ld=xCAPq)tV3S-|Lsm=DeCZ;v@O0+>l)Jnwf8tu?o#obw$6ka^1WgfkkQ~E&w z!yh7*bVlCg0j%Z(_>BxIWCMV015ID+mwCoUoV_*vAdLYp3*p$Yp>J^K4os8H&|BP@ zfy1M_Rjc?B;FBb2J>2%{>kynJ?GbX zni)^+D@)l~KH?A#7SbFYVNnx5A>2~y(f(K2{{JkY{ZA!;|8^w+G5IN{0wg?Sm^zqA z!1{KOj_;u9=}C{(J9~q<8NnsOe27*4{@nJYcD!A}RU!nUTrc!J6LoKmernmP%v75W z^z1DrRy+`KAl{&@Be1$V6U1LF=Lhkcy8i%HbseGyYG3yg43`iP$jxjXWA*5e=Ll|k zwA)c{EDPnFj6AZ4KiNAIGA>t94L|M?XK=1XlE_3{I6vD*3_6wOr;biW+HmtE+^`5W zE!&gdG8im{ChB^M_UTVqqsz4rW${PDQ1x_W(Ai->xKv~l6p+98m6fpmG;;p?jhvL< zUh3*kf9}8Cp98dre=Fefub`~(Z2;g+7|I95x-k>cb`~h<%!S-}ktPpbNgPm_51^Cv zc)_Y(5mVH1%gg$*%4PiM8U?Q@I#u`pv^?0+*H%nS={?{-?HzXj^20Et*;nXA8N_m7 z4?tQvAjc0CdRDIGJdIjhY zo~pgiddi3w4=C>X8;8uDY*|YOG>e>zjY=8Jm=No8@XjrvY_%NK>ChV`>atGp{}zRg zU#X+=UwN-TMxEq$Bt!jXXNAnIM|R{QC%GJLwuZ$3FYFUzVE-uyrr$fHIiD{4zEzZJ zKb>?ga$O*dwnjjt(E%-lPsvnHnUH?Se2U2F0Z@HKE6aD3^Uxn=ad1CW^%ysN*}Q{KKt`Havum_0u#*5J-FiUnDXmkLEMbBQzhC5&ec zCWeGC;I)3dD#m)9AG5}kfwII+^l-{dk;qR;qTmiu7%jBjm5c+Bme>O?g87ZytQtYu zIoctwM6*nlWTW>Vvs)pb?NF)9m1SUb34q*SN$^rDSGBxPZeW0d-3K7voRo@giR8QX zMsBq)g>Y^;_UM2`p(y63URSM^JY@szo|hOHPaJ@V)-UTC>Kl>qIc;f+Nn9-Rz7(O* zImdw&@dwz%OKYd$()KWBqdbO25`DBYB^E!Q)uKVB2Y-J~HzkC#@XuOuDZeeQ^T)(u zeqqoXu!cx*p(*2dym>2zJ%+yV+^iDC=jg@y$E> zN86J=+BI>*%84)c5VW=~ep6$o3Y;>gwirKCtRJqo{D~)uflW~H0(cDEH|a5W6anUA zb|_gC2?~V3X2v4Zzbs2K70ceYU}EmUs%DU^ax9vMit3n>hA+I^AhQCDSo@?)=u#wt zY+Ui4p2u=U$IzL6DNPCgU_!{v0sgp_NOD!7h_wcM|H+nke|w1vVXW_Ka?O++OH6oD z&K;_4|0dUSy<{IQGCYQKS{46NF8+5*SrLH3D5A73_i|7mLoV2yrdb9Ix zvls9`oEDZE=Wga2XYj2IFqc_qw9lq=^WKoqxaaqI5?|WaZDjZ$@EFu$1Hp=8&QCH8 z3e?9W+F@^ULFQ{Dy&UjVW{f~<@IAj#6MSr6zh%k3BA{=tpDgj(1b?^U0953l2^q64 zo!Pss_3b|7CfbHyOo1*3?x6))MtM4%B!HUo$u_5e}ruh`bRDw1K}!i{{-6 zNfr#uk9jdJOf+~rGS{327C~(N3D$`98=ADjUUz^EZddj`KG7u29qmbyQ@Ef1Bw(>SHjNh$ehEs)=v z(Wu!rCL1ol2Pm6qI^JZodz`YjxR)4v)p-%C4vTjQk-hH_Ya*Y?S33S`NBHOX(XaJf z{-0|1hlwWqmDx-9617u z7K)|1G-*6;gf!`k?RzA2qnDXkeOCJ@1N)dYwuUa?A6M}NsOQ2?@yQwm9GM68TC0akDjvB0j&F`$h9syJ3HTeU;=^>n7*1wMq zMpbQyE5vhMVTswjv3@sbRcq&15bKil7)uY|&;v5R#hpC!a=WK@4PR_EUo)aKbwQAO z3mVm>k!nM4D~R<7Q7E}`didCrcVVD4irI_ZI`q4BW%}aPHJcnOE%_@`K4Op?%*Zdz zLr(hUchZd&3RhUg6bK?w+UL!CB}!M4II1fO*CUEttrRK(K0?C-wM?G)p`8m7Jr$>f zuu>=ms~wM;$kK+gd74XVnPliFcynFKbD_E5saUOPXO54J=l%NVKq!c>`>B^6r{3g@ z)ib`I(cn_w%clQC^TCANF{y4sZ)<|q%|m%mj2Fg;D`9D|{lf5GZs7&{1>e;$Xb63l z*oA5RAPI&g%B3WSi~9WK*I$EhqmgyTz1N$02lF4G%ioVnRQLMBgG%E+fauO$5e2*ka{)Tm$LVrp|XN|8pK@T$pYAd^tL_IV|FzyK3d(fTvv^@vei z_DPzO-onO`7qp|0ClNwh+a0RLzg%;T63F@eY@3gZXu?bnKz5RqcuV_7(b?qg%VK5pUR>G70( zFpdw$e)-t%y4-q%a8efz9Bl1F8|0^91{O8xo?WTgD6e*N+?Qv9z<06)hhJTl7& zCer=0n{cqwsOVWOw`G1*cMUD-Y)9@Z3WwF0f1dF!m9{%&z;jJ0LY(}gb-~Y0;cUx` zF|ophLasn2y@?FPyf1>S@EIot^!(#Tnd2eDVe{KmEPDN0tj8rMU7S0CWx5*n?J1_j zDLr=B?+2WrN4y3+7qjFIuDF7E&~_UlkP=gIPK}%M7f|OH`4Llr7&^qMQd2=}*=H{X z)9pnK6{J49u$)`xK9I}P0dJrz5ZGnIcf5f$$4NK_j&X!yw(flL$tebQkgURJme49e zZKiC;Tlu#P={1XOCd^|Zr-Ih-nxD6Aq=(8%avl2b(c(S(<%7gu&NlTtGu|G5x6yIm zT|*VK_sW^7vvEppQWYez087pj^)gq16Xeq|FJ6SHM;V1POdo?HGvo5%@t-sZg91?d zrQF>bG7QutFKlm1;p>SoeeiF?%AF$dUVi+?-dk;d{WduApJ=7zc}3zAyq|shG$hat zDcyl!EHUceKAnCseq_6-EOLy$T^RiBi(2x}%Y2=)e?FT($IYLU;isMJ-!~b45_Zoqrwt!*XG=-D^DCnT z|IFqlO2S$cd2|LkwuUH{Rzy4`N>_s8PUU$rBb2`cV61YE>@+K|Fg+1wJ^(Rt?hW37 znc8J7?w%7=U@5N%PmS`@45M3LrZ=YbWa(d)cP2cTdcEVJ+C z@UDL~G%wuYC`pTFTel6J+Sd%!D!(+{87n6LAPPsGQPi4G2lh!gV-Ogp*~jvPVbAn# z%!)p^r?kd=YT_K)x`}XKv~e|Sk?cs(z2Fbf51nn5Ew(9@)sew?{(&h0cWS@2EBOXC zoAE}5N!aX6OcEO*iGB~4T^*kV9^9SRBPuS|NXg;_lD_YmG#?Y~F_`PATwBqLXS5bP zm#b~eZaP{k>8feh9exj>0ecAs9BafpiZ@xu%ve6i4u;BR@p=B-@|7QcP z4gi_$f+9sAX-1ygtvI5Hy0cc)YEo%+S#Ol~gM(?ii{6<)FVQnFM0WZ?w^f72`BD7O 
z63jx8%GIo|ZrWXiMm=LXMU(B1NqRp|iO?_IM9N4y8f8YK!{zAA%sXdsa zhKR?{sd65{=FHh%cQ?5;SkAt3?Ga1TD{)>!+>fyC*RZWbtkFMaDmZ-2QvNBXUh)*c zW^ewc2z@-uwVT(tET(bE30_p!L!ygiR5F~*Rgyn{>fxo`mIK^=o2o{8^5QV>lqzD* zj=X0kIi8w5TfG}1S<}{ymR^{|Cke8+%F&BgB2a*y{q`c|?Y=AemI+dk$fxGOBvAp- z$l--taT>FlMIfWP*z>d1+faP9&XO+tdZ`X8&a=ZEeCB42$`Qe;L@l)QKD*FE6nrNR z%b{ChO^cX~Xj}>e(;1AbIP3H)WOaSnC{2kmY;GE+r z=BBcaYUmK8+jEwu?vc;>(VkKkScL-GU0U!P*Y1vWE$=aS5_}y(nUroE6gY%P&XbiQ zTLnzvrHt^%^heli_budYN~mAehVUh7YD-Sw4QMO;j@ZO+jb;3=Z4VGue?BPF-sQ-i z)%d{~{FVc%u30v*Jcvqr_iron@Xxs6-~abV^YhthwE}34h@W_q5(7Q*Y*;01!u^=W z5}aoPm{qpCZM<*yh1zY^4fx;IFxo{V>{;$H@}U59>AE?)fS(67N7nfR(0xSnYf7tg ztj98fPPHd`*Ff(}9Rhbo_s}lSj0Kp|UITf!^5|C%?U(zrWfd8kBra6lV$qoa-=ss3 z2j#(oO`~~NH;JnS;_pKy%5P#ahha(pes=!Bi*zT6FhSzToOv1;vR&oT7xV$>1up-1 z!Fy_I9890?>e9KG15i$dwAaacY`AW|o*ntPeA|5%rWT^@?9YAQalW%Ko4c8?S~cr- z8=Pn3(pf9m@r((aci?^5hh59NQS19G;8jv^PeJ39kbZEolSH;F@k%P(B>M|DbPkbr zM}xab<_(T4nW1V^}G}2BL4W3awRbVQp*sq$3Ep^s4bmctG^m?xH zLyfS==GoTiE^K-he?+p`$dlU{`|7|Z$^j`Y+JcN*cn)q2PJuNVHStuc2#ma`>_c^H z=z1cppzlyK46_YvoGu9yC>b1?6QEm7rNT054D4*2@tU7FTWq|@Fq{&MR?T*M)`b`$ z0B+JNU2oKc=vdz)< z*CyW0tc%aoD?^f?uwO^<3R?k3!6_tbp6L5M$foFp1nr3EV&N0p*87enA6vkU=Q}s& z3S_3_8B}aD#!f+G^m}B^HlC4-0Bps~_7t732B=I(*Rqxg`{}k8V1bC z0{NW?^FX}#srzHFcNJdHlTfttm1Q7)l~S?Q=S(6ueE+E_8bI}&UBDD=fJF+aO4RQA zO3ny=zgGZ6p)Rt<4(K(vEiB|#{4i9Rw{lR1QV>j56wc2P2qD{kXJ;(^U}ref`2UV^ z@xPX3p?3g^Bi~%s`pLb&Net}ESAQ%X&qr`T1xG4n8+;Gn^#BX+%{tZ9mcoj=%X}^y9)%M#j<%(Y7b)HCM0_O5S&XzWn{gySSW{7uY50yE& zZ;44evUHiOo~eTu6O#PZr@3~Xu{-B4+>FT71CZ#eXPY9Zv5CIv=U>#`ir$Q2-RBm< zh8qMnK~FXnvH10QrdP?SHIfoA1!T3SeQw{)>sQV8M0~77ymjbRH>yTn7-8AilA{Z);91JNZ53=nIJ||4n zOSF~8jylJLElruk16czQ-|HWj^wup?Gg69>D5_!}u)Vl7Cx3S=3xWG3^?^V!vif+G95()Y_xkf(YgIUU$T1zG<>;1+Q`$uxj*4^jO%LPh`skteI$*7A=m&@<0 z1wxW@)|Ym8of6r%yg7SQ6HJypsi-?u+h(%-3RI^;v7h>GbLQmFnl$z?^bESY_A}J~ z++_y>(Ldbx|Bc6gO#0yOa6JDN-Slsa(o*ec;5oDV{Ej>Q{|W|a^6Ke#;>*rDEw%E2T$>R6o%uaM<90F}~qZe+aAooKMXKTc#06dPO~G*tie!{)4DlDQpGq1OQ! 
zj72bK>?`s1l=8JHcq$HC!`_o`0puL3X5$}@O}v2LeV#GxumjNBiD}Kt>!W@z%(ZtM zLJWomz)smb%iZ`p4~`k3fVgg#JlDhgg)sy7tag!#-JWd{v(XFQkPu$@tTIy$%+@9T z#q^;8=-F!^xmbNS^CM*NmG5lga8Ckkux)uEA8@#YrmUSM790TLqVou?_hyOtS3c2} z#XJqih^F$OYTIm_tBqRhRTl^rIR})*OC{!D21t!xL z2pEWE)ji8!**xe}ix9vYUCC^gBYPBs99G;^%VL87I}59UqhhI5)NOg5L+|E@ zY=qR>RhZHt_C67O*R3;i&)@i^N-_M)NV=CMK!pYPQ2OQ9loq_m+c+fh67L*<0@Z&G zzB54q@SPgj_W*z^p!V@wtVG*6TAu($Rmt4-XmV3OOuu1*f28FUVeesgAY~3tknv3Z zxiTDd3akdOX!#|8opei-gEKbJxHdA79rzKJ_Fwyhw7u-xJ9f)DKf>#APEF+@QWz-5 ze|_|Z0CY=WA=`ebOv7u$pQyXecwW1n^w8Xd<|vhlWN*#DQ!86U9Q{gS{Q-aL_AiF` zA2rqfFHI$B-eb3$CW$i#Bie&fI|mcI7B7;Xny0$-_NcGJ=)JCEuo8mUmf&TjhjK?k(u~z#{-ZU0@VvW z06l)G^zQbLhG2`z15lDNdNHR&qws9q_jd9>UD0awtPZj9_cp1k)R4_j0E>a&C4}o3 z&u{6B^7m8uyi(@zk6_d3`Q5TUZ+`p?iSKuYTBQ8_?x~xwe?K+y`BxD^RA>SkaZY%cLyj(b|wLK1t=D_IAyJYzSm)9cWN}_ znj(YW2EW>f)j9bKmm|kcw#;!BrPNf_hz8gtJWDjr17K@B>pfnjEImuUhxW^H-S!=C z$>ojLbJo#OuftrEmn}EUR&wz=Y-^`}8IQQ`3%W23;V&uSEOoTNMR z6W`y)2CM0+T0^A!ExB328?tr}Kng1s`Epmx@PvNcSeK{3k1v2CQLL6gr_VyV24$Hp z)c5tM9Kzp~&o(}yMl`%bIs^5K;v)H)0ah*0VW{bIVW^XBzPRRGt(8y$KO&G1N82Kz z#g28b@H(rmAODA8-~X3z>5%e?B`9tw3A{~l0IK33`LfN4-&Q5Z1Ra3J&Q|$g*iteN zH(DatF@aauv%`{r@Anso{fFC>t@nPh*GYMQ8}!oJ@-k?57>I`|Nm5re$EO$S5vlzl+ zWBsj3vB@=-=}{ACKH9NS59VL8$K}-jRE8zpY!mD-D#_>_!X|0mEqnG~La%R_iqe%c z1>j7n7P~pG29`zQkBiJYCR7{oCc{3!`XJ+gezRnOS(*=Evzz=g_m)a~U!!-cVn9V2 zGu+Lvs$CIlO6oMw5;#R&!d{m&Ja{?T4N6I z>LDe(HzBBnP3~p;U~Oxc-&^(Fm5jJ%<&!(t`?{*-*;_zJN&Kl~bFBNp1$C>a2P zY$U8!e;s67WXsKT-WL|~0@z>y@{7=^>T)&AGd6+Ew z=rSCYu56>oRilpF59fhlOB?UYXUT!4N6i8D&AO=e6sfMhw;C<$U-o z3cx^ob|~ur)D`Ipc=A^cK+RciJ?;B0hNs2UL)k*E>*l{8K<{Z1tY|{10R54c9=AoF z84;miWD{N&*-?F2b)3)T>Wrx@w{0<2SB;PupfWnN&xg0)HHJUIZt(+@y@ISEq2 zbpV=Q0`ISA6aA&v@;^J=c@FI4k^rR(h;jpWR{^9#Ng#WeK>Yq$*pFY4OOBSegCM@% zI`U_St6ZIU-viK9Og<&v8(5-;&%rjg$#FYu_^x80E19q_MLOX1h;f)VcpqS4`v2AO zur3@&h`+OKz>39*oUJX5dicfUkmGT6|JhQ&ISF3zIERPM6| zWqh!y>A0!Fp*g&Bdk#PcYbl>`cHl3$xf=A{sqR~G9?llRaA6i(dH6XF8*$(rPL926f$$W4lsb`03BmXMO2-+D)Hy0 zv?m_#^e0Yy#yeG!zU$$_TB5GJ=`$O+*Xz_it0t$b373_WTXBz3&b3V9Lq8~5dR{e? 
z5RsUFPWEY$I{XVWrH}hP@TZWiFGojOY$mB2{NO6qRR%_Y5oQ}F*<0^4_;O?^{ z0UtFw@1A`MJ3^4sK;6i6z8odtYm=>?Uw^(SDkg?Lf*5e_J{Pc~3PSn4h^7+ZW>q!M z$1jDwiSC#ZG^KgRaf-Z6Di7{kENUT=#sa^>9UHf#35)~!HJmw5(G;+2&e)#Aw;nvdUSfLKnyMJm}B+aJv}39Nia^KC-)lHCU5prK>-3G@FT18P*2qz_~QXVf=!8( zBK@LH(YP=8O!;X;yT)*yea?68_ik21Tj0C!vmHpxX$Y~`4J}ka@YJfERsbWluA$(~ z(?G%>l}n7HIlbK4OAlXzQ8=-uJY9Sq*7D(?q1=~i!UrHXVaOW;R7PZ}h3Abrn)TAE zPnn5a(Xpg56p3tN_QEXd5aP5XT6B(+*auH}J3=NUz9v&VTAH1-auRaRS`8gN-N3|Z zcO7&7CNGn2A@5^Y5Y?XV^0HxE7;C~{eO1%WMvaENB{NG@k@jsOHF+5$!HnT$I^$-Q z>tt4Y+}i{fiQ;JrIn*Za`M86XcF2L8$JTJbh#yRZW5`a($Dkol6gal(F2@u_2dwl(qssTAQ+(@(F?Y>5@MtD`Rwvp(Uu z40;r8e3kX{MHBCg&HR1c_|+|OX^RhM$mhY}3(be;O(2|M7Pr4QF zHg86ydx1z{1&c|C~?+ZV%Aev93}V3cewqzv*!1b?BXa9apktWB@T&6bfiQtw@$5D2nd<*hazAa05rL)wvRmb6BP4s6o>nkT8HY+Ja!5?8_mk`(e>1mhn$~0*|X4} zn(6K?B)DXVQM}VrZdbC@P1}q7=v%eLNWi#vq*icB5F4{6deRte+*H#dP<{3+AF(o0%w!y-bx+SK7Zk75nh?oc6HDt2ZyMGS=sBA`d{JEumYK zV%b?sC1$%mW6aA* zUwbn^A=9E`ir#!;30~P_h1x+A^`tA_~Jxj>e zlINC+Os};zpCTFTE3uC8kAG3A-mDbU%FN8!%P4OB*~l%m+&VDo28aKrd7p|7$HkEK zFI0j}VCu#7p(Q37ogQ}2ev1nxL-jNt99tl-^Ou`cADONj4mi234XOmt##TL+P9@w+ zI31x|Izfe18sN665qBCk3%9lJJw4=7X%D)yh^(i#8guW)F zlj#6tqI}vrFk~J}mWS?DUf&CC#2l-gec9Gw<`y6936hhEc)^$h?D9_#Q;|60=Lj@8 z>j8-2wb=ovF_Nx*>Lu-}Gu6p#-P4yERyV2BS6qZ?m=~8r?3xPbKOt2_=&i{zG^#tF zE3^j7YqvNi5lt_K%+)bj;91JieqK3CRX3ljkH6>+{WYd2g*m;32lTYcuo%SJqZ47fzB2!0#kNd8S7uDD-se zk)0HU6$!(}665O(dqq#>SrHO7+nrXJ%QY{j6DeNtDpZ_*l*9X4d|i_g>rZAul)4~t#&%PUT69do@Ys@|>$z?&yutO3j^N;Dq(|(p3KhuxmB+Tx8DPAj zC8mg9ZNW$C9rfUUB=9oV>Y=AoRqup)wKbLHwFl;=G)=l-Zim5y?F7^-PW-JiUvP`C zEiC~eR}{zETq6>n;b`t~rDk!~<~cRnr+766iq^>}BS)kMkaa$y5U5zx&stJeLblT` zr>mP58y`$eFB(BjNXs<;xQSgtH1+6SOA2u7Hryn%)}5V>9anjk#^3UjO|*icu_ zWN;}(f@#nk{iUyXn_Q`jq&C|sO(eJ3l#er&E{q>X>A6)mesEVetqQKm2^xHDFA{n8 z6ztk?6)iGiZd~V*TK7kP2Byg{Iuyb;x@uxsjqpXAgJ7A|9h}9G?<(LItNq{tY614i zZNwL?>>Mxr8`yZ-&Jnk7zG?4;1CTm+KLvi6uKwFHfPMpjfBY?H{-9c8S6IJri3e@f zyVIEDmP2N-x=w#%PcEPwf=NMY+oS2_&W_V-vWw*M zwBja)(hoD3C4a3!8_y`a2dM;i4tRmU}{0gd6@54DnkN8UI7V8qw&uybrI@ zB)6-4KN={coOt1Utm7opv~m1MG6m%+<&e!<}FaOYxcQJjsvZlU$P zjaL3^d7}n5{Bo#{o)+JUK~I&JKu#@6(3eCHDwgewU00mDz8ALlfFQAm^)*$7|90x@ zPIj~nibOe5Do}QSyQgERji!5ERJ~X1q1#cX`YkHfCcB^ng>_sq zAvdP-Y1TGqoxF}$S&a?Ly}p8HT7OJZ(dG($NFPiQ$aaE26Hh2H#NT3t*sy;N{~`}O zgq-24b*14vPfCQb(g|D7Jxck`=i;ryOl-j1E?Yb4$7`W=B@qW8)ym?RIKw6tN*v8; z*ektn&s_dv;fG8b`Ua56r>qMji)?LfIQdy)Wh09&1GJS5mE>vhtTF)NI5AS2& zohKm95I;<3kMG(H!`meufIJh|MylQrB5Z4<{!*P? 
zk71O?>hI}f&dAdF-%!gYG7-^cMEH90Tj#@UWHOHRi4DAL-%Jc&O?>sHY*#!dxdGGV zv;Ki+dfm18Q}vjxu8l42XXSFy+ft3>YUD&r0kz!wQ{?^+9waQ5=IHv+Qq-T18X2*e z@fHHGq((Zy;41@++U=|-cfpqoCBlTkN)3f*D3XLEHqmlbl>KAyEO#oNunR8?OuQav z&RX;7P zvXvJ~)3)8d8-C>(rIyAzdP~~8Lxvggb|N%0z~Iy9yZ4Yy^3iU+>fRY!fXS(fBjD`5 z6wK;B_0(4WcCNV8Wd9Y)MvQUb1HDeA@_q4Yg?sFCyx1ntEuEOp+Rv#65-5;Nr~6?@ zU39N{cWEAK=$7PWS@iLfL#*HiDdCo_0_A@1i)hoEy-ADC`qmjF&mNCUuFV&|TioW) z%SRi_qF6}I<=l%|*^i9Qhs!bG89C{d(WXdoj#b-~-nO^NA8BW}&*I3zFVc8F;Tbwz zol*Dm?NYCe-MXA`gPDfwO_j^C(Z23!H)WYJ)X{_$0S{;PlUD>2=7nh>?4mHGAz&U$ z3KRhvKH-ptvVlR7xPT}}n3J^4chwMru9E;zEQjZz6SJQfD~GfEDkAWeBh3O$#jThz;I>+W4V zu{x?C$mAb1Nia}|Cw23l>Qt#2Sc{4+P*89^r9>J%A+5S-NCt0!pU^c3Zcgf@64pvG za|Q>>py#SDg;N|kPn>-fG)W-IB#shQuIsANy)&(R;$~Fyhxuu=p&NZIAvd+e_*Ata z^OT_fw)Ksd``Y6@(h_dAiE^!8!jt&pRh1V*EQW=(j0BJA>Kr+-b+Kaki@(w7+W0s4 zj;p*3QAcw_fhjsqxPBId)0i^%=c|a1$YSjEZF2EzyMm5<=FOr#SDfBb;O6~B)v?re zl8q*^N?{UDM6{mkC5p^=;`HHY)S`O(`Gvt#_i;4STls|?zCLd0%uO`}Q>E)y#=Zo2 zDn(kP++Ae9Eqx8dFpa)Na1^GuHGnGjAKspQ>k2|TEdwx4gN%1e8=N4heaB>sp9({vI- z-AENB|K0(ezCn4&(Zp5n|X1?Q_rxI!yi@c;#DEWR%;R* zzQBYV)Skx!rqserZX|)5qXG)tuzGz$yEp<#HOHoS;ptv@ci_c{*LtYPrdO0VyVfXG z`p(xN60SeZs}4M&&^*WN8&R=j6<3cSSADZUP4}wLOql<;23_I&Cxb~m3;Z3Sz8}~z;5{lykUf!=i9m=8^*|U;-Vu6PQIcgj>H(j>9Q+il zb0k^TvU|4yg-<1~94$NmWk%w+#5q(ilJ6}ZfFg;Zhe}wjet37- zbVKC#Kf3Bd}0HR7p-&;I7 z4eM0Jx~ba(I$t$EHP`?cSV?g7ddeiAwe-Egce6_tNR8L~-XisY%T29|v$bat! z^`9O6Pd?FFtK4B{z)tcW<6SyVzNnYk%6FrZgC@&X8z0f{C6 ze|FHa?ZH?6-r)UTyxK?3Z^g)ri$7u!Io?F~#abv%m2V3{c+`sKDVvBK5@G&2fkxlC ze2;LPCEo{3_NJwt6F~PnBOp;jxHm|>!+kMuKhLEGzKsFUUyIvWSSw3FTXRr@7);UR zmOy0eEDU6U{6zD^8_z33{I}+hz87tRjgvw;r*-3)&Q(NSCV6p*>B%YtiRoV5OoX%? zJ5204GV#;{=wiSv9t^%*5MT$UQ7o}rM|z0My)S;NgPRrxzsR~U$diJ6b&s$Z$VO=l zN~Zo4lmQSVyHvSipl?_Px0jrsX#>mhgh4DLDrgKUMD^H&|F?}O?4>htP1ElJ<9JmY>^x!?@yTs}E7 zn0vP$oCKoGQU4C+4+FTFk$)Q;2o0xOvsOGa&Gc>({_YyYUvP3RgDH7^t>g5&Pv$I7 z6+|On6yj*0&SDe>BOmNUxT2K&8ENc4%AJ7tyXCmCHY3POKZSM@Q6YLWmI)n113nx@~9h+T0Ph_ZaWuv29iLRDTS8Jd*Q z2Y>ibb-;&5-N+CX+92{J1W^Pldf?r^V+U!+EnOZPH`-}ta6pj!l`HFW+nO)l&&`NboRZ@^TrTd%@@u>WY8B%jCtmsiAwk27+)p&v$p^+ukMR z!Z=p2JSiE0p1c{0>|rAfRIPYcu_CNaj3nLG&yr{UUZUbpAgLzsrVi}^HRD4*Pxd#U z^i3a8w=C5mp9&<3W-s|Nncnm7%()_nn-T}(&ybUkTN=TlcAgE*t&EjFb-!nxk*B%t z8h}7jc2O(u8xNPYcFsNW5ZP%~-OR(VA7*d;Q!-UEO55)MbW$*2Z3|X{d$~#v@hJ8l z%g^g&hPX0%^th=4oPqUF^urN_edw@f^nClqj_v6Cs(TpBK>yiau&V$z6%Gi-)@o=Z0# zwba2g^#qV-b0Vk0=JzyG26!fgu0LcS#_1tu9EP|%q7`O|<5?Un;w zLpcqe7P<8~oBM0JS+;w3g1ND*V<9?Ps;P_9Bl*2?pyAuN05c%JtptHI@(1)o^{s;4 z0WIp%t3p9`;A)LqArYKG=79VGY*(NElDqn|dj-@^4@-KBfd`){^=km-{|t!y5BXL< z<8l2!EvaG4^82Mmc@HlBG-@-x4b{ir2D3&ZJ_WKX%l%p+gf08+r=uzF<=v#5$!NQbgrx@8D`dP1B$@4TQzzT>-L++c7Gr(L#D3#sGBwKsQe zQ_tQdx~*!bx3KUxONNXqrc%W~$b=8ZDSGd~xCJ5zWNidZO(#K+*3Lny@J@iDP|m zCw5CsUN8~0>Z@Qt>K^TZ8C3rjaUagkl^VyaQLF~M;mfhtMPU_Zj@t1)85JIjsO$6D z0>;XVA91Eby^8;Etm(*ah)aEK8~>HuDC@A$$cJI)ODLwZu?pKO(WiyYM#+2^k=y@P z?xK95K%`9LaUM#;_zHwMJxQQOAa3d5^>0kZ>90)2yAw-a*@}Q<9iUT^-y9WX=u#EH z^yE;+Zp&n^u>U?0&ZVfYSn!fnA)xu2I3vOQ$}Qon!%~JN!0Ue8aSQDERYXB-SUhaZ zgxE_1(5(;X+9YcvKTM-jE$*;C0FlPZ0+h)n&(Flw8PquUg!QqJPJH$~#|3)$$MpPL zHm5M$TIZye!twm4EBm-2NZP^0evz%+xN5zK7ujTw3V>J@c&Kj)c02`8z3&2u+-6`3 zgN!pRhjesr@abX18I~w`x~!{uCr~rvZ&d6GWT6500ob-HMYM-G(MAr{l5`peXMK82 zgrb}}AhxA0OLgz>8gTzjG;2=k_f^wqyN2E25+J~95j{qrvwud1jrFrR>t|>PJc5ah zWol5FX23$+Y=KY4FP`d0y)9oIojGyTipaQ9>FJ@1{(A6e5gJW3-{Uy}rCWaLix)ln70 zZ^jta`Le!fy;}xw*!UTB2uGv6U+nUap{pBQm{_~b%bt&4b2Mjq!EFKq7JLfYufvLnqUyq5uepFzq zw(4A_rbV87R}x!%+f!}Otjm*Hbht8NYDu}7AY$_0gaiLG}Lozu2{e97Foc6oj9x+4`}0|vZqfn-&Z=&MuKLHbU& zM|e|BX^%Z*w<~!is^Qba9FN^dC_@;k=Wp)(4ORCk`(PI3tRd5b%I+Z8bkZ`=x6?WM$ElB1FJW0FJ 
zo+u-0Rc}~6uoNDgkYYW4iN`MWLVIaSv|vFBhFp8e!-J*So!lR5Y%}%wUef>9-gk#J zwe4vKK~Soq^b!O-2nYy>(nFCZB1o4ig3<&841`cZN05$yf(S^jQ6cnRLQ{J00TiSr z)BqvQ_S|#Mz4yB_^Uaw%Gv9f>Ie+kkCs`|dWtF{u<^8>H$KBye6?bKiUk%)gd4~z- z!?vqn(%HYgEpBO<|3WHO4z{#V;#L7-TfqY*GA ztVatMQvYf4ndkYFw3Zq(fg(3CIx<`{p(qCb*^_q^E+69ZX{=JFA{u}-`vE|u+gbD8 zt{9PsBV?E|xNN%pmHQk>nt0nxw;==07a^->g%J`J*&|C^^tyFB_R3WavRz(RfA7)I zZpM%u%keQuaLrCwf zT#BBYl>`<89dHp2y2c-{JeiDkp_%9&m~8igE28m>XBtz_A#OiT#bvdQc~UXV)X&6P z$A}I~)0NU9ZV%Pqm;z*3E|uxox=24AFjcH?25yY`YkN|qrtN-`SibP_E}3aq4uYp>~^>G&19CB7Qwrn}Tm}M$ymu&S=}yW}coddB&Vssq)~O zf>aNk$#Mg=gYGO$$x*|&a(t6g?QLh%HSlYPIfBLRIQ(T_~d{OSSyCoA;ISVHDq`0MJGMH$VNo_$JsKCZKwD`vEU$N{_Cp@jXM z)>RpWqRLOQ&wX?{6cd}V4>~80iorRyB)Q`~oZLpqLXY}`c+fXqTU(chc9_{p?S2+g zKw({ow-kZsoG#+_F6+)#5MrB~?&v7t$jITSG>z?}e5ONbcBtWNMT-6%EfTl)vRaEA z;FhO6t9q5GTt5NJJ1Du=p&-}AfF#d+1ichJyE$u1Dq|fh+EGe^WW>Fgm?YQ-F1FoE z9`r!#ODCD9UX$D;d(G=>_rL|wf*X|mgs_==vN!#jPD5n^fmn+SR1xYe_Glzf${l-Zv)Mp@WmNJl7PLwac9oYvX|>q;ODp z1hZbS2wCt(eT?j}ht0VO`jdYL9jNnO*k~^8uJc^@4hnTDA1{&B$h}ojdCHZ2R(&d= z%V;r^40^z@0*5?TnF}b{J4CzM*uJ8bGLZRz54uAROGQueg)kW1t9o4JTJ3}}A3VPn z?eD8zswxuA%5#^|Tj?V?!@ zd;#Xf4|laIrTeQgig6xP>Pvb`)vqcK?^H*FNs|_Il(SaZF;IK`)cM^Uk0xugH$xLE0}T=SGopchA1#^PW%H{`w(OlD!B@lYlkoCwyLqiAZo@&Tfp*P*jf- zh#J2q#YM80iwB$g>NOckr!6MMOTK>W-gWsbS-KlJf_29w%tRNqh-S@uE|K5f($d_f z>y|T~mc`K$f|h@|NuifZu*05wnQVqSdD;{|z5JAgVvK@kTcU4eGoILgQ%tpc1+8kR z7poW+NCR)k%d}5vc)NeILZ`>Mr(C#+^0;8#w;h)5U;gcSOeADtP|5>kzd(x!SGq>A z-To6rIqZcc3fmhcT;R<w=?RHQj1zB z@IpCJp9rRa3FGApFlTTB_{H}7c&i^4j5=V!XrHj|9tHHT9=(yikpL|)yBbAP{#ebp z>8iqR7wKN1A2UuOIbt^}e_X|tom-D_Vwk)Ww9gmqP#FVRQXXww8(B-kivYxe*VFe7 z`G|q#P*QJG;1p%x<2#gU0yj31WRGu#U(A^OLay-u9yC76iNXc79vQ4nPN}*2;nGTY zCC@KUgfC;64@$)sFVasF6bCovB~wf#<7i!XSZql5kmmie61gyU%?Nb&q$o-uUN`Gm z$;_*R&s<_YLO3>MoI%Hg(4?v3CmY45Ad$85K*b~IerZje<-%ClgMMcq1x9aw=IR&w zeT}8440ZA^Pbrj}`}zT1_7(W^kzF<#H4t0svzGzmv1h?)*H0h)1(sN3@JobA`HvP@ z`mJ>G-w0$Wky4*KvwOW<_tcw7^B<5yk{^&mfLm6I%AI=i?I&^5XGSzn%FdcD<~E9S zuH?NyYGV?l?w3=IA1eId!vn~Xk5N>dJ8cxDfC$>ey&pWy5kA8|{ouLVMubnW4?3A- z>-`{$5MEpWdbkDwOhP_t&eNI~GWWpOigxTr-M@p1E0uf|nok6RXE`eU9wsizhka!L z8}?oUu)z7bbq55*C%-XLqlFkjz~cxYs^Fo5Qp7JUwgb_0YlZ=xouw8obmn=YeO}`>%#=r ztq?5?7?&faXN`o@+SshWAH4H`+9(d(OWbEK`VixU(Wbj^cDcPGEKgaCJv;nKz{{hc zXqqSYRBWXa^0#ix=G~L>fk*6n=A?nS6`+Ie(sB90pm((e^4W=fdIJV-oOW{h6 z_3$e(3;XTP$d8~7dkPtC&iUw$_$wn^__Rey^lL9G#`MQ+UOBF;H;>ysJd_I&^!GUz zr~#F8#+UMphi@eD0)V>F!ijG<*w!Lr0r~sQ{R*cm#-8V2Mzsjw`rQ*^;Cq&5VGpNT z;1^AxwO%o4JGWJ-qv7dRO&~bsJmLPl%Q^d5A(x9VjUY`P7T(}%)Wv=WG_CxrYGx@y z85~O%RM@sF=Vc+ev5>|m(Fn1#$jgHk%?jb~4rUron3#YT#g5*Llsu^p+hVV!EJBTf5ipba1rqFi_wp%GqOp0w0o_ z0}dS%@C4NI3UWX&LZQd9CXXuK(9v8oxI>4(aC1d)Fr>bz-|w@aSEc2n?TNC3Ip?!o zd)G;J4uWK98?N<3JpwHSbx^QRr(-$~X%b!T%)3w4lQ#LurwNCJ5bowq>G~hu)4YNv zJOiRJj{UnXNK?zR1d(Ex!<4?3Mc?{o8MHO0gZwXFjLt^neh^55l{m3)Oxv5Ca3to@vS=iu+^|?goELlSP=xF-H>`z0l z&Lp3kJ;YT%fP^$X)Yz3gI(kjyhQSxx8qU4#?Y$(lpdQ~868I&cc`25n+az`5%tHIH zpD2DLujlNXZQc}b(>}%%$jkWF#I)q%fkX;Dm=o{PR$>@?-GCXb0|T z(|j&4Uty`6EyEzU8p8Nw2 z`%F$FY+193a7ji*S)r-WMq8O!VYh}p=~OizU%JCuon9u)QVpA%2cP6jWQmyAzg8Z} zv1IksbzP@wnGV@&!@88YJ;CyXi zghP+2_3zf^kYyWDM{QKQL3S@#6x^@bf8huic*~k~6y0gARZvy-P|o?GhdkK&b(64= zXGQtO7rl*-JT58lV7X#iJ~Z&nw%Y^8DV{m=O6ZBc*#n%B?kZ8}33WOd95uVjB;Nb7~{wB9DjZ9n%>_7HAM{j%bDn`|2^1nBx?hj)r9B?rM6+k|s#bTg$Z5zXY^c-TT)f^g)n0RT$wuvr z4}3w%agr-VCKa|E(%Q})rpm#5KGqTKC#sapDb}A6OCNtLe|%T$!D##h;THCGMS4~{ z?S~wGh^@WC<0#)V(|Lyx(vbuBA@7Tvw4o!fyegSAQQjCu;J{qMkDO3wRw05k@V6h@%oi*94xeU0xhVW|AJ6y1pN0eU6;vQLjf~!J*=&w#2&K3swQ{5Cdiq zHR#Iuc>rd|lL0H8B+A{nh^R;S7no6` z6|(WU56d&5O|zu<@%I+m=GBMlavqi7P9<4ooVi-8kK7Z>yjRxXzE4vMy;-Vwd&~@l 
zVUXTO%{X_G%T%A^I=OH$N{&JJ3CFTF6S?(n^aw`u24un~Zm-qI8bbr=pebp{TX-?v z-WWfSlOlO&9P1v;1_SNHdzFT@%d>pMSwtK`zv?Xe=;j}0x%S6cD#=}0;Ykj|fP*%< zqU}_MeoV4yG%(9*415IsxwAmjS*Gv!I|#_Nmb&$6<5TDJiS2nE2B=?A>lvDnktm`we+rw0fNC>a;PvwC&hiXE0CW zqKy-TB{vVAuJRDmaAxMG_QSEv7i`jf2NeXYj$nwa^)GbrIH)@*d3a70Wos5F*v8it zNJZG;^A@h$O%qCMNTv?5sZ{j1F$>@`v`@An0=B_Ku){1HK8|B@vdsQmR^fDcLGmY9 zZ@-(^1h5rT7rMm>(!*z1CUF;bcP=7Kk});~e649$`N{H9isk5D&7C-WlkgRzu~n}b zat_#bwif-sp$b30Ne2n>VtkziY)R^!N&6dd4marKzsyTt81lGD{^Pbcnyw>9wIKA# zumLnOX-MiLZ*p9A4s_y3H{VEjK8P4wNrO9dGr%I`Y|)YQXF9pZKgQ3q2UFGw934QS9mib*0<_Ku z{F1CKF);nPz=ZAr-!BLQvA=NN@bCUR+0fr7JAUPN%9Q_S+ao0sf$Fj!N~on$gw@Dd z&T>C%d!8Q@xFrsP@fD;rKjaaervOFjFtT^6M&YZKvcqNl@`rsa;2G zTmlfg;{bn)1xM@6&l**YK#ZZ8aJTbuqG5u@o0leEcFkDG{5_yQNTT3c&T2L6#L$XI zs6fQt`>f-{ulZUT_q)0&08GHg32@1yJ@zp=0_${7ACcxeNR0R)sED9$I&^e$)@t<0 zV*nK2OqUgj*ehI>RDP{F3%FlQn54P>}16Z*cViU|GoEinr4=Z8R%z_iyIc z#R-=Uvjl$-Mu`<9jRn?>c(DOc67R6LKOi?h{uqQRkM;--0zV4{Tz|y`AjJay)kNce zgJ#0NrnSNF7up|L$STiw&@+9eza?hkhm!36QhrGb_+LK|L;rxL`So}Gv#+qe_Qpfb zIyvGhVr6pQ;Y#-Zjcte&?Z<=YdVn1}u4p{c*;^mJrw-)7b4mVZd5r!e7>@tyHm9T> zL|BDCR0D4a+h0%_T8wfwoIcAGx5I8LpD6UpMm5%N9VocW-+ZGs+n%*7qTE!)fV07QBe*tQzL zlsDpW@8-?l75KhgCc=R-0d!2+H1IgLB)|rq0uhGOH;Q~ix4L#@Hz9Kp;E7GdGLk53 zZ$?#aAo5$geed+cH2@mV>UnB|L38~8vNHw%-LL=ljV}T`B65CfB;N-_rsNQ~MU|%7 z-!&TH0sV$w`>Opi61yrUyF$xfe@NDE*ZwyKD13TSAozfA6SzP>iM>tUDUU3dqg)1a`6#KKYLix`f;@{I5<*gjv@E2DN`vR5YaPJi# z_E;SCmjPtn|7WTX{#RTFc@v+tSUX2oQW2BUNVw~afZf-TiV$5_WRaAEgS2x%a21BH zV2V1^Qdpso6{=_p#pV&La-6MITe-X$@HVIf{&S7U&o2Ex>-zq){dsE8k(P)q~|W9KKxlqFtLhhDVa$M$s%`?$bdAZ91m0wMzDq z^?px({F*p&i+A-B1s(vBi_d*C-vA2t3H)-}#IP5Q1acRmjA_GpysbAT{hUfc-@=P1 z#q%t8=%}tpG56jH%Nz8Uqy6`W_~&Q8Ypc+)gHLTfd}|L-AztTnWVGUVZMHKzJB^c*XmlLS;*s`+MAV(Zk?MOiHt2EVYoB&Sg*|l%wI)k!BqJ9|dIv+7 zk31d$kp1w&0|EJ$liH0?@pPW zC$X9mnuyV4DPLPll`=WJ_jdMiRK&V_pnkL@I1vOlV3=><#KyFVoL{+7Q(qY+Y#sVV zQ$RD{%$b?jJu-SkJ_}%HZD1`=xEIvs?r@ee`I%;3qCb1WN5!CBmJB-gcXeABM4N_> zHQBy&glbA!A%zfg%cs}qbq}sZxoT$QtzSkj7>Um;UA5rA+5p3G-}laxwc{N}Hp^n0 zB0%darKXr(q%RUir!|b?CmeUx&53D6RJWC8g@B%sqM+A*)_FA0QZ^VWe>JG3N*wLI zDPvIj*tiv}U>rG8vki%9+!nUC+}j7_5EM8+kYQ&O+4r}z-*;vXs*j6(d>eeenc?yS z7?^lVj+EGpMXa2v{87yKSB^Nr)KqU7dtu?ch%-DaUXPv@W(_Aa?G;v@z^})yz9=DWdS?VvcI|T-wxu>|Npumf0(&I3gCh41GfiIyk&J7 zAV_zyD6%lXtr2f+K!`0=0Dax8zqCt8)W&NxCH2+Sy68(!*qoW8YeV4%Gs^P}!#6gBQ%lukSH(VP zySx|Oov9+e^B3@;94h6*8DACop!XxySK-0Ga zG6o)sM!>bQ3?!I3E{f3Q<~YCA4~TScxqHW1#%e?UOf z#lc$NLEiGieG^1X`)I9G!uxD7>;AyjqcKw)lBh8~x3|NzbDd$ob4Rnm~6D<<@HHH_Ncp&;6bO$YcG-4TSxvja2ICcTZ zGP6q?j+WQqzMltJwjbJyuK8>4X3$;79kkzn=A^U2$x&%quXl+zw!oE*^Le5Cj$cro z>>{fW-=c6Pb4`LCZ$P;&d?~T})NVrIeHo zUBB-yyVaCTlrx9>inNbrOqG;rGW17zJ?%cVrJbC0AyB+y^aBydJ9k#+AjT#D#32p* zix;?K?vUv^^5__HM3O;0!_#gdGPcWc^A=VsBU+G6(yC_oG9@UMtGjfQTt4_<9-3F= zkj@}W$ukTUZ1%JUgy?SY=0c`i7CBSFuQCcVQC)cWk!$oY7 zymV#sa+ArfTmB=)3+g0$23G;b{8QRz@JVy+4eS*oB||+TiAh(Ndt8TY_c+@OfpInQ z8{^##g45%iTS+!`NC_eEFQ6Zkb@W6YGrY;dl-fYXeUxb;U3K!)D2*2XtRi{<)?z8~ zjbXu(ao61R4lX)vk&WtcHNc53j?p|bd!uujLWPI@#5bR0z;2F3B#TMRq3FryzNBq* zwjX6SmK|d+H+s1jQpm*(A?O7S0 z@s55Z1i#^V1bKcv{haCR-83bts*c(1sl+4XJ$wWxXVFP@OC9?d0KoNpF;^skElsl7 zcdjdvzxShu#TeiqA;L)L3uO89<()R;bM)GgLg3R#Maq0F#J*+GVU2hlcSIQ<>rEiR+ZBm&5;O1&{6xp#W& zm@s}Wzvt1sL}hry$;WTpKAeT5QYZR46jH7@Dzx9t5xz1kK)7D^e12L?cpYALv%7ko z0%9U8Brm$61EeUXLbkc4v6&ioAu|F1aLRwj_$@WQ>=A~JMlnY%qJ-vk1Z}Qy_{8NlO<_naz+rX`l{X&~ z1~&@K@_OJ=)0pVd1?Bel_zr@q0p{e~;C`(*fnN80w} zodO8%FSVCJ4fY3bGXQl`!HA>trDKb?#eJlzxNL;lZXMH%6H%5JMC*)cvr$dtWMccv zB6n79Dq9b(Ky8_q@G)qq-Dm^Xfb(uqzgB(s6Q_E`)iVcVq<9?)A#C6+M5wfG(J5>D z#IVdAGr+77EU&3Ft<@-bgLxE{CArJPuz>U7!MzRVjl+Al+tZX)xGrmwZ@CpRKdz@} 
zf&)dcTGFdvdWWL?KHkiC(6pt=DzsRWbqZ)ZJs2L3*Xw{s2Nk{#)Qs8n`Ovfz7ftTF zJ-)X%+7MVu+L2CG>|-UeA5z}Zqfbea?5wLJP0m^W5n>^j5C+uRGM2Bj=OCY-pSMgj zo?&d-pRT@0vf;&o_CoUD0i-7PSHPsUv<1X{BvA9(r1+p)W45CNl~RPAx#pA6HD*ts ze7Gd-bIj%<$g>D?p^haVKcw-od?*(cq0T2JTK!l@bx?9L*Z)Qx@C@SS3;=Wq@Q_^+ zU04tCmdiQB_>JZ7AQcH??ANp#ahMWuttuP)5-01BNwM2W2B1YQcGqur$JQ^8_(&Dn z_lsQ}^N4SXTneG?)W6^l>&l=L#ghW6Uvk#O27m%V5ouX^0TUL85;<*OxqtL>Kfb=L zd&Md36^LAF`dw`eh`)%}c{5_Q_Z4l%QmGfqZl2LA*bt_nwg~^$@|+#(>V&!a^I9s| zTTaZ!Yj2i0Uy_VFygh^M{m8a4c{#Nf8ppE@AYPi1DfW=`NzFc|ak=OYU zgc3d~k*dztu}2>JI%WYiFV>4BmSt(aWVTU2NQ8y3zgf~pM&Kj1jG(Ja=c1JLbFD~W+S}ZVU+?wm_me^B<++;Y zXT&=z_=S;A^ExB9dBAg4-&SPOGfQ27-F!&EEzHqVBa+c5G732jF{a?UDiar1Ut3y#X_SVO4 z@TXw{*^WX^3L0AdKzhB|kWguhEp?S8#oduk!bLt(WC6PX2K{rS%`*)iyCt2@rceMr z%83Tdv-!T~LQs*@%DHFtE$qLxEn;; z*K66I$l<`rI=Vc%=WDmhG^FsrYUdj0&|Zg From fb7654cf8e7cde3e8143867858d96614dfc503f9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 19 Sep 2023 19:18:01 +0200 Subject: [PATCH 75/88] revert changes on dockerfile --- docker/transformers-all-latest-gpu/Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 76440ffedc9926..a6c672e1a9df64 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -52,11 +52,6 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes # Add auto-gptq for gtpq quantization testing RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ -# Add flash attention -# commands copied from https://github.com/Dao-AILab/flash-attention#installation-and-features -RUN python3 -m pip uninstall -y ninja && python3 -m pip install ninja -RUN python3 -m pip install flash-attn --no-build-isolation - # Add einops for additional model testing RUN python3 -m pip install --no-cache-dir einops From a737bdea84b79cbdbabcbafb01e211b119e63045 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:33:19 +0200 Subject: [PATCH 76/88] Apply suggestions from code review Co-authored-by: Lysandre Debut --- docs/source/en/perf_infer_gpu_one.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index a11e8e6b440a96..4d4e9f8b4593c2 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -25,28 +25,26 @@ Note that this feature is experimental and might considerably change in future v
-Flash Attention 2 can considerably speedup the training and inference speed of transformer based models. Flash Attention 2 has been introduced in the [official Flash Attention repository](https://github.com/Dao-AILab/flash-attention) from Tri Dao et al. The scientific paper of Flash attention can be found [here](https://arxiv.org/abs/2205.14135). +Flash Attention 2 can considerably speed up transformer-based models' training and inference speed. Flash Attention 2 has been introduced in the [official Flash Attention repository](https://github.com/Dao-AILab/flash-attention) by Tri Dao et al. The scientific paper on Flash Attention can be found [here](https://arxiv.org/abs/2205.14135). Make sure to follow the installation guide on the repository mentioned above to properly install Flash Attention 2. Once that package is installed, you can benefit from this feature. -We natively support Flash Attention 2 for some models, currently supported architectures are: +We natively support Flash Attention 2 for some models. The currently supported architectures are: - Llama - Falcon -You can request to add Flash Attention 2 support for more models by opening an issue on GitHub! - -And they can be used for inference and training, including training with padding tokens - which is currently not supported for `BetterTransformer` API below. +You can request to add Flash Attention 2 support for more models by opening an issue on GitHub! The supported models can be used for inference and training, including training with padding tokens - *which is currently not supported for `BetterTransformer` API below.* -Flash Attention 2 can only be used for models using fp16 or bf16 dtype, and can be run only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature. +Flash Attention 2 can only be used for models in the fp16 or bf16 dtypes and run only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature. ### Quick usage -To enable Flash Attention 2 in your model, simply add `use_flash_attn_2` in `from_pretrained` arguments +To enable Flash Attention 2 in your model, add `use_flash_attn_2` in the `from_pretrained` arguments: ```python import torch @@ -66,8 +64,11 @@ And use it for generation or fine-tuning. ### Expected speedups -You can benefit from considerable speedup for fine-tuning and inference, especially for long sequence length. -However, note that due to the fact that Flash Attention does not support computing attention scores with padd tokens under the hood, we need to manually pad / unpad the attention scores for batched inference when the sequence contains padd tokens. This leads to an important slowdown for batched `generate` with padd tokens. To overcome this, one should use Flash Attention without padd tokens in the sequence for training (e.g. by packing a dataset, i.e. concatenating sequences until reaching the maximum sequence length) +You can benefit from considerable speedups for fine-tuning and inference, especially for long sequences. However, since Flash Attention does not support computing attention scores with padding tokens under the hood, we must manually pad / unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens. 
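As an illustration of the pad / unpad bookkeeping mentioned in the paragraph above, here is a minimal, self-contained PyTorch sketch. The helper names are illustrative only and this is not the code path used inside Transformers or the flash-attn kernels; it merely shows the gather/scatter of real tokens that has to happen around the attention call when the batch contains padding.

```python
import torch


def unpad_hidden_states(hidden_states: torch.Tensor, attention_mask: torch.Tensor):
    """Drop padding positions so only real tokens are passed to the kernel (illustrative)."""
    # hidden_states: (batch, seq_len, dim); attention_mask: (batch, seq_len) with 1 = real token
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    # cu_seqlens marks where each sequence starts and ends in the flattened token stream
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    flat = hidden_states.reshape(-1, hidden_states.shape[-1])[indices]
    return flat, indices, cu_seqlens


def pad_hidden_states(flat: torch.Tensor, indices: torch.Tensor, batch_size: int, seq_len: int):
    """Scatter the flattened output back into the padded (batch, seq_len, dim) layout."""
    out = flat.new_zeros(batch_size * seq_len, flat.shape[-1])
    out[indices] = flat
    return out.reshape(batch_size, seq_len, -1)


hidden_states = torch.randn(2, 5, 8)
attention_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])

flat, indices, cu_seqlens = unpad_hidden_states(hidden_states, attention_mask)
restored = pad_hidden_states(flat, indices, batch_size=2, seq_len=5)  # padding positions come back as zeros
```

Roughly speaking, the flattened tokens together with the cumulative sequence lengths are what a variable-length attention kernel consumes, and the extra gather/scatter shown here is the overhead responsible for the slowdown described above.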
+ +To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length). + +Below is the expected speedup you can get for a simple forward pass on tiiuae/falcon-7b with a sequence length of 4096 and various batch sizes without padding tokens: Below is the expected speedup you can get for a simple forward pass on `tiiuae/falcon-7b` with a sequence length of 4096 and various batch sizes, without padd tokens: From 80951ae19e4579c71d4d40c8864fa64ace19b83b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 20 Sep 2023 19:44:24 +0200 Subject: [PATCH 77/88] address some comments --- src/transformers/modeling_utils.py | 13 +++++++------ tests/test_modeling_common.py | 18 +++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 46ab861b1f2533..12afb0e3efec46 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1264,7 +1264,7 @@ def _check_and_enable_flash_attn_2( if not cls._supports_flash_attn_2: raise ValueError( "The current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to " - "request support for this architecture." + "request support for this architecture: https://github.com/huggingface/transformers/issues/new" ) if not is_flash_attn_available(): @@ -1273,10 +1273,11 @@ def _check_and_enable_flash_attn_2( " installing it." ) else: - is_flash_greater_than_2 = version.parse(importlib.metadata.version("flash_attn")) > version.parse("2.0.0") + flash_attention_version = version.parse(importlib.metadata.version("flash_attn")) + is_flash_greater_than_2 = flash_attention_version > version.parse("2.0.0") if not is_flash_greater_than_2: raise ValueError( - "You need flash_attn package version to be greater than 2.0. Make sure to have that version installed." + f"You need flash_attn package version to be greater than 2.0. Make sure to have that version installed - detected version {flash_attention_version}" ) _is_bettertransformer = getattr(cls, "use_bettertransformer", False) @@ -1298,7 +1299,7 @@ def _check_and_enable_flash_attn_2( if device_map is None: if torch.cuda.is_available(): - warnings.warn( + logger.warning( "You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU" " after initializing it on CPU with `model.to('cuda')`." 
) @@ -2462,7 +2463,7 @@ def from_pretrained( variant = kwargs.pop("variant", None) _adapter_model_path = kwargs.pop("_adapter_model_path", None) adapter_name = kwargs.pop("adapter_name", "default") - use_flash_attn_2 = kwargs.pop("use_flash_attn_2", False) + use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False) if is_fsdp_enabled(): low_cpu_mem_usage = True @@ -3074,7 +3075,7 @@ def from_pretrained( elif load_in_8bit or load_in_4bit or low_cpu_mem_usage: init_contexts.append(init_empty_weights()) - if use_flash_attn_2: + if use_flash_attention_2: config = cls._check_and_enable_flash_attn_2(config, torch_dtype=torch_dtype, device_map=device_map) with ContextManagers(init_contexts): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index a62bb2dd2063e2..928d9390d70c04 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2735,7 +2735,7 @@ def test_flash_attn_2_conversion(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True).to( + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True).to( torch_device ) @@ -2761,10 +2761,10 @@ def test_flash_attn_2_inference(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True) + model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True) model_fa.to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False) model.to(torch_device) dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) @@ -2799,10 +2799,10 @@ def test_flash_attn_2_inference_padding_right(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=True) + model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True) model_fa.to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attn_2=False) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False) model.to(torch_device) dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) @@ -2838,7 +2838,7 @@ def test_flash_attn_2_generate_left_padding(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False, low_cpu_mem_usage=True + tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=False, low_cpu_mem_usage=True ).to(torch_device) dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) @@ -2849,7 +2849,7 @@ def test_flash_attn_2_generate_left_padding(self): ) model = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True, low_cpu_mem_usage=True + tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True ).to(torch_device) out_fa = model.generate( @@ -2875,7 +2875,7 @@ def test_flash_attn_2_generate_padding_right(self): with 
tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=False, low_cpu_mem_usage=True + tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=False, low_cpu_mem_usage=True ).to(torch_device) dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) @@ -2886,7 +2886,7 @@ def test_flash_attn_2_generate_padding_right(self): ) model = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, use_flash_attn_2=True, low_cpu_mem_usage=True + tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True ).to(torch_device) out_fa = model.generate( From 6f7ff42b46cae2e3b91a4b4d93edb8a7f7591b98 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 20 Sep 2023 19:44:29 +0200 Subject: [PATCH 78/88] docs --- docs/source/en/perf_infer_gpu_one.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 4d4e9f8b4593c2..e01c96d45ea3b5 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -44,7 +44,7 @@ Flash Attention 2 can only be used for models in the fp16 or bf16 dtypes and run ### Quick usage -To enable Flash Attention 2 in your model, add `use_flash_attn_2` in the `from_pretrained` arguments: +To enable Flash Attention 2 in your model, add `use_flash_attention_2` in the `from_pretrained` arguments: ```python import torch @@ -56,7 +56,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.bfloat16, - use_flash_attn_2=True, + use_flash_attention_2=True, ) ``` @@ -66,23 +66,23 @@ And use it for generation or fine-tuning. You can benefit from considerable speedups for fine-tuning and inference, especially for long sequences. However, since Flash Attention does not support computing attention scores with padding tokens under the hood, we must manually pad / unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens. -To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length). +To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length, for example, we perform something in the following lines specifically [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516). Below is the expected speedup you can get for a simple forward pass on tiiuae/falcon-7b with a sequence length of 4096 and various batch sizes without padding tokens: -Below is the expected speedup you can get for a simple forward pass on `tiiuae/falcon-7b` with a sequence length of 4096 and various batch sizes, without padd tokens: +Below is the expected speedup you can get for a simple forward pass on `tiiuae/falcon-7b` with a sequence length of 4096 and various batch sizes, without padding tokens:
-Below is the expected speedup you can get for a simple forward pass on `meta-llama/Llama-7b-hf` with a sequence length of 4096 and various batch sizes, without padd tokens: +Below is the expected speedup you can get for a simple forward pass on `meta-llama/Llama-7b-hf` with a sequence length of 4096 and various batch sizes, without padding tokens:
-For sequences with padd tokens (training with padd tokens or generating with padd tokens), we need to unpad / pad the input sequences to compute correctly the attention scores. For relatively small sequence length, on pure forward pass, this creates an overhead leading to a small speedup (below we used a padding rate of 0.3). +For sequences with padding tokens (training with padding tokens or generating with padding tokens), we need to unpad / pad the input sequences to compute correctly the attention scores. For relatively small sequence length, on pure forward pass, this creates an overhead leading to a small speedup (below we used a padding rate of 0.3).
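The pad / unpad overhead mentioned above comes from the bookkeeping that variable-length attention kernels need: the flat indices of the non-padding tokens, the cumulative sequence lengths, and the longest sequence in the batch. Below is a minimal pure-PyTorch sketch of that computation, shown only as an illustration of what a `_get_unpad_data`-style helper returns rather than the exact code from this patch series:

```python
import torch
import torch.nn.functional as F


def get_unpad_data(padding_mask: torch.Tensor):
    # padding_mask: (batch_size, seq_len) with 1 for real tokens and 0 for padding.
    seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
    # Flat positions of the real tokens once the batch is flattened to (batch_size * seq_len,).
    indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    # Cumulative sequence lengths with a leading 0, the layout expected by varlen attention kernels.
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch


mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_seqlen = get_unpad_data(mask)
print(indices)     # tensor([0, 1, 2, 4, 5])
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32)
print(max_seqlen)  # 3
```

Gathering the hidden states at `indices` before the kernel call and scattering the outputs back afterwards is exactly the extra work that makes batched inference with padding tokens slower.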
@@ -115,7 +115,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, load_in_8bit=True, - use_flash_attn_2=True, + use_flash_attention_2=True, ) ``` @@ -133,7 +133,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, load_in_4bit=True, - use_flash_attn_2=True, + use_flash_attention_2=True, ) ``` @@ -152,7 +152,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, load_in_4bit=True, - use_flash_attn_2=True, + use_flash_attention_2=True, ) lora_config = LoraConfig( From 257a6335178ee5ecc2b7ac536a9d90bb89637c6a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 20 Sep 2023 19:51:22 +0200 Subject: [PATCH 79/88] use inheritance --- .../models/falcon/modeling_falcon.py | 130 +----------------- .../models/llama/modeling_llama.py | 66 +-------- 2 files changed, 14 insertions(+), 182 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index bc083e05d5dde7..cbdd0d1ebb844b 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -543,129 +543,13 @@ def forward( return output_tensor, present -class FalconFlashAttention2(nn.Module): - # Copied from transformers.models.falcon.modeling_falcon.FalconAttention.__init__ - def __init__(self, config: FalconConfig): - super().__init__() - - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.split_size = self.hidden_size - self.hidden_dropout = config.hidden_dropout - - if self.head_dim * self.num_heads != self.hidden_size: - raise ValueError( - f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" - f" {self.num_heads})." 
- ) - - self.maybe_rotary = self._init_rope() if config.rotary else lambda q, k, t, p: (q, k) - - # Layer-wise attention scaling - self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) - self.beta = self.inv_norm_factor - if config.new_decoder_architecture: - qkv_out_dim = (config.num_kv_heads * 2 + config.num_attention_heads) * self.head_dim - elif config.multi_query: - qkv_out_dim = self.hidden_size + 2 * self.head_dim - else: - qkv_out_dim = 3 * self.hidden_size - self.query_key_value = FalconLinear(self.hidden_size, qkv_out_dim, bias=config.bias) - self.new_decoder_architecture = config.new_decoder_architecture - self.multi_query = config.multi_query - self.dense = FalconLinear(self.hidden_size, self.hidden_size, bias=config.bias) - self.attention_dropout = nn.Dropout(config.attention_dropout) - self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 - - # Copied from transformers.models.falcon.modeling_falcon.FalconAttention._init_rope - def _init_rope(self): - if self.config.rope_scaling is None: - rotary_emb = FalconRotaryEmbedding( - self.head_dim, - base=self.config.rope_theta, - max_position_embeddings=self.config.max_position_embeddings, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - rotary_emb = FalconLinearScalingRotaryEmbedding( - self.head_dim, - base=self.config.rope_theta, - max_position_embeddings=self.config.max_position_embeddings, - scaling_factor=scaling_factor, - ) - elif scaling_type == "dynamic": - rotary_emb = FalconDynamicNTKScalingRotaryEmbedding( - self.head_dim, - base=self.config.rope_theta, - max_position_embeddings=self.config.max_position_embeddings, - scaling_factor=scaling_factor, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - return rotary_emb - - # Copied from transformers.models.falcon.modeling_falcon.FalconAttention._split_heads - def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv` - - Args: - fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] - - Returns: - query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] - value: [batch_size, seq_length, num_heads, head_dim] - """ - if self.new_decoder_architecture: - batch, seq_len, _ = fused_qkv.shape - qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv_heads + 2, self.head_dim) - query = qkv[:, :, :, :-2] - key = qkv[:, :, :, [-2]] - value = qkv[:, :, :, [-1]] - key = torch.broadcast_to(key, query.shape) - value = torch.broadcast_to(value, query.shape) - - query, key, value = [x.flatten(2, 3) for x in (query, key, value)] - return query, key, value - elif not self.multi_query: - batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) - return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] - else: - batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) - return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] - - # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads - def _merge_heads(self, x: 
torch.Tensor) -> torch.Tensor: - """ - Merge heads together over the last dimension - - Args: - x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] - - Returns: - torch.tensor: [batch_size, seq_length, num_heads * head_dim] - """ - # What we want to achieve is: - # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim - batch_size_and_num_heads, seq_length, _ = x.shape - batch_size = batch_size_and_num_heads // self.num_heads - - # First view to decompose the batch size - # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim - x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) - - # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim - x = x.permute(0, 2, 1, 3) - - # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim - return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim) - +class FalconFlashAttention2(FalconAttention): + """ + Falcon flash attention module. This module inherits from `FalconAttention` as the weights + of the module stays untouched. The only required change would be on the forward pass where it needs + to correctly call the public API of flash attention and deal with padding tokens in case the input + contains any of them. + """ def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 4ac39c070dbbfc..9fc2958f8557d2 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -409,65 +409,13 @@ def forward( return attn_output, attn_weights, past_key_value -class LlamaFlashAttention2(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - # Copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self._init_rope() - - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._shape - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - +class LlamaFlashAttention2(LlamaAttention): + """ + Llama flash attention module. This module inherits from `LlamaAttention` as the weights + of the module stays untouched. The only required change would be on the forward pass where it needs + to correctly call the public API of flash attention and deal with padding tokens in case the input + contains any of them. + """ def forward( self, hidden_states: torch.Tensor, From 360da707c83ef0d3b6699c417905e479e97b18db Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:53:03 +0200 Subject: [PATCH 80/88] Update src/transformers/testing_utils.py Co-authored-by: Lysandre Debut --- src/transformers/testing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index c78135fae95e67..08102ead58c3f2 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -387,7 +387,7 @@ def require_flash_attn(test_case): """ Decorator marking a test that requires Flash Attention. - These tests are skipped when PyTorch isn't installed. + These tests are skipped when Flash Attention isn't installed. 
""" return unittest.skipUnless(is_flash_attn_available(), "test requires Flash Attention")(test_case) From 1d91bc4ac179658477520b12bf5c83427ec47091 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 20 Sep 2023 17:55:51 +0000 Subject: [PATCH 81/88] fixup --- .../models/falcon/modeling_falcon.py | 8 +++---- .../models/llama/modeling_llama.py | 8 +++---- tests/test_modeling_common.py | 22 +++++++++++++------ 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index cbdd0d1ebb844b..58a9be8b145d4f 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -545,11 +545,11 @@ def forward( class FalconFlashAttention2(FalconAttention): """ - Falcon flash attention module. This module inherits from `FalconAttention` as the weights - of the module stays untouched. The only required change would be on the forward pass where it needs - to correctly call the public API of flash attention and deal with padding tokens in case the input - contains any of them. + Falcon flash attention module. This module inherits from `FalconAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. """ + def forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 9fc2958f8557d2..1f0db2197fcec7 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -411,11 +411,11 @@ def forward( class LlamaFlashAttention2(LlamaAttention): """ - Llama flash attention module. This module inherits from `LlamaAttention` as the weights - of the module stays untouched. The only required change would be on the forward pass where it needs - to correctly call the public API of flash attention and deal with padding tokens in case the input - contains any of them. + Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
""" + def forward( self, hidden_states: torch.Tensor, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 928d9390d70c04..52007aeba19202 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2735,9 +2735,9 @@ def test_flash_attn_2_conversion(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True).to( - torch_device - ) + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True + ).to(torch_device) for _, module in model.named_modules(): if "FlashAttention" in module.__class__.__name__: @@ -2761,10 +2761,14 @@ def test_flash_attn_2_inference(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True + ) model_fa.to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False) + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False + ) model.to(torch_device) dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) @@ -2799,10 +2803,14 @@ def test_flash_attn_2_inference_padding_right(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True + ) model_fa.to(torch_device) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False) + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False + ) model.to(torch_device) dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device) From 7c5720f1b0ccf4bd6bd5d3e8ecb996284de25c4f Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:36:49 +0200 Subject: [PATCH 82/88] Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/perf_infer_gpu_one.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index e01c96d45ea3b5..85beac52ddb906 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -29,16 +29,16 @@ Flash Attention 2 can considerably speed up transformer-based models' training a Make sure to follow the installation guide on the repository mentioned above to properly install Flash Attention 2. Once that package is installed, you can benefit from this feature. -We natively support Flash Attention 2 for some models. The currently supported architectures are: +We natively support Flash Attention 2 for the following models: - Llama - Falcon -You can request to add Flash Attention 2 support for more models by opening an issue on GitHub! 
The supported models can be used for inference and training, including training with padding tokens - *which is currently not supported for `BetterTransformer` API below.*
+You can request to add Flash Attention 2 support for more models by opening an issue on GitHub, and even open a Pull Request to integrate the changes. The supported models can be used for inference and training, including training with padding tokens - *which is currently not supported for `BetterTransformer` API below.*

-Flash Attention 2 can only be used for models in the fp16 or bf16 dtypes and run only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature.
+Flash Attention 2 can only be used when the models' dtype is `fp16` or `bf16` and runs only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature.

@@ -66,17 +66,17 @@ And use it for generation or fine-tuning.

 You can benefit from considerable speedups for fine-tuning and inference, especially for long sequences. However, since Flash Attention does not support computing attention scores with padding tokens under the hood, we must manually pad / unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens.

-To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length, for example, we perform something in the following lines specifically [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516).
+To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length). An example is provided [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516).

-Below is the expected speedup you can get for a simple forward pass on tiiuae/falcon-7b with a sequence length of 4096 and various batch sizes without padding tokens:
+Below is the expected speedup you can get for a simple forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes without padding tokens:

-Below is the expected speedup you can get for a simple forward pass on `tiiuae/falcon-7b` with a sequence length of 4096 and various batch sizes, without padding tokens:
+Below is the expected speedup you can get for a simple forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes, without padding tokens:
-Below is the expected speedup you can get for a simple forward pass on `meta-llama/Llama-7b-hf` with a sequence length of 4096 and various batch sizes, without padding tokens: +Below is the expected speedup you can get for a simple forward pass on [`meta-llama/Llama-7b-hf`](https://hf.co/meta-llama/Llama-7b-hf) with a sequence length of 4096 and various batch sizes, without padding tokens:
From 28b82e20dfee64db92041ca21ffcd33c896b55b7 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:37:21 +0200 Subject: [PATCH 83/88] Update src/transformers/modeling_utils.py --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a3f30f1b78795d..1fc604d859f3a5 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1288,7 +1288,7 @@ def _check_and_enable_flash_attn_2( ) if torch_dtype is None: - warnings.warn( + logger.warning( "You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour" ) elif torch_dtype is not None and torch_dtype not in [torch.float16, torch.bfloat16]: From 84b57938cc8bcdc086f28bfe5b74487fe230df6d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 21 Sep 2023 14:13:01 +0200 Subject: [PATCH 84/88] final comments --- docs/source/en/perf_infer_gpu_one.md | 4 ++-- src/transformers/modeling_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 85beac52ddb906..86e137cf14d7a1 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -82,7 +82,7 @@ Below is the expected speedup you can get for a simple forward pass on [`meta-ll
-For sequences with padding tokens (training with padding tokens or generating with padding tokens), we need to unpad / pad the input sequences to compute correctly the attention scores. For relatively small sequence length, on pure forward pass, this creates an overhead leading to a small speedup (below we used a padding rate of 0.3).
+For sequences with padding tokens (training with padding tokens or generating with padding tokens), we need to unpad / pad the input sequences to correctly compute the attention scores. For relatively small sequence lengths, this creates an overhead on a pure forward pass, leading to only a small speedup (in the benchmark below, 30% of the input is filled with padding tokens).

<div style="text-align: center">
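The speedup figures referenced in these docs come from simple forward-pass benchmarks. A rough sketch of how such a comparison could be reproduced with the `use_flash_attention_2` flag introduced in this series; the model id, dtype, shapes, and timing loop are only examples, and the numbers will vary with hardware:

```python
import torch
from transformers import AutoModelForCausalLM


def time_forward(model, input_ids, n_iters=10):
    # CUDA events measure device-side time; synchronize so the measurement is not cut short.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    with torch.no_grad():
        model(input_ids)  # warmup
        torch.cuda.synchronize()
        start.record()
        for _ in range(n_iters):
            model(input_ids)
        end.record()
        torch.cuda.synchronize()
    return start.elapsed_time(end) / n_iters  # milliseconds per forward pass


input_ids = torch.randint(0, 32000, (4, 4096), device="cuda")  # batch of 4, no padding tokens

for use_fa2 in (False, True):
    model = AutoModelForCausalLM.from_pretrained(
        "tiiuae/falcon-7b", torch_dtype=torch.bfloat16, use_flash_attention_2=use_fa2
    ).to("cuda")
    print(f"use_flash_attention_2={use_fa2}: {time_forward(model, input_ids):.1f} ms / forward")
    del model
    torch.cuda.empty_cache()
```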
@@ -90,7 +90,7 @@ For sequences with padding tokens (training with padding tokens or generating wi
 But for large sequence length you can benefit from interesting speedup for pure inference (also training)

-Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues.
+Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues. It can lead to a memory reduction of up to 20x for large sequence lengths. Check out [the official flash attention repository](https://github.com/Dao-AILab/flash-attention) for more details.
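Condensed, the validation performed by `_check_and_enable_flash_attn_2` in this series amounts to three requirements: the `flash_attn` package must be installed at a version greater than 2.0, the model should be loaded in `fp16` or `bf16`, and a CUDA device must be available. A simplified stand-alone sketch of those checks, not the exact method from `modeling_utils.py`:

```python
import importlib.metadata
import importlib.util

import torch
from packaging import version


def check_flash_attn_2_requirements(torch_dtype=None):
    # The package itself must be importable.
    if importlib.util.find_spec("flash_attn") is None:
        raise ImportError(
            "Flash Attention 2 is not available, see https://github.com/Dao-AILab/flash-attention for installation."
        )
    # Only flash-attn releases newer than 2.0.0 expose the API used here.
    flash_version = version.parse(importlib.metadata.version("flash_attn"))
    if flash_version <= version.parse("2.0.0"):
        raise ValueError(f"flash_attn > 2.0.0 is required, detected version {flash_version}.")
    # Flash Attention kernels expect half-precision inputs.
    if torch_dtype not in (torch.float16, torch.bfloat16):
        print("Warning: Flash Attention 2 expects the model to be loaded in torch.float16 or torch.bfloat16.")
    # And a CUDA device must be present.
    if not torch.cuda.is_available():
        raise ValueError("Flash Attention 2 can only run on a CUDA GPU.")


check_flash_attn_2_requirements(torch.bfloat16)
```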
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 1fc604d859f3a5..f4e376e593a25e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1306,7 +1306,7 @@ def _check_and_enable_flash_attn_2( else: raise ValueError( "You are attempting to use Flash Attention 2.0 with a model initialized on CPU and with no GPU available. " - "This is not supported. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map " + "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map " "or initialising the model on CPU and then moving it to GPU." ) elif ( From 949172f1220a3c5edf54a6550d48dfcb24ef279c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 21 Sep 2023 14:32:27 +0200 Subject: [PATCH 85/88] clean up --- .../models/falcon/modeling_falcon.py | 60 ++++++++++++++----- .../models/llama/modeling_llama.py | 50 ++++++++++++---- tests/models/llama/test_modeling_llama.py | 2 +- 3 files changed, 82 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 58a9be8b145d4f..31dc5d760b8d7f 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -603,40 +603,68 @@ def forward( if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") - # contains at least one padding token + attn_dropout = self.attention_dropout if self.training else 0.0 + attn_output = self._flash_attention_forward(query_layer, key_layer, value_layer, padding_mask, query_length, dropout=attn_dropout) + + attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + attn_output = self.dense(attn_weights) + + if not output_attentions: + attn_weights = None + + return attn_output, past_key_value, attn_weights + + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward + def _flash_attention_forward(self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + padding_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position + of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence if padding_mask is not None: - query_layer, key_layer, value_layer, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_layer, key_layer, value_layer, padding_mask, query_length + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, padding_mask, query_length ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens attn_output_unpad = flash_attn_varlen_func( - query_layer, - key_layer, - value_layer, + query_states, + key_states, + value_states, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_in_batch_q, max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=0.0, - softmax_scale=None, + dropout_p=dropout, + softmax_scale=softmax_scale, causal=True, ) attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func(query_layer, key_layer, value_layer, 0.0, softmax_scale=None, causal=True) - - attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) - attn_output = self.dense(attn_weights) + attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True) - if not output_attentions: - attn_weights = None - - return attn_output, past_key_value, attn_weights + return attn_output # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 619f97057e3fae..8156c3c3dcf995 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -469,11 +469,41 @@ def forward( # It is recommended to use dropout with FA according to the docs # when training. dropout_rate = 0.0 # if not self.training else self.attn_dropout + attn_output = self._flash_attention_forward(query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate) - # contains at least one padding token + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward(self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + padding_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position + of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence if padding_mask is not None: + batch_size = query_states.shape[0] query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, padding_mask, q_len + query_states, key_states, value_states, padding_mask, query_length ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens @@ -487,22 +517,16 @@ def forward( cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_in_batch_q, max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=0.0, - softmax_scale=None, + dropout_p=dropout, + softmax_scale=softmax_scale, causal=True, ) - attn_output = pad_input(attn_output_unpad, indices_q, bsz, q_len) + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func(query_states, key_states, value_states, dropout_rate, causal=True) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) + attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True) - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value + return attn_output def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask) diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index b35b02dabd71ac..0223acbbd72a8a 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -403,7 +403,7 @@ def test_flash_attn_2_generate_padding_right(self): output_native = tokenizer.batch_decode(output_native) model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, use_flash_attn_2=True + "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, use_flash_attention_2=True ) output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) From 825c7e0f188958122a596acc4a9a57cf000280b2 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 21 Sep 2023 14:33:57 +0200 Subject: [PATCH 86/88] style --- .../models/falcon/modeling_falcon.py | 20 +++++++++++-------- .../models/llama/modeling_llama.py | 20 ++++++++++++------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 31dc5d760b8d7f..715ad657f40ed0 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -603,9 +603,10 @@ def forward( if alibi is not None: raise ValueError("`alibi` is not supported when `use_flash_attn` is True") - attn_dropout = self.attention_dropout if self.training else 0.0 - attn_output = self._flash_attention_forward(query_layer, key_layer, value_layer, padding_mask, query_length, dropout=attn_dropout) + attn_output = self._flash_attention_forward( + query_layer, key_layer, value_layer, padding_mask, query_length, dropout=attn_dropout + ) attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) attn_output = self.dense(attn_weights) @@ -614,10 +615,11 @@ def forward( attn_weights = None return attn_output, past_key_value, attn_weights - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward - def 
_flash_attention_forward(self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None): + def _flash_attention_forward( + self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None + ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. @@ -630,12 +632,12 @@ def _flash_attention_forward(self, query_states, key_states, value_states, paddi value_states (`torch.Tensor`): Input value states to be passed to Flash Attention API padding_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position - of padding tokens and 1 for the position of non-padding tokens. + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. dropout (`int`, *optional*): Attention dropout softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) """ # Contains at least one padding token in the sequence if padding_mask is not None: @@ -662,7 +664,9 @@ def _flash_attention_forward(self, query_states, key_states, value_states, paddi attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True) + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True + ) return attn_output diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 8156c3c3dcf995..265786ab25df7c 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -469,7 +469,9 @@ def forward( # It is recommended to use dropout with FA according to the docs # when training. dropout_rate = 0.0 # if not self.training else self.attn_dropout - attn_output = self._flash_attention_forward(query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate) + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate + ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.o_proj(attn_output) @@ -478,8 +480,10 @@ def forward( attn_weights = None return attn_output, attn_weights, past_key_value - - def _flash_attention_forward(self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None): + + def _flash_attention_forward( + self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None + ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. 
@@ -492,12 +496,12 @@ def _flash_attention_forward(self, query_states, key_states, value_states, paddi value_states (`torch.Tensor`): Input value states to be passed to Flash Attention API padding_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position - of padding tokens and 1 for the position of non-padding tokens. + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. dropout (`int`, *optional*): Attention dropout softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) """ # Contains at least one padding token in the sequence if padding_mask is not None: @@ -524,7 +528,9 @@ def _flash_attention_forward(self, query_states, key_states, value_states, paddi attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True) + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True + ) return attn_output From 1af232cf45d89f2185c93d83fea8940aa96164a8 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 22 Sep 2023 16:37:55 +0200 Subject: [PATCH 87/88] add cast + warning for PEFT models --- .../models/falcon/modeling_falcon.py | 16 ++++++++++++++++ .../models/llama/modeling_llama.py | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 715ad657f40ed0..bcecb47084b04f 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -604,6 +604,22 @@ def forward( raise ValueError("`alibi` is not supported when `use_flash_attn` is True") attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_layer.dtype + if input_dtype == torch.float32: + logger.warning_once( + "The input hidden states seems to be silently casted in float32, this might be related to" + " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + " float16." + ) + + query_layer = query_layer.to(torch.float16) + key_layer = key_layer.to(torch.float16) + value_layer = value_layer.to(torch.float16) + attn_output = self._flash_attention_forward( query_layer, key_layer, value_layer, padding_mask, query_length, dropout=attn_dropout ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 265786ab25df7c..02a1f37b02f6af 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -469,6 +469,25 @@ def forward( # It is recommended to use dropout with FA according to the docs # when training. 
dropout_rate = 0.0 # if not self.training else self.attn_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + input_dtype = query_states.dtype + if input_dtype == torch.float32: + logger.warning_once( + "The input hidden states seems to be silently casted in float32, this might be related to" + " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + " float16." + ) + + query_states = query_states.to(torch.float16) + key_states = key_states.to(torch.float16) + value_states = value_states.to(torch.float16) + + attn_output = self._flash_attention_forward( query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate ) From d7f16c580195a59a378375211eadf3dbc30d6ac4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 22 Sep 2023 14:39:05 +0000 Subject: [PATCH 88/88] fixup --- src/transformers/models/falcon/modeling_falcon.py | 4 ++-- src/transformers/models/llama/modeling_llama.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index bcecb47084b04f..85a83258517b6f 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -607,7 +607,7 @@ def forward( # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. + # cast them back in float16 just to be sure everything works as expected. input_dtype = query_layer.dtype if input_dtype == torch.float32: logger.warning_once( @@ -615,7 +615,7 @@ def forward( " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" " float16." ) - + query_layer = query_layer.to(torch.float16) key_layer = key_layer.to(torch.float16) value_layer = value_layer.to(torch.float16) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 02a1f37b02f6af..82d0300f60e85f 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -469,10 +469,10 @@ def forward( # It is recommended to use dropout with FA according to the docs # when training. dropout_rate = 0.0 # if not self.training else self.attn_dropout - + # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. + # cast them back in float16 just to be sure everything works as expected. # This might slowdown training & inference so it is recommended to not cast the LayerNorms # in fp32. (LlamaRMSNorm handles it correctly) input_dtype = query_states.dtype @@ -482,11 +482,10 @@ def forward( " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" " float16." 
) - + query_states = query_states.to(torch.float16) key_states = key_states.to(torch.float16) value_states = value_states.to(torch.float16) - attn_output = self._flash_attention_forward( query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate