diff --git a/.gitattributes b/.gitattributes index 7a6ba382df2d9d..6505bc7edf9af5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ *.py eol=lf *.rst eol=lf -*.md eol=lf \ No newline at end of file +*.md eol=lf +*.mdx eol=lf \ No newline at end of file diff --git a/README.md b/README.md index 773b00fd743ffb..12453a52bc5a4b 100644 --- a/README.md +++ b/README.md @@ -321,6 +321,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformer)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. diff --git a/README_ko.md b/README_ko.md index 30d576b413af91..198c806f36db5d 100644 --- a/README_ko.md +++ b/README_ko.md @@ -300,6 +300,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1.
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. diff --git a/README_zh-hans.md b/README_zh-hans.md index a2be92ef70d410..4a697fbcf25c92 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -324,6 +324,7 @@ conda install -c huggingface transformers 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. 
**[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index e66607590ccbe5..5cce396ff53f86 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -336,6 +336,7 @@ conda install -c huggingface transformers 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d668a71afb8578..36580ecb2fcf38 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -342,6 +342,8 @@ title: TAPAS - local: model_doc/tapex title: TAPEX + - local: model_doc/trajectory_transformer + title: Trajectory Transformer - local: model_doc/transfo-xl title: Transformer XL - local: model_doc/trocr diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 3897629b7b1a75..176e0f2de889e1 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -142,6 +142,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](model_doc/trajectory_transformer)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. 1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. 1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. @@ -259,6 +260,7 @@ Flax), PyTorch, and/or TensorFlow.
| Swin | ❌ | ❌ | ✅ | ✅ | ❌ | | T5 | ✅ | ✅ | ✅ | ✅ | ✅ | | TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ | | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | | TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | | UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/trajectory_transformer.mdx b/docs/source/en/model_doc/trajectory_transformer.mdx new file mode 100644 index 00000000000000..da7a55a50eca3e --- /dev/null +++ b/docs/source/en/model_doc/trajectory_transformer.mdx @@ -0,0 +1,49 @@ + + +# Trajectory Transformer + +## Overview + +The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine. + +The abstract from the paper is the following: + +*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models, +leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence +modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards. +Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well +in other domains, such as natural-language processing, can also provide effective solutions to the RL problem. +To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture +to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence +modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common +in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction, +imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with +existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.* + +Tips: + +This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from +actions, states and rewards from all previous timesteps. This model will treat all these elements together +as one big sequence (a trajectory). + +This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer). 
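As a rough, hypothetical sketch of the tip above (editorial illustration, not part of the patch itself), the snippet below feeds an already-discretized trajectory to the checkpoint added in this PR; the random token ids merely stand in for real discretized observations, actions, rewards and values.

```python
import torch

from transformers import TrajectoryTransformerModel

model = TrajectoryTransformerModel.from_pretrained(
    "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
)
model.eval()

# For HalfCheetah, one timestep is 17 observation tokens + 6 action tokens
# + 1 reward token + 1 value token = config.transition_dim (25) tokens.
batch_size, timesteps = 2, 3
seq_length = timesteps * model.config.transition_dim

# Placeholder ids in [0, vocab_size); in practice they come from discretizing
# the continuous states, actions, rewards and values into vocab_size bins.
trajectories = torch.randint(0, model.config.vocab_size, (batch_size, seq_length))

with torch.no_grad():
    outputs = model(trajectories, use_cache=True, return_dict=True)

print(outputs.logits.shape)  # (batch_size, seq_length, vocab_size + 1)
```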
+ +## TrajectoryTransformerConfig + +[[autodoc]] TrajectoryTransformerConfig + + +## TrajectoryTransformerModel + +[[autodoc]] TrajectoryTransformerModel + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d071411ec7f809..9f28d18fdf6f28 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -284,6 +284,10 @@ "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"], "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"], "models.tapex": ["TapexTokenizer"], + "models.trajectory_transformer": [ + "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TrajectoryTransformerConfig", + ], "models.transfo_xl": [ "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "TransfoXLConfig", @@ -1571,6 +1575,13 @@ "load_tf_weights_in_t5", ] ) + _import_structure["models.trajectory_transformer"].extend( + [ + "TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TrajectoryTransformerModel", + "TrajectoryTransformerPreTrainedModel", + ] + ) _import_structure["models.transfo_xl"].extend( [ "TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2788,6 +2799,10 @@ from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer from .models.tapex import TapexTokenizer + from .models.trajectory_transformer import ( + TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + TrajectoryTransformerConfig, + ) from .models.transfo_xl import ( TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig, @@ -3863,6 +3878,11 @@ T5PreTrainedModel, load_tf_weights_in_t5, ) + from .models.trajectory_transformer import ( + TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TrajectoryTransformerModel, + TrajectoryTransformerPreTrainedModel, + ) from .models.transfo_xl import ( TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, AdaptiveEmbedding, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 1552f27023c756..90ce4b1e9b3c46 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -116,6 +116,7 @@ t5, tapas, tapex, + trajectory_transformer, transfo_xl, trocr, unispeech, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 49ad266e509cde..3013f8d87d7821 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -113,6 +113,7 @@ ("swin", "SwinConfig"), ("t5", "T5Config"), ("tapas", "TapasConfig"), + ("trajectory_transformer", "TrajectoryTransformerConfig"), ("transfo-xl", "TransfoXLConfig"), ("trocr", "TrOCRConfig"), ("unispeech", "UniSpeechConfig"), @@ -338,6 +339,7 @@ ("t5v1.1", "T5v1.1"), ("tapas", "TAPAS"), ("tapex", "TAPEX"), + ("trajectory_transformer", "Trajectory Transformer"), ("transfo-xl", "Transformer-XL"), ("trocr", "TrOCR"), ("unispeech", "UniSpeech"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 980c560019b040..dd8324400f5f83 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -108,6 +108,7 @@ ("swin", "SwinModel"), ("t5", "T5Model"), ("tapas", "TapasModel"), + ("trajectory_transformer", "TrajectoryTransformerModel"), ("transfo-xl", "TransfoXLModel"), ("unispeech", "UniSpeechModel"), ("unispeech-sat", "UniSpeechSatModel"), diff --git a/src/transformers/models/trajectory_transformer/__init__.py 
b/src/transformers/models/trajectory_transformer/__init__.py new file mode 100644 index 00000000000000..0b8a6f2c5892d7 --- /dev/null +++ b/src/transformers/models/trajectory_transformer/__init__.py @@ -0,0 +1,68 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_trajectory_transformer": [ + "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "TrajectoryTransformerConfig", + ], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_trajectory_transformer"] = [ + "TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TrajectoryTransformerModel", + "TrajectoryTransformerPreTrainedModel", + "load_tf_weights_in_trajectory_transformer", + ] + + +if TYPE_CHECKING: + from .configuration_trajectory_transformer import ( + TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + TrajectoryTransformerConfig, + ) + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_trajectory_transformer import ( + TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TrajectoryTransformerModel, + TrajectoryTransformerPreTrainedModel, + load_tf_weights_in_trajectory_transformer, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py new file mode 100644 index 00000000000000..537a467c701667 --- /dev/null +++ b/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" TrajectoryTransformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "CarlCochet/trajectory-transformer-halfcheetah-medium-v2": ( + "https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2/resolve/main/config.json" + ), + # See all TrajectoryTransformer models at https://huggingface.co/models?filter=trajectory_transformer +} + + +class TrajectoryTransformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`TrajectoryTransformerModel`]. It is used to + instantiate a TrajectoryTransformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the + TrajectoryTransformer + [CarlCochet/trajectory-transformer-halfcheetah-medium-v2](https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 100): + Vocabulary size of the TrajectoryTransformer model. Defines the number of different tokens that can be + represented by the `trajectories` passed when calling [`TrajectoryTransformerModel`]. + batch_size (`int`, *optional*, defaults to 256): + Size of the batch of trajectories passed to the model. + action_weight (`int`, *optional*, defaults to 5): + Weight of the action in the loss function. + reward_weight (`int`, *optional*, defaults to 1): + Weight of the reward in the loss function. + value_weight (`int`, *optional*, defaults to 1): + Weight of the value in the loss function. + block_size (`int`, *optional*, defaults to 249): + Size of the blocks in the trajectory transformer. + action_dim (`int`, *optional*, defaults to 6): + Dimension of the action space. + observation_dim (`int`, *optional*, defaults to 17): + Dimension of the observation space. + transition_dim (`int`, *optional*, defaults to 25): + Dimension of the transition space. + n_layer (`int`, *optional*, defaults to 4): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + n_embd (`int`, *optional*, defaults to 128): + Dimensionality of the embeddings and hidden states. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`TrajectoryTransformerModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + kaiming_initializer_range (`float`, *optional*, defaults to 1): + A coefficient scaling the negative slope of the kaiming initializer rectifier for EinLinear layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + Example: + + ```python + >>> from transformers import TrajectoryTransformerModel, TrajectoryTransformerConfig + + >>> # Initializing a TrajectoryTransformer CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration + >>> configuration = TrajectoryTransformerConfig() + + >>> # Initializing a model from the CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration + >>> model = TrajectoryTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "trajectory_transformer" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=100, + batch_size=256, + action_weight=5, + reward_weight=1, + value_weight=1, + block_size=249, + action_dim=6, + observation_dim=17, + transition_dim=25, + n_layer=4, + n_head=4, + n_embd=128, + embd_pdrop=0.1, + attn_pdrop=0.1, + resid_pdrop=0.1, + learning_rate=0.0006, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + kaiming_initializer_range=1, + use_cache=True, + is_encoder_decoder=False, + pad_token_id=1, + bos_token_id=50256, + eos_token_id=50256, + **kwargs + ): + self.vocab_size = vocab_size + self.batch_size = batch_size + self.action_weight = action_weight + self.reward_weight = reward_weight + self.value_weight = value_weight + self.max_position_embeddings = max_position_embeddings + self.block_size = block_size + self.action_dim = action_dim + self.observation_dim = observation_dim + self.transition_dim = transition_dim + self.learning_rate = learning_rate + self.n_layer = n_layer + self.n_head = n_head + self.n_embd = n_embd + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.resid_pdrop = resid_pdrop + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.kaiming_initializer_range = kaiming_initializer_range + self.use_cache = use_cache + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 00000000000000..14e6556e07b7a1 --- /dev/null +++ b/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,70 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper
authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TrajectoryTransformer pytorch checkpoint conversion""" + +import torch + +import trajectory.utils as utils +from transformers import TrajectoryTransformerModel + + +class Parser(utils.Parser): + dataset: str = "halfcheetah-medium-expert-v2" + config: str = "config.offline" + + +def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device): + """Converting Sequential blocks to ModuleList""" + + gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device) + trajectory_transformer = TrajectoryTransformerModel(gpt.config) + + trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict()) + trajectory_transformer.pos_emb = gpt.pos_emb + trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict()) + trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict()) + trajectory_transformer.head.load_state_dict(gpt.head.state_dict()) + + for i, block in enumerate(gpt.blocks): + trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict()) + trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict()) + trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict()) + + trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict()) + trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict()) + trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict()) + trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict()) + + torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin") + + +if __name__ == "__main__": + """ + To run this script you will need to install the original repository to run the original model. You can find it + here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the + original pytorch checkpoints. + + Run with the command: + + ```sh + >>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset + ... --gpt_loadpath + ``` + """ + + args = Parser().parse_args("plan") + convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch( + args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device + ) diff --git a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py new file mode 100644 index 00000000000000..f647a13afead44 --- /dev/null +++ b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py @@ -0,0 +1,617 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch TrajectoryTransformer model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import functional as F + +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_trajectory_transformer import TrajectoryTransformerConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2" +_CONFIG_FOR_DOC = "TrajectoryTransformerConfig" + +TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "CarlCochet/trajectory-transformer-halfcheetah-medium-v2", + # See all TrajectoryTransformer models at https://huggingface.co/models?filter=trajectory_transformer +] + + +def load_tf_weights_in_trajectory_transformer(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +@dataclass +class TrajectoryTransformerOutput(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the + attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. GPT2Attentions weights after the attention softmax, used to compute the weighted average + in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class TrajectoryTransformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = TrajectoryTransformerConfig + load_tf_weights = load_tf_weights_in_trajectory_transformer + base_model_prefix = "trajectory_transformer" + main_input_name = "trajectories" + supports_gradient_checkpointing = True + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, TrajectoryTransformerModel): + module.gradient_checkpointing = value + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, EinLinear): + for i in range(module.n_models): + nn.init.kaiming_uniform_(module.weight[i], a=math.sqrt(5) / self.config.kaiming_initializer_range) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight[i]) + bound = (1 / math.sqrt(fan_in)) * self.config.initializer_range + nn.init.uniform_(module.bias[i], -bound, bound) + + +TRAJECTORY_TRANSFORMER_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`TrajectoryTransformerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING = r""" + Args: + trajectories (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Batch of trajectories, where a trajectory is a sequence of states, actions and rewards. + past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`, *optional*): + Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see + `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have + their past given to this model should not be passed as `input_ids` as they have already been computed. + targets (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Desired targets used to compute the loss. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class EinLinear(nn.Module): + def __init__(self, n_models, in_features, out_features, bias): + super().__init__() + self.n_models = n_models + self.out_features = out_features + self.in_features = in_features + self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features)) + if bias: + self.bias = nn.Parameter(torch.Tensor(n_models, out_features)) + else: + self.register_parameter("bias", None) + + def reset_parameters(self): + for i in range(self.n_models): + nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i]) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias[i], -bound, bound) + + def forward(self, input): + """ + Args: + input (`torch.FloatTensor` of shape `(B, n_models, input_dim)`): + The input to the layer. + """ + # [ batch_size x n_models x output_dim ] + output = torch.einsum("eoi,bei->beo", self.weight, input) + if self.bias is not None: + raise RuntimeError() + return output + + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + + if config.n_embd % config.n_head != 0: + raise ValueError(f"n_head ({config.n_head}) should be a divisor of n_embd ({config.n_embd})") + + # key, query, value projections for all heads + self.key = nn.Linear(config.n_embd, config.n_embd) + self.query = nn.Linear(config.n_embd, config.n_embd) + self.value = nn.Linear(config.n_embd, config.n_embd) + + # regularization + self.attn_drop = nn.Dropout(config.attn_pdrop) + self.resid_drop = nn.Dropout(config.resid_pdrop) + + # output projection + self.proj = nn.Linear(config.n_embd, config.n_embd) + + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer( + "mask", + torch.tril(torch.ones(config.block_size, config.block_size)).view( + 1, 1, config.block_size, config.block_size + ), + ) + + # mask previous value estimates + joined_dim = config.observation_dim + config.action_dim + 2 + self.mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0 + + self.n_head = config.n_head + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + batch_size, sequence_length, embedding_dim = hidden_states.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + # [ batch_size x n_heads x sequence_length x head_dim ] + key = ( + self.key(hidden_states) + .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head) + .transpose(1, 2) + ) + query = ( + self.query(hidden_states) + 
.view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head) + .transpose(1, 2) + ) + value = ( + self.value(hidden_states) + .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head) + .transpose(1, 2) + ) + + if layer_past is not None: + past_key, past_value = layer_past + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + # causal self-attention + # [ batch_size x n_heads x sequence_length x sequence_length ] + attn_weights = (torch.matmul(query, key.transpose(-2, -1))) * (1.0 / math.sqrt(key.size(-1))) + attn_weights = attn_weights.masked_fill( + self.mask[:, :, :sequence_length, :sequence_length] == 0, float("-inf") + ) + attn_weights = F.softmax(attn_weights, dim=-1) + self._attn_map = attn_weights.clone() + attn_weights = self.attn_drop(attn_weights) + + output = torch.matmul(attn_weights, value) + # [ batch_size x sequence_length x embedding_dim ] + # re-assemble all head outputs side by side + output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, embedding_dim) + + # output projection + output = self.resid_drop(self.proj(output)) + + outputs = (output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Block(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config.n_embd) + self.ln2 = nn.LayerNorm(config.n_embd) + self.attn = CausalSelfAttention(config) + + # MLP + self.l1 = nn.Linear(config.n_embd, 4 * config.n_embd) + self.act = nn.GELU() + self.l2 = nn.Linear(4 * config.n_embd, config.n_embd) + self.drop = nn.Dropout(config.resid_pdrop) + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + residual = hidden_states + hidden_states = self.ln1(hidden_states) + + attn_outputs = self.attn( + hidden_states, layer_past=layer_past, use_cache=use_cache, output_attentions=output_attentions + ) + attn_output = attn_outputs[0] + outputs = attn_outputs[1:] + hidden_states = attn_output + residual + + residual = hidden_states + hidden_states = self.ln2(hidden_states) + hidden_states = self.l1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.l2(hidden_states) + hidden_states = residual + self.drop(hidden_states) + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs + + +@add_start_docstrings( + "The bare TrajectoryTransformer Model transformer outputting raw hidden-states without any specific head on top.", + TRAJECTORY_TRANSFORMER_START_DOCSTRING, +) +class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel): + """the full GPT language model, with a context size of block_size""" + + def __init__(self, config): + super().__init__(config) + + # input embedding stem (+1 for stop token) + self.tok_emb = nn.Embedding(config.vocab_size * config.transition_dim + 1, config.n_embd) + + self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd)) + self.drop = nn.Dropout(config.embd_pdrop) + # transformer + self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)]) + # decoder head + self.ln_f = nn.LayerNorm(config.n_embd) + self.head = EinLinear(config.transition_dim, config.n_embd, config.vocab_size + 1, bias=False) + + 
self.vocab_size = config.vocab_size + self.stop_token = config.vocab_size * config.transition_dim + self.block_size = config.block_size + + self.observation_dim = config.observation_dim + self.action_dim = config.action_dim + self.transition_dim = config.transition_dim + self.embedding_dim = config.n_embd + + self.action_weight = config.action_weight + self.reward_weight = config.reward_weight + self.value_weight = config.value_weight + + self.gradient_checkpointing = False + + self.post_init() + + def get_block_size(self): + return self.block_size + + def offset_tokens(self, trajectories): + _, sequence_length = trajectories.shape + + n_states = int(np.ceil(sequence_length / self.transition_dim)) + + offsets = torch.arange(self.transition_dim) * self.vocab_size + offsets = offsets.repeat(n_states).to(trajectories.device) + + offset_trajectories = trajectories + offsets[:sequence_length] + offset_trajectories[trajectories == self.vocab_size] = self.stop_token + return offset_trajectories + + def pad_to_full_observation(self, hidden_states): + batch_size, sequence_length, _ = hidden_states.shape + + n_pad = (self.transition_dim - sequence_length % self.transition_dim) % self.transition_dim + padding = torch.zeros(batch_size, n_pad, self.embedding_dim, device=hidden_states.device) + + # [ batch_size x padded_sequence_length' x embedding_dim ] + hidden_states_pad = torch.cat([hidden_states, padding], dim=1) + hidden_states_pad = hidden_states_pad.view(-1, self.transition_dim, self.embedding_dim) + + return hidden_states_pad, n_pad + + @add_start_docstrings_to_model_forward( + TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length") + ) + @replace_return_docstrings(output_type=TrajectoryTransformerOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + trajectories: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + targets: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TrajectoryTransformerModel + >>> import torch + >>> import numpy as np + + >>> model = TrajectoryTransformerModel.from_pretrained( + ... "CarlCochet/trajectory-transformer-halfcheetah-medium-v2" + ... ) + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) + >>> model.eval() + + >>> observations_dim, action_dim, batch_size = 17, 6, 256 + >>> seq_length = observations_dim + action_dim + 1 + + >>> trajectories = torch.LongTensor([np.random.permutation(seq_length) for _ in range(batch_size)]).to( + ... device + ... ) + >>> targets = torch.LongTensor([np.random.permutation(seq_length) for _ in range(batch_size)]).to(device) + + >>> outputs = model( + ... trajectories, + ... targets=targets, + ... use_cache=True, + ... output_attentions=True, + ... output_hidden_states=True, + ... return_dict=True, + ...
) + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if past_key_values is None: + past_key_values = tuple([None] * len(self.blocks)) + + batch_size, sequence_length = trajectories.size() + + if sequence_length > self.block_size: + raise ValueError("Cannot forward, model block size is exhausted.") + + offset_trajectories = self.offset_tokens(trajectories) + # [ batch_size x sequence_length x embedding_dim ] + # forward the GPT model + token_embeddings = self.tok_emb(offset_trajectories) # each index maps to a (learnable) vector + position_embeddings = self.pos_emb[:, :sequence_length, :] # each position maps to a (learnable) vector + + hidden_states = self.drop(token_embeddings + position_embeddings) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, (block, layer_past) in enumerate(zip(self.blocks, past_key_values)): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + layer_past, + use_cache, + output_attentions, + ) + else: + outputs = block(hidden_states, layer_past, use_cache, output_attentions) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # [ batch_size x sequence_length x embedding_dim ] + hidden_state = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states_pad, n_pad = self.pad_to_full_observation(hidden_state) + + logits = self.head(hidden_states_pad) + logits = logits.reshape(batch_size, sequence_length + n_pad, self.vocab_size + 1) + logits = logits[:, :sequence_length] + + # if we are given some desired targets also calculate the loss + if targets is not None: + loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.view(-1), reduction="none") + if self.action_weight != 1 or self.reward_weight != 1 or self.value_weight != 1: + # make weights + n_states = int(np.ceil(sequence_length / self.transition_dim)) + weights = torch.cat( + [ + torch.ones(self.observation_dim, device=trajectories.device), + torch.ones(self.action_dim, device=trajectories.device) * self.action_weight, + torch.ones(1, device=trajectories.device) * self.reward_weight, + torch.ones(1, device=trajectories.device) * self.value_weight, + ] + ) + weights = weights.repeat(n_states) + weights = weights[1:].repeat(batch_size, 1) + loss = loss * weights.view(-1) + loss = (loss * attention_mask.view(-1)).mean() + else: + loss = None + + if not return_dict: + return tuple(v for v in [loss, logits, presents, all_hidden_states, all_self_attentions] if v is not None) + + return TrajectoryTransformerOutput( + loss=loss, + logits=logits, + past_key_values=presents, + hidden_states=all_hidden_states, + 
attentions=all_self_attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 43a4477fb49afb..fc8f7448d4fff4 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4028,6 +4028,23 @@ def load_tf_weights_in_t5(*args, **kwargs): requires_backends(load_tf_weights_in_t5, ["torch"]) +TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TrajectoryTransformerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class TrajectoryTransformerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/trajectory_transformer/__init__.py b/tests/models/trajectory_transformer/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py new file mode 100644 index 00000000000000..7cf5c741a1f6fa --- /dev/null +++ b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py @@ -0,0 +1,275 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch TrajectoryTransformer model. 
""" + + +import inspect +import unittest + +import numpy as np + +from transformers import TrajectoryTransformerConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import TrajectoryTransformerModel + from transformers.models.trajectory_transformer.modeling_trajectory_transformer import ( + TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + ) + + +class TrajectoryTransformerModelTester: + def __init__(self, parent, batch_size=13, n_embd=128, action_dim=6, observation_dim=17, is_training=True): + self.parent = parent + self.batch_size = batch_size + self.n_embd = n_embd + self.action_dim = action_dim + self.observation_dim = observation_dim + self.is_training = is_training + self.seq_length = self.action_dim + self.observation_dim + 1 + + def prepare_config_and_inputs(self): + trajectories = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(self.batch_size)]).to( + torch_device + ) + attention_mask = random_attention_mask((self.batch_size, self.seq_length)).to(torch_device) + targets = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(self.batch_size)]).to( + torch_device + ) + + config = self.get_config() + return config, trajectories, attention_mask, targets + + def get_config(self): + return TrajectoryTransformerConfig( + batch_size=self.batch_size, + n_embd=self.n_embd, + action_dim=self.action_dim, + observation_dim=self.observation_dim, + ) + + def create_and_check_model(self, config, input_dict): + model = TrajectoryTransformerModel(config=config) + model.to(torch_device) + model.eval() + + result = model(trajectories=input_dict["trajectories"], attention_mask=input_dict["attention_mask"]) + result = model( + trajectories=input_dict["trajectories"], + output_hidden_states=True, + output_attentions=True, + use_cache=True, + return_dict=True, + ) + + self.parent.assertEqual(result.hidden_states[-1].shape, (self.batch_size, self.seq_length, self.n_embd)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, trajectories, attention_mask, targets) = config_and_inputs + inputs_dict = {"trajectories": trajectories, "attention_mask": attention_mask, "targets": targets} + return config, inputs_dict + + +@require_torch +class TrajectoryTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = (TrajectoryTransformerModel,) if is_torch_available() else () + + # Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids + test_generate_without_input_ids = False + + # Ignoring of a failing tests from ModelTesterMixin, as the model does not implement these features + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_attention_outputs = False + test_hidden_states_output = False + test_inputs_embeds = False + test_model_common_attributes = False + test_torchscript = False + + def setUp(self): + self.model_tester = TrajectoryTransformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=TrajectoryTransformerConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_conditional_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["trajectories"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # # Input is 'trajectories' not 'input_ids' + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(TrajectoryTransformerModel, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(TrajectoryTransformerModel.main_input_name, observed_main_input_name) + + def test_retain_grad_hidden_states_attentions(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = self.has_attentions + + model = TrajectoryTransformerModel(config) + model.to(torch_device) + + outputs = model( + trajectories=input_dict["trajectories"], + attention_mask=input_dict["attention_mask"], + targets=input_dict["targets"], + output_hidden_states=True, + output_attentions=True, + use_cache=True, + return_dict=True, + ) + + output = outputs[0] + hidden_states = outputs.hidden_states[0] + hidden_states.retain_grad() + + if self.has_attentions: + attentions = outputs.attentions[0] + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + + if self.has_attentions: + self.assertIsNotNone(attentions.grad) + + def test_training(self): + if not self.model_tester.is_training: + return + + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + + model = TrajectoryTransformerModel(config) + model.to(torch_device) + model.train() + loss = model( + trajectories=input_dict["trajectories"], + attention_mask=input_dict["attention_mask"], + targets=input_dict["targets"], + output_hidden_states=True, + output_attentions=True, + use_cache=True, + return_dict=True, + ).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + if not self.model_tester.is_training: + return + + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + + model = TrajectoryTransformerModel(config) + model.gradient_checkpointing_enable() + model.to(torch_device) + model.train() + loss = model( + trajectories=input_dict["trajectories"], + attention_mask=input_dict["attention_mask"], + targets=input_dict["targets"], + output_hidden_states=True, + output_attentions=True, + use_cache=False, + return_dict=True, + ).loss + loss.backward() + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], 
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @slow + def test_model_from_pretrained(self): + for model_name in TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TrajectoryTransformerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class TrajectoryTransformerModelIntegrationTest(unittest.TestCase): + @slow + def test_prediction(self): + batch_size = 1 + + config = TrajectoryTransformerConfig.from_pretrained("CarlCochet/trajectory-transformer-halfcheetah-medium-v2") + model = TrajectoryTransformerModel.from_pretrained( + "CarlCochet/trajectory-transformer-halfcheetah-medium-v2", config=config + ) + model.to(torch_device) + model.eval() + + seq_length = model.config.action_dim + model.config.observation_dim + 1 + + trajectories = torch.LongTensor( + [[3, 19, 20, 22, 9, 7, 23, 10, 18, 14, 13, 4, 17, 11, 5, 6, 15, 21, 2, 8, 1, 0, 12, 16]] + ).to(torch_device) + outputs = model( + trajectories=trajectories, + output_hidden_states=True, + output_attentions=True, + use_cache=True, + return_dict=True, + ) + + output = outputs.logits + + expected_shape = torch.Size((batch_size, seq_length, model.config.vocab_size + 1)) + expected_slice = torch.tensor( + [[[-0.7193, -0.2532, -0.0898], [1.9429, 2.0434, 2.3975], [-3.3651, -2.8744, -2.4532]]] + ).to(torch_device) + output_slice = output[:, :3, :3] + + self.assertEqual(output.shape, expected_shape) + self.assertTrue(torch.allclose(output_slice, expected_slice, atol=1e-4))