diff --git a/README.md b/README.md index 09362378f3..f0358cc4e9 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,6 @@ print(HelloBot(['Hello!', 'Boo...', 'Bye.'])) [Intent/Sentence Classification](http://docs.deeppavlov.ai/en/latest/components/classifiers.html) | [Sentence Similarity/Ranking](http://docs.deeppavlov.ai/en/latest/components/neural_ranking.html) -[Goal(Task)-oriented Bot](http://docs.deeppavlov.ai/en/latest/components/go_bot.html) | [Seq2seq Goal-Oriented bot](http://docs.deeppavlov.ai/en/latest/components/seq2seq_go_bot.html) - [Question Answering over Text (SQuAD)](http://docs.deeppavlov.ai/en/latest/components/squad.html) [Morphological tagging](http://docs.deeppavlov.ai/en/latest/components/morphotagger.html) | [Automatic Spelling Correction](http://docs.deeppavlov.ai/en/latest/components/spelling_correction.html) diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py index d14adda914..0beee8d2dd 100644 --- a/deeppavlov/__init__.py +++ b/deeppavlov/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.0.6.6' +__version__ = '0.0.7' __author__ = 'Neural Networks and Deep Learning lab, MIPT' __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.' __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] diff --git a/deeppavlov/configs/sentiment/insults_kaggle.json b/deeppavlov/configs/classifiers/insults_kaggle.json similarity index 88% rename from deeppavlov/configs/sentiment/insults_kaggle.json rename to deeppavlov/configs/classifiers/insults_kaggle.json index e60cb10c61..606f2f5b7b 100644 --- a/deeppavlov/configs/sentiment/insults_kaggle.json +++ b/deeppavlov/configs/classifiers/insults_kaggle.json @@ -37,20 +37,24 @@ "name": "dirty_comments_preprocessor" }, { + "in": "x_prep", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin", "load_path": "embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin", "dim": 300 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x_prep" + "x_emb" ], "in_y": [ "y" @@ -61,8 +65,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "sentiment/insults_kaggle_v0", - "load_path": "sentiment/insults_kaggle_v0", + "save_path": "classifiers/insults_kaggle_v0", + "load_path": "classifiers/insults_kaggle_v0", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -81,9 +86,7 @@ "coef_reg_den": 1e-2, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -117,7 +120,7 @@ }, "download": [ "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/sentiment.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/datasets/insults_data.tar.gz", { "url": "http://files.deeppavlov.ai/embeddings/reddit_fastText/wordpunct_tok_reddit_comments_2017_11_300.bin", diff --git a/deeppavlov/configs/intents/intents_dstc2.json b/deeppavlov/configs/classifiers/intents_dstc2.json similarity index 78% rename from deeppavlov/configs/intents/intents_dstc2.json rename to 
deeppavlov/configs/classifiers/intents_dstc2.json index e6eeb6df70..25499b4aff 100644 --- a/deeppavlov/configs/intents/intents_dstc2.json +++ b/deeppavlov/configs/classifiers/intents_dstc2.json @@ -5,21 +5,7 @@ }, "dataset_iterator": { "name": "dstc2_intents_iterator", - "seed": 42, - "fields_to_merge": [ - "train", - "valid" - ], - "merged_field": "train", - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "seed": 42 }, "chainer": { "in": [ @@ -36,24 +22,28 @@ "y" ], "level": "token", - "save_path": "vocabs/classes.dict", - "load_path": "vocabs/classes.dict" + "save_path": "vocabs/dstc2_classes.dict", + "load_path": "vocabs/dstc2_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -64,8 +54,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intents_dstc2_v4", - "load_path": "intents/intents_dstc2_v4", + "save_path": "classifiers/intents_dstc2_v4", + "load_path": "classifiers/intents_dstc2_v4", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -83,9 +74,7 @@ "coef_reg_den": 1e-4, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -116,7 +105,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - "http://files.deeppavlov.ai/deeppavlov_data/intents.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", diff --git a/deeppavlov/configs/intents/intents_dstc2_big.json b/deeppavlov/configs/classifiers/intents_dstc2_big.json similarity index 76% rename from deeppavlov/configs/intents/intents_dstc2_big.json rename to deeppavlov/configs/classifiers/intents_dstc2_big.json index 9e7be12d95..f89e56b60b 100644 --- a/deeppavlov/configs/intents/intents_dstc2_big.json +++ b/deeppavlov/configs/classifiers/intents_dstc2_big.json @@ -5,21 +5,7 @@ }, "dataset_iterator": { "name": "dstc2_intents_iterator", - "seed": 42, - "fields_to_merge": [ - "train", - "valid" - ], - "merged_field": "train", - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "seed": 42 }, "chainer": { "in": [ @@ -36,24 +22,28 @@ "y" ], "level": "token", - "save_path": "vocabs/classes.dict", - "load_path": "vocabs/classes.dict" + "save_path": "vocabs/dstc2_classes.dict", + "load_path": "vocabs/dstc2_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/wiki.en.bin", "load_path": "embeddings/wiki.en.bin", "dim": 300 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -64,8 +54,9 @@ ], "main": true, "name": 
"keras_classification_model", - "save_path": "intents/intents_dstc2_v5", - "load_path": "intents/intents_dstc2_v5", + "save_path": "classifiers/intents_dstc2_v5", + "load_path": "classifiers/intents_dstc2_v5", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -83,9 +74,7 @@ "coef_reg_den": 1e-4, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -104,7 +93,9 @@ "validation_patience": 5, "val_every_n_epochs": 5, "log_every_n_batches": 100, - "show_examples": false + "show_examples": false, + "validate_best": true, + "test_best": true }, "metadata": { "requirements": [ @@ -115,7 +106,7 @@ "telegram_utils": "IntentModel" }, "download": [ - "http://files.deeppavlov.ai/deeppavlov_data/intents.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", diff --git a/deeppavlov/configs/intents/intents_sample_csv.json b/deeppavlov/configs/classifiers/intents_sample_csv.json similarity index 88% rename from deeppavlov/configs/intents/intents_sample_csv.json rename to deeppavlov/configs/classifiers/intents_sample_csv.json index 413b50fde4..19bd282da6 100644 --- a/deeppavlov/configs/intents/intents_sample_csv.json +++ b/deeppavlov/configs/classifiers/intents_sample_csv.json @@ -44,20 +44,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -68,8 +72,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intents_snips_v4", - "load_path": "intents/intents_snips_v4", + "save_path": "classifiers/intents_snips_v4", + "load_path": "classifiers/intents_snips_v4", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -87,9 +92,7 @@ "coef_reg_den": 1e-4, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -122,7 +125,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - "http://files.deeppavlov.ai/deeppavlov_data/intents.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", { "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", diff --git a/deeppavlov/configs/intents/intents_sample_json.json b/deeppavlov/configs/classifiers/intents_sample_json.json similarity index 88% rename from deeppavlov/configs/intents/intents_sample_json.json rename to deeppavlov/configs/classifiers/intents_sample_json.json index f8ef1858a3..ae58e5f321 100644 --- a/deeppavlov/configs/intents/intents_sample_json.json +++ b/deeppavlov/configs/classifiers/intents_sample_json.json @@ -39,20 +39,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": 
"wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -63,8 +67,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intents_snips_v4", - "load_path": "intents/intents_snips_v4", + "save_path": "classifiers/intents_snips_v4", + "load_path": "classifiers/intents_snips_v4", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -82,9 +87,7 @@ "coef_reg_den": 1e-4, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -117,7 +120,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - "http://files.deeppavlov.ai/deeppavlov_data/intents.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", { "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.json", diff --git a/deeppavlov/configs/intents/intents_snips.json b/deeppavlov/configs/classifiers/intents_snips.json similarity index 87% rename from deeppavlov/configs/intents/intents_snips.json rename to deeppavlov/configs/classifiers/intents_snips.json index ac22758ccf..d77232df88 100644 --- a/deeppavlov/configs/intents/intents_snips.json +++ b/deeppavlov/configs/classifiers/intents_snips.json @@ -37,20 +37,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -61,8 +65,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intents_snips_v4", - "load_path": "intents/intents_snips_v4", + "save_path": "classifiers/intents_snips_v4", + "load_path": "classifiers/intents_snips_v4", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -80,9 +85,7 @@ "coef_reg_den": 1e-4, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -101,7 +104,7 @@ "validation_patience": 5, "val_every_n_epochs": 5, "log_every_n_epochs": 5, - "show_examples": false, + "show_examples": true, "validate_best": true, "test_best": false }, @@ -115,7 +118,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - "http://files.deeppavlov.ai/deeppavlov_data/intents.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", { "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", diff --git a/deeppavlov/configs/classifiers/intents_snips_big.json b/deeppavlov/configs/classifiers/intents_snips_big.json new file mode 100644 index 0000000000..0f8651df31 --- /dev/null +++ b/deeppavlov/configs/classifiers/intents_snips_big.json @@ -0,0 +1,133 @@ +{ + 
"dataset_reader": { + "name": "basic_classification_reader", + "x": "text", + "y": "intents", + "data_path": "snips" + }, + "dataset_iterator": { + "name": "basic_classification_iterator", + "seed": 42, + "field_to_split": "train", + "split_fields": [ + "train", + "valid" + ], + "split_proportions": [ + 0.9, + 0.1 + ] + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "id": "classes_vocab", + "name": "default_vocab", + "fit_on": [ + "y" + ], + "level": "token", + "save_path": "vocabs/snips_classes.dict", + "load_path": "vocabs/snips_classes.dict" + }, + { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", + "id": "my_embedder", + "name": "fasttext", + "save_path": "embeddings/wiki.en.bin", + "load_path": "embeddings/wiki.en.bin", + "dim": 300 + }, + { + "in": [ + "x_emb" + ], + "in_y": [ + "y" + ], + "out": [ + "y_labels", + "y_probas_dict" + ], + "main": true, + "name": "keras_classification_model", + "save_path": "classifiers/intents_snips_v5", + "load_path": "classifiers/intents_snips_v5", + "embedding_size": "#my_embedder.dim", + "classes": "#classes_vocab.keys()", + "kernel_sizes_cnn": [ + 1, + 2, + 3 + ], + "filters_cnn": 256, + "confident_threshold": 1, + "optimizer": "Adam", + "lear_rate": 0.01, + "lear_rate_decay": 0.1, + "loss": "categorical_crossentropy", + "text_size": 15, + "coef_reg_cnn": 1e-4, + "coef_reg_den": 1e-4, + "dropout_rate": 0.5, + "dense_size": 100, + "model_name": "cnn_model" + } + ], + "out": [ + "y_labels", + "y_probas_dict" + ] + }, + "train": { + "epochs": 1000, + "batch_size": 64, + "metrics": [ + "classification_accuracy", + "classification_f1", + "classification_roc_auc" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": true, + "validate_best": true, + "test_best": false + }, + "metadata": { + "requirements": [ + "../dp_requirements/tf.txt", + "../dp_requirements/fasttext.txt" + ], + "labels": { + "telegram_utils": "IntentModel", + "server_utils": "KerasIntentModel" + }, + "download": [ + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", + { + "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", + "subdir": "snips" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", + "subdir": "embeddings" + } + ] + } +} diff --git a/deeppavlov/configs/classifiers/rusentiment_cnn.json b/deeppavlov/configs/classifiers/rusentiment_cnn.json new file mode 100644 index 0000000000..7518bfbc1b --- /dev/null +++ b/deeppavlov/configs/classifiers/rusentiment_cnn.json @@ -0,0 +1,149 @@ +{ + "dataset_reader": { + "name": "basic_classification_reader", + "x": "text", + "y": "label", + "data_path": "rusentiment/", + "train": "rusentiment_random_posts.csv", + "test": "rusentiment_test.csv" + }, + "dataset_iterator": { + "name": "basic_classification_iterator", + "seed": 42, + "field_to_split": "train", + "split_fields": [ + "train", + "valid" + ], + "split_proportions": [ + 0.9, + 0.1 + ] + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "id": "classes_vocab", + "name": "default_vocab", + "fit_on": [ + "y" + ], + "level": "token", + "save_path": "vocabs/rusentiment_classes.dict", + "load_path": "vocabs/rusentiment_classes.dict" + }, + { + "in": [ + "x" + ], + "out": [ + "x_prep" + ], + "name": "dirty_comments_preprocessor" + }, + { + 
"in": "x_prep", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", + "id": "my_embedder", + "name": "fasttext", + "save_path": "embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "load_path": "embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "dim": 300 + }, + { + "in": [ + "x_emb" + ], + "in_y": [ + "y" + ], + "out": [ + "y_labels", + "y_probas_dict" + ], + "main": true, + "name": "keras_classification_model", + "save_path": "classifiers/rusentiment_v0", + "load_path": "classifiers/rusentiment_v0", + "embedding_size": "#my_embedder.dim", + "classes": "#classes_vocab.keys()", + "kernel_sizes_cnn": [ + 1, + 2, + 3 + ], + "filters_cnn": 256, + "confident_threshold": 1, + "optimizer": "Adam", + "lear_rate": 0.01, + "lear_rate_decay": 0.1, + "loss": "binary_crossentropy", + "text_size": 40, + "last_layer_activation": "softmax", + "coef_reg_cnn": 1e-3, + "coef_reg_den": 1e-2, + "dropout_rate": 0.5, + "dense_size": 100, + "model_name": "cnn_model" + } + ], + "out": [ + "y_labels", + "y_probas_dict" + ] + }, + "train": { + "epochs": 100, + "batch_size": 64, + "metrics": [ + "classification_f1", + "classification_roc_auc", + "classification_accuracy" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "validate_best": true, + "test_best": true + }, + "metadata": { + "requirements": [ + "../dp_requirements/tf.txt", + "../dp_requirements/fasttext.txt" + ], + "labels": { + "telegram_utils": "IntentModel", + "server_utils": "KerasIntentModel" + }, + "download": [ + "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", + { + "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_random_posts.csv", + "subdir": "rusentiment" + }, + { + "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_test.csv", + "subdir": "rusentiment" + }, + { + "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "subdir": "embeddings" + } + ] + } +} diff --git a/deeppavlov/configs/sentiment/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json similarity index 85% rename from deeppavlov/configs/sentiment/sentiment_twitter.json rename to deeppavlov/configs/classifiers/sentiment_twitter.json index be211c40c7..394948c024 100644 --- a/deeppavlov/configs/sentiment/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -28,29 +28,24 @@ "load_path": "vocabs/sentiment_twitter_classes.dict" }, { - "in": [ - "x" - ], - "out": [ - "x_prep" - ], - "name": "dirty_comments_preprocessor" + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" }, { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", "load_path": "embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", "dim": 300 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x_prep" + "x_emb" ], "in_y": [ "y" @@ -61,8 +56,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "sentiment/sentiment_twitter_v1", - "load_path": 
"sentiment/sentiment_twitter_v1", + "save_path": "classifiers/sentiment_twitter_v3", + "load_path": "classifiers/sentiment_twitter_v3", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -81,9 +77,7 @@ "coef_reg_den": 1e-2, "dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -117,7 +111,7 @@ }, "download": [ "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/sentiment.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz", { "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", diff --git a/deeppavlov/configs/classifiers/sentiment_twitter_preproc.json b/deeppavlov/configs/classifiers/sentiment_twitter_preproc.json new file mode 100644 index 0000000000..58b159abaa --- /dev/null +++ b/deeppavlov/configs/classifiers/sentiment_twitter_preproc.json @@ -0,0 +1,132 @@ +{ + "dataset_reader": { + "name": "basic_classification_reader", + "x": "Twit", + "y": "Class", + "data_path": "sentiment_twitter_data" + }, + "dataset_iterator": { + "name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "id": "classes_vocab", + "name": "default_vocab", + "fit_on": [ + "y" + ], + "level": "token", + "save_path": "vocabs/sentiment_twitter_classes.dict", + "load_path": "vocabs/sentiment_twitter_classes.dict" + }, + { + "in": [ + "x" + ], + "out": [ + "x_prep" + ], + "name": "dirty_comments_preprocessor", + "delete_smile_brackets": true + }, + { + "in": "x_prep", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", + "id": "my_embedder", + "name": "fasttext", + "save_path": "embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "load_path": "embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "dim": 300 + }, + { + "in": [ + "x_emb" + ], + "in_y": [ + "y" + ], + "out": [ + "y_labels", + "y_probas_dict" + ], + "main": true, + "name": "keras_classification_model", + "save_path": "classifiers/sentiment_twitter_v2", + "load_path": "classifiers/sentiment_twitter_v2", + "embedding_size": "#my_embedder.dim", + "classes": "#classes_vocab.keys()", + "kernel_sizes_cnn": [ + 1, + 2, + 3 + ], + "filters_cnn": 256, + "confident_threshold": 0.5, + "optimizer": "Adam", + "lear_rate": 0.01, + "lear_rate_decay": 0.1, + "loss": "binary_crossentropy", + "text_size": 100, + "last_layer_activation": "softmax", + "coef_reg_cnn": 1e-3, + "coef_reg_den": 1e-2, + "dropout_rate": 0.5, + "dense_size": 100, + "model_name": "cnn_model" + } + ], + "out": [ + "y_labels", + "y_probas_dict" + ] + }, + "train": { + "epochs": 100, + "batch_size": 64, + "metrics": [ + "classification_roc_auc", + "classification_f1", + "classification_accuracy" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "validate_best": true, + "test_best": true + }, + "metadata": { + "requirements": [ + "../dp_requirements/tf.txt", + "../dp_requirements/fasttext.txt" + ], + "labels": { + "telegram_utils": "IntentModel", + "server_utils": "KerasIntentModel" + }, + "download": [ + 
"http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", + "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz", + { + "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "subdir": "embeddings" + } + ] + } +} diff --git a/deeppavlov/configs/sentiment/sentiment_ag_news.json b/deeppavlov/configs/classifiers/topic_ag_news.json similarity index 87% rename from deeppavlov/configs/sentiment/sentiment_ag_news.json rename to deeppavlov/configs/classifiers/topic_ag_news.json index c6c6fda144..dec39bc95e 100644 --- a/deeppavlov/configs/sentiment/sentiment_ag_news.json +++ b/deeppavlov/configs/classifiers/topic_ag_news.json @@ -26,18 +26,6 @@ "save_path": "vocabs/ag_news_classes.dict", "load_path": "vocabs/ag_news_classes.dict" }, - { - "id": "my_embedder", - "name": "fasttext", - "save_path": "embeddings/wiki.en.bin", - "load_path": "embeddings/wiki.en.bin", - "dim": 100 - }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ "x" @@ -47,9 +35,25 @@ ], "name": "str_lower" }, + { + "in": "x_lower", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", + "id": "my_embedder", + "name": "fasttext", + "save_path": "embeddings/wiki.en.bin", + "load_path": "embeddings/wiki.en.bin", + "dim": 300 + }, { "in": [ - "x_lower" + "x_emb" ], "in_y": [ "y" @@ -60,8 +64,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "sentiment/sentiment_ag_news_v0", - "load_path": "sentiment/sentiment_ag_news_v0", + "save_path": "classifiers/topic_ag_news_v1", + "load_path": "classifiers/topic_ag_news_v1", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -80,9 +85,7 @@ "dropout_rate": 0.5, "dense_size": 100, "last_layer_activation": "softmax", - "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_model" } ], "out": [ @@ -115,7 +118,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - "http://files.deeppavlov.ai/deeppavlov_data/sentiment.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "http://files.deeppavlov.ai/datasets/ag_news_data.tar.gz", { diff --git a/deeppavlov/configs/evolution/evolve_intents_snips.json b/deeppavlov/configs/evolution/evolve_intents_snips.json index 311570ba4b..b11dc801aa 100644 --- a/deeppavlov/configs/evolution/evolve_intents_snips.json +++ b/deeppavlov/configs/evolution/evolve_intents_snips.json @@ -43,29 +43,29 @@ "load_path": "vocabs/snips_classes.dict" }, { - "in": [ - "x" - ], - "out": [ - "x_lower" - ], + "in": "x", + "out": "x_lower", "name": "str_lower" }, { - "id": "my_embedder", - "name": "fasttext", - "save_path": "embeddings/dstc2_fastText_model.bin", - "load_path": "embeddings/dstc2_fastText_model.bin", - "dim": 100 - }, - { + "in": "x_lower", + "out": "x_tok", "id": "my_tokenizer", "name": "nltk_tokenizer", "tokenizer": "wordpunct_tokenize" }, + { + "in": "x_tok", + "out": "x_emb", + "id": "my_embedder", + "name": "fasttext", + "save_path": "embeddings/wiki.en.bin", + "load_path": "embeddings/wiki.en.bin", + "dim": 300 + }, { "in": [ - "x_lower" + "x_emb" ], "in_y": [ "y" @@ -75,9 +75,10 @@ "y_probas_dict" ], "main": true, - 
"name": "intent_model", + "name": "keras_classification_model", "save_path": "evolution/classification/intents_snips", "load_path": "evolution/classification/intents_snips", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -92,8 +93,7 @@ "discrete": true }, "confident_threshold": { - "evolve_choice": true, - "values": [ + "evolve_choice": [ 0.5, 1 ] @@ -113,7 +113,7 @@ ], "scale": "log" }, - "loss": "binary_crossentropy", + "loss": "categorical_crossentropy", "text_size": 15, "coef_reg_cnn": { "evolve_range": [ @@ -141,8 +141,6 @@ "discrete": true }, "model_name": "cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer", "check_bool": { "evolve_bool": true } @@ -192,7 +190,7 @@ "subdir": "snips" }, { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", + "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", "subdir": "embeddings" } ] diff --git a/deeppavlov/configs/evolution/evolve_rusentiment_cnn.json b/deeppavlov/configs/evolution/evolve_rusentiment_cnn.json new file mode 100644 index 0000000000..32ee92b9b5 --- /dev/null +++ b/deeppavlov/configs/evolution/evolve_rusentiment_cnn.json @@ -0,0 +1,192 @@ +{ + "dataset_reader": { + "name": "basic_classification_reader", + "x": "text", + "y": "label", + "data_path": "/home/dilyara/evolution_data/rusentiment_data/", + "train": "rusentiment_random_posts.csv", + "test": "rusentiment_test.csv" + }, + "dataset_iterator": { + "name": "basic_classification_iterator", + "seed": 42, + "field_to_split": "train", + "split_fields": [ + "train", + "valid" + ], + "split_proportions": [ + 0.9, + 0.1 + ] + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "id": "classes_vocab", + "name": "default_vocab", + "fit_on": [ + "y" + ], + "level": "token", + "save_path": "/home/dilyara/evolution_data/rusentiment_classification/rusentiment_classes.dict", + "load_path": "/home/dilyara/evolution_data/rusentiment_classification/rusentiment_classes.dict" + }, + { + "in": [ + "x" + ], + "out": [ + "x_prep" + ], + "name": "dirty_comments_preprocessor" + }, + { + "in": "x_prep", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", + "id": "my_embedder", + "name": "fasttext", + "save_path": "/home/dilyara/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "load_path": "/home/dilyara/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "dim": 300 + }, + { + "in": [ + "x_emb" + ], + "in_y": [ + "y" + ], + "out": [ + "y_labels", + "y_probas_dict" + ], + "main": true, + "name": "keras_classification_model", + "save_path": "/home/dilyara/evolution_data/rusentiment_classification/rusentiment_v0", + "load_path": "/home/dilyara/evolution_data/rusentiment_classification/rusentiment_v0", + "embedding_size": "#my_embedder.dim", + "classes": "#classes_vocab.keys()", + "kernel_sizes_cnn": [ + 1, + 2, + 3 + ], + "filters_cnn": { + "evolve_range": [ + 50, + 100 + ], + "discrete": true + }, + "confident_threshold": 1, + "optimizer": "Adam", + "lear_rate": { + "evolve_range": [ + 1e-4, + 1e-1 + ] + }, + "lear_rate_decay": { + "evolve_range": [ + 1e-6, + 1e-2 + ] + }, + "loss": "categorical_crossentropy", + "text_size": 100, + "last_layer_activation": "softmax", + "coef_reg_cnn": { + "evolve_range": [ + 1e-6, + 1e-2 + ] + }, + "coef_reg_den": { + "evolve_range": [ + 1e-6, + 1e-2 + ] + }, + 
"dropout_rate": { + "evolve_range": [ + 0, + 1 + ] + }, + "dense_size": { + "evolve_range": [ + 50, + 100 + ], + "discrete": true + }, + "model_name": "cnn_model" + } + ], + "out": [ + "y_labels", + "y_probas_dict" + ] + }, + "train": { + "epochs": 100, + "batch_size": { + "evolve_range": [ + 50, + 200 + ], + "discrete": true + }, + "metrics": [ + "classification_f1", + "classification_roc_auc", + "classification_accuracy" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "validate_best": true, + "test_best": true + }, + "metadata": { + "requirements": [ + "../dp_requirements/tf.txt", + "../dp_requirements/fasttext.txt" + ], + "labels": { + "telegram_utils": "IntentModel", + "server_utils": "KerasIntentModel" + }, + "download": [ + "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz", + { + "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_random_posts.csv", + "subdir": "rusentiment" + }, + { + "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_test.csv", + "subdir": "rusentiment" + }, + { + "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin", + "subdir": "embeddings" + } + ] + } +} diff --git a/deeppavlov/configs/faq/fasttext_avg_autofaq.json b/deeppavlov/configs/faq/fasttext_avg_autofaq.json new file mode 100644 index 0000000000..babbab8056 --- /dev/null +++ b/deeppavlov/configs/faq/fasttext_avg_autofaq.json @@ -0,0 +1,68 @@ +{ + "dataset_reader": { + "name": "faq_reader", + "x_col_name": "Question", + "y_col_name": "Answer", + "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" + }, + "dataset_iterator": { + "name": "data_learning_iterator" + }, + + "chainer": { + "in": "question", + "pipe": [ + { + "name": "ru_tokenizer", + "in": "question", + "lemmas": true, + "out": "q_token_lemmas" + }, + { + "name": "fasttext", + "in": "q_token_lemmas", + "load_path": "embeddings/lenta_lower_100.bin", + "out": "tokens_fasttext_vectors" + }, + { + "name": "sentence2vector_w2v_avg", + "in": ["q_token_lemmas", "tokens_fasttext_vectors"], + "out": "question_vector" + }, + { + "name": "cos_sim_classifier", + "in": "question_vector", + "fit_on": ["question_vector", "y"], + "top_n": 1, + "save_path": "faq/fasttext_cos_classifier.pkl", + "load_path": "faq/fasttext_cos_classifier.pkl", + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + + "train": { + "validate_best": false, + "test_best": false + }, + "metadata": { + "requirements": [ + "../dp_requirements/fasttext.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/faq/school/fasttext_cos_classifier.pkl", + "subdir": "faq" + }, + { + "url": "http://files.deeppavlov.ai/embeddings/lenta_lower_100.bin", + "subdir": "embeddings" + } + ] + } + +} + + + diff --git a/deeppavlov/configs/faq/fasttext_tfidf_autofaq.json b/deeppavlov/configs/faq/fasttext_tfidf_autofaq.json new file mode 100644 index 0000000000..ae48b91440 --- /dev/null +++ b/deeppavlov/configs/faq/fasttext_tfidf_autofaq.json @@ -0,0 +1,73 @@ +{ + "dataset_reader": { + "name": "faq_reader", + "x_col_name": "Question", + "y_col_name": "Answer", + "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" + }, + "dataset_iterator": { + "name": "data_learning_iterator" + }, + + "chainer": { + "in": "question", + "pipe": [ + { + "name": 
"ru_tokenizer", + "in": "question", + "lemmas": true, + "out": "q_token_lemmas" + }, + { + "name": "fasttext", + "in": "q_token_lemmas", + "load_path": "embeddings/lenta_lower_100.bin", + "out": "tokens_fasttext_vectors" + }, + { + "name": "sentence2vector_w2v_tfidf", + "in": ["q_token_lemmas", "tokens_fasttext_vectors"], + "load_path": "vectorizer/tfidf_vectorizer_ruwiki.pkl", + "out": "question_vector" + }, + { + "name": "cos_sim_classifier", + "in": "question_vector", + "fit_on": ["question_vector", "y"], + "top_n": 1, + "save_path": "faq/fasttext_cos_classifier.pkl", + "load_path": "faq/fasttext_cos_classifier.pkl", + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + + "train": { + "validate_best": false, + "test_best": false + }, + "metadata": { + "requirements": [ + "../dp_requirements/fasttext.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/faq/school/fasttext_cos_classifier.pkl", + "subdir": "faq" + }, + { + "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", + "subdir": "vectorizer" + }, + { + "url": "http://files.deeppavlov.ai/embeddings/lenta_lower_100.bin", + "subdir": "embeddings" + } + ] + } + +} + + + diff --git a/deeppavlov/configs/faq/tfidf_autofaq.json b/deeppavlov/configs/faq/tfidf_autofaq.json new file mode 100644 index 0000000000..746ddca995 --- /dev/null +++ b/deeppavlov/configs/faq/tfidf_autofaq.json @@ -0,0 +1,61 @@ +{ + "dataset_reader": { + "name": "faq_reader", + "x_col_name": "Question", + "y_col_name": "Answer", + "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" + }, + "dataset_iterator": { + "name": "data_learning_iterator" + }, + + "chainer": { + "in": "q", + "pipe": [ + { + "name": "ru_tokenizer", + "in": "q", + "lemmas": true, + "out": "q_token_lemmas" + }, + { + "id": "vectorizer", + "name": "tfidf_vectorizer", + "in": "q_token_lemmas", + "load_path": "vectorizer/tfidf_vectorizer_ruwiki.pkl", + "out": "q_vect" + }, + { + "name": "cos_sim_classifier", + "in": "q_vect", + "fit_on": ["q_vect", "y"], + "top_n": 1, + "save_path": "faq/tfidf_cos_sim_classifier.pkl", + "load_path": "faq/tfidf_cos_sim_classifier.pkl", + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + + "train": { + "validate_best": false, + "test_best": false + }, + "metadata": { + "download": [ + { + "url": "http://files.deeppavlov.ai/faq/school/tfidf_cos_sim_classifier.pkl", + "subdir": "faq" + }, + { + "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", + "subdir": "vectorizer" + } + ] + } + +} + + + diff --git a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json new file mode 100644 index 0000000000..eaa562b9e0 --- /dev/null +++ b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json @@ -0,0 +1,63 @@ +{ + "dataset_reader": { + "name": "faq_reader", + "x_col_name": "Question", + "y_col_name": "Answer", + "data_url": "http://files.deeppavlov.ai/faq/school/faq_school.csv" + }, + "dataset_iterator": { + "name": "data_learning_iterator" + }, + + "chainer": { + "in": "q", + "pipe": [ + { + "name": "ru_tokenizer", + "in": "q", + "lemmas": true, + "out": "q_token_lemmas" + }, + { + "id": "vectorizer", + "name": "tfidf_vectorizer", + "in": "q_token_lemmas", + "load_path": "vectorizer/tfidf_vectorizer_ruwiki.pkl", + "out": "q_vect" + }, + { + "name": "logreg_classifier", + "in": "q_vect", + "fit_on": ["q_vect", "y"], + "top_n": 1, + "c": 1000, + "penalty": "l2", + "save_path": "faq/tfidf_logreg_classifier.pkl", + "load_path": 
"faq/tfidf_logreg_classifier.pkl", + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + + "train": { + "validate_best": false, + "test_best": false + }, + "metadata": { + "download": [ + { + "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier.pkl", + "subdir": "faq" + }, + { + "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", + "subdir": "vectorizer" + } + ] + } + +} + + + diff --git a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json new file mode 100644 index 0000000000..98e3c6ad86 --- /dev/null +++ b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json @@ -0,0 +1,69 @@ +{ + "dataset_reader": { + "name": "faq_reader", + "x_col_name": "Question", + "y_col_name": "Answer", + "data_url": "http://files.deeppavlov.ai/faq/mipt/faq.csv" + }, + "dataset_iterator": { + "name": "data_learning_iterator" + }, + + "chainer": { + "in": "q", + "pipe": [ + { + "name": "stream_spacy_tokenizer", + "in": "q", + "lemmas": true, + "out": "q_token_lemmas" + }, + { + "id": "vectorizer", + "name": "tfidf_vectorizer", + "in": "q_token_lemmas", + "fit_on": ["q_token_lemmas"], + "save_path": "faq/tfidf_vectorizer_en_mipt_faq.pkl", + "load_path": "faq/tfidf_vectorizer_en_mipt_faq.pkl", + "out": "q_vect" + }, + { + "name": "logreg_classifier", + "in": "q_vect", + "fit_on": ["q_vect", "y"], + "top_n": 2, + "c": 1000, + "penalty": "l2", + "save_path": "faq/tfidf_logreg_classifier_en_mipt_faq.pkl", + "load_path": "faq/tfidf_logreg_classifier_en_mipt_faq.pkl", + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + + "train": { + "validate_best": false, + "test_best": false + }, + "metadata": { + "requirements": [ + "../dp_requirements/spacy.txt", + "../dp_requirements/en_core_web_sm.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/faq/mipt/tfidf_logreg_classifier_en_mipt_faq.pkl", + "subdir": "faq" + }, + { + "url": "http://files.deeppavlov.ai/faq/mipt/tfidf_vectorizer_en_mipt_faq.pkl", + "subdir": "faq" + } + ] + } + +} + + + diff --git a/deeppavlov/configs/go_bot/gobot_dstc2.json b/deeppavlov/configs/go_bot/gobot_dstc2.json index 7b85e6f71d..a53d6ecca5 100644 --- a/deeppavlov/configs/go_bot/gobot_dstc2.json +++ b/deeppavlov/configs/go_bot/gobot_dstc2.json @@ -43,7 +43,7 @@ "network_parameters": { "load_path": "gobot_dstc2/model", "save_path": "gobot_dstc2/model", - "learning_rate": 0.004, + "learning_rate": 0.005, "dropout_rate": 0.85, "l2_reg_coef": 7e-4, "hidden_size": 128, @@ -53,7 +53,7 @@ "config_path": "../deeppavlov/configs/ner/slotfill_dstc2.json" }, "intent_classifier": { - "config_path": "../deeppavlov/configs/intents/intents_dstc2_big.json" + "config_path": "../deeppavlov/configs/classifiers/intents_dstc2_big.json" }, "embedder": null, "bow_embedder": { @@ -95,7 +95,7 @@ }, "download": [ "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/gobot_dstc2_v4.tar.gz", + "http://files.deeppavlov.ai/deeppavlov_data/gobot_dstc2_v5.tar.gz", { "url": "http://files.deeppavlov.ai/datasets/dstc2_v2.tar.gz", "subdir": "dstc2" diff --git a/deeppavlov/configs/odqa/en_odqa_infer_wiki.json b/deeppavlov/configs/odqa/en_odqa_infer_wiki.json index b66b5d9a6d..6ec1493bf4 100644 --- a/deeppavlov/configs/odqa/en_odqa_infer_wiki.json +++ b/deeppavlov/configs/odqa/en_odqa_infer_wiki.json @@ -3,12 +3,8 @@ "in": [ "question_raw" ], - "in_y": [ - "ans_raw", - "ans_raw_start" - ], "out": [ - "ans_predicted" + "best_answer" ], "pipe": [ { @@ 
-29,18 +25,30 @@ "context_raw" ], "data_dir": "odqa", + "join_docs": false, "shuffle": false, "data_url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki.db" }, { - "config_path": "../deeppavlov/configs/squad/squad.json", + "name": "document_chunker", + "in": ["context_raw"], + "out": ["chunks"], + "flatten_result": true + }, + { + "name": "string_multiplier", + "in": ["question_raw", "chunks"], + "out":["questions"] + }, + { + "name": "logit_ranker", + "squad_model": {"config_path": "../deeppavlov/configs/squad/squad.json"}, "in": [ - "context_raw", - "question_raw" + "chunks", + "questions" ], "out": [ - "ans_predicted", - "ans_start_predicted" + "best_answer" ] } ] @@ -56,8 +64,7 @@ }, "download": [ "http://files.deeppavlov.ai/datasets/wikipedia/enwiki.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/en_odqa.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/squad_model_1.1.tar.gz" + "http://files.deeppavlov.ai/deeppavlov_data/en_odqa.tar.gz" ] } } \ No newline at end of file diff --git a/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json b/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json index df32bcf7e1..8e784a60ca 100644 --- a/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json +++ b/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json @@ -3,12 +3,8 @@ "in": [ "question_raw" ], - "in_y": [ - "ans_raw", - "ans_raw_start" - ], "out": [ - "ans_predicted" + "best_answer" ], "pipe": [ { @@ -29,18 +25,32 @@ "context_raw" ], "data_dir": "odqa", + "join_docs": false, "shuffle": false, "data_url": "http://files.deeppavlov.ai/datasets/wikipedia/ruwiki.db" }, { - "config_path": "../deeppavlov/configs/squad/squad_ru.json", + "name": "document_chunker", + "in": ["context_raw"], + "out": ["chunks"], + "flatten_result": true + }, + { + "name": "string_multiplier", + "in": ["question_raw", "chunks"], + "out":["questions"] + }, + { + "name": "logit_ranker", + "squad_model": { + "config_path": "../deeppavlov/configs/squad/squad_ru.json" + }, "in": [ - "context_raw", - "question_raw" + "chunks", + "questions" ], "out": [ - "ans_predicted", - "ans_start_predicted" + "best_answer" ] } ] diff --git a/deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json b/deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json index 79fa35ec0f..f0e9266f21 100644 --- a/deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json +++ b/deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json @@ -13,6 +13,23 @@ "y" ], "pipe": [ + { + "name": "hashing_tfidf_vectorizer", + "id": "vectorizer", + "fit_on_batch": [ + "x" + ], + "save_path": "odqa/enwiki_tfidf_matrix.npz", + "load_path": "odqa/enwiki_tfidf_matrix.npz", + "tokenizer": { + "name": "stream_spacy_tokenizer", + "lemmas": true, + "ngram_range": [ + 1, + 2 + ] + } + }, { "name": "tfidf_ranker", "top_n": 5, @@ -23,25 +40,7 @@ "y", "score" ], - "fit_on_batch": [ - "x" - ], - "vectorizer": { - "name": "hashing_tfidf_vectorizer", - "fit_on_batch": [ - "x" - ], - "save_path": "odqa/enwiki_tfidf_matrix.npz", - "load_path": "odqa/enwiki_tfidf_matrix.npz", - "tokenizer": { - "name": "stream_spacy_tokenizer", - "lemmas": true, - "ngram_range": [ - 1, - 2 - ] - } - } + "vectorizer": "#vectorizer" } ] }, diff --git a/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json b/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json index afd9626568..dd51150663 100644 --- a/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json +++ b/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json @@ -8,13 +8,14 @@ "shuffle": false }, "chainer": { - "in": ["x_text", "dialog_id", "kb_columns", "kb_items"], + "in": ["x_text", 
"dialog_id", "history", "kb_columns", "kb_items"], "in_y": ["y_text", "y_domain"], "out": ["prediction_text"], "pipe": [ { "id": "stream_spacy", "name": "stream_spacy_tokenizer", + "lowercase": true, "alphas_only": false, "in": ["x_text"], "out": ["x_tokens"] @@ -24,67 +25,91 @@ "in": ["y_text"], "out": ["y_tokens"] }, + { + "ref": "stream_spacy", + "in": ["history"], + "out": ["history_tokens"] + }, { "id": "kb", "name": "knowledge_base", "fit_on": ["dialog_id", "kb_columns", "kb_items"], "tokenizer": "#stream_spacy", + "in": ["dialog_id", "kb_columns", "kb_items"], + "out": ["kb_entries"], "save_path": "seq2seq_go_bot/kvret_kb.json", "load_path": "seq2seq_go_bot/kvret_kb.json" }, { "name": "knowledge_base_entity_normalizer", - "kb": "#kb", - "in": ["dialog_id", "y_tokens", "kb_columns", "kb_items"], + "in": ["y_tokens", "kb_entries"], "out": ["y_norm_tokens"] }, + { + "name": "knowledge_base_entity_normalizer", + "remove": true, + "in": ["y_tokens", "kb_entries"], + "out": ["y_without_entities_tokens"] + }, { "id": "src_token_vocab", - "fit_on": ["x_tokens"], - "name": "default_vocab", - "level": "token", - "special_tokens": [""], - "save_path": "vocabs/src_tokens.dict", - "load_path": "vocabs/src_tokens.dict" + "fit_on": ["x_tokens", "y_tokens"], + "name": "simple_vocab", + "min_freq": 2, + "default_token": "", + "special_tokens": ["", ""], + "save_path": "vocabs/kvret_src_tokens.dict", + "load_path": "vocabs/kvret_src_tokens.dict" }, { "id": "tgt_token_vocab", - "fit_on": ["y_norm_tokens"], - "debug": true, - "name": "default_vocab", - "level": "token", - "special_tokens": ["", ""], - "save_path": "vocabs/tgt_tokens.dict", - "load_path": "vocabs/tgt_tokens.dict" + "fit_on": ["y_without_entities_tokens"], + "name": "simple_vocab", + "default_token": "", + "special_tokens": ["", "", ""], + "save_path": "vocabs/kvret_tgt_tokens.dict", + "load_path": "vocabs/kvret_tgt_tokens.dict" + }, + { + "id": "token_embedder", + "name": "fasttext", + "dim": 300, + "save_path": "embeddings/wiki.en.bin", + "load_path": "embeddings/wiki.en.bin" }, { - "in": ["x_tokens", "dialog_id"], + "in": ["x_tokens", "history_tokens", "kb_entries"], "in_y": ["y_norm_tokens"], "out": ["prediction_norm_tokens"], "main": true, "name": "seq2seq_go_bot", "start_of_sequence_token": "", "end_of_sequence_token": "", - "network": { - "name": "seq2seq_go_bot_nn", + "embedder": "#token_embedder", + "network_parameters": { "load_path": "seq2seq_go_bot/model", "save_path": "seq2seq_go_bot/model", - "learning_rate": 0.0009, + "learning_rate": 0.0002, + "dropout_rate": 0.2, + "state_dropout_rate": 0.07, + "beam_width": 1, "target_start_of_sequence_index": "#tgt_token_vocab.__getitem__('')", "target_end_of_sequence_index": "#tgt_token_vocab.__getitem__('')", "source_vocab_size": "#src_token_vocab.__len__()", "target_vocab_size": "#tgt_token_vocab.__len__()", - "hidden_size": 256 + "hidden_size": 256, + "kb_attention_hidden_sizes": [64, 32] }, + "debug": true, + "debug": false, "source_vocab": "#src_token_vocab", "target_vocab": "#tgt_token_vocab", - "debug": false + "knowledge_base_keys": "#kb.primary_keys" }, { "name": "knowledge_base_entity_normalizer", - "kb": "#kb", "denormalize": true, - "in": ["dialog_id", "prediction_norm_tokens"], + "in": ["prediction_norm_tokens", "kb_entries"], "out": ["prediction_tokens"] }, { @@ -98,8 +123,8 @@ "epochs": 200, "batch_size": 16, - "metrics": ["per_item_bleu", "per_item_accuracy"], - "validation_patience": 20, + "metrics": ["google_bleu", "bleu", "accuracy"], + "validation_patience": 30, 
"val_every_n_epochs": 1, "log_every_n_batches": -1, @@ -118,7 +143,11 @@ }, "download": [ "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/seq2seq_go_bot.tar.gz" + "http://files.deeppavlov.ai/deeppavlov_data/seq2seq_go_bot_v2.tar.gz", + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", + "subdir": "embeddings" + } ] } } diff --git a/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json b/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json index 03224dc249..d454741f91 100644 --- a/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json +++ b/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json @@ -1,86 +1,115 @@ { + "dataset_reader": { + "name": "kvret_reader", + "data_path": "kvret" + }, + "dataset_iterator": { + "name": "kvret_dialog_iterator", + "shuffle": false + }, "chainer": { - "in": ["x_text"], + "in": ["x_text", "dialog_id"], "in_y": ["y_text", "y_domain"], "out": ["prediction_text"], "pipe": [ { "id": "stream_spacy", "name": "stream_spacy_tokenizer", + "lowercase": true, "alphas_only": false, "in": ["x_text"], "out": ["x_tokens"] }, { - "ref": "stream_spacy", - "in": ["y_text"], - "out": ["y_tokens"] + "id": "dialog_history", + "name": "dialog_state", + "in": ["dialog_id"], + "out": ["history_tokens"] }, { "id": "kb", "name": "knowledge_base", "tokenizer": "#stream_spacy", + "in": ["dialog_id"], + "out": ["kb_entries"], "save_path": "seq2seq_go_bot/kvret_kb.json", "load_path": "seq2seq_go_bot/kvret_kb.json" }, { "id": "src_token_vocab", - "name": "default_vocab", - "level": "token", - "special_tokens": [""], - "save_path": "vocabs/src_tokens.dict", - "load_path": "vocabs/src_tokens.dict" + "name": "simple_vocab", + "default_token": "", + "special_tokens": ["", ""], + "save_path": "vocabs/kvret_src_tokens.dict", + "load_path": "vocabs/kvret_src_tokens.dict" }, { "id": "tgt_token_vocab", - "name": "default_vocab", - "level": "token", - "special_tokens": ["", ""], - "save_path": "vocabs/tgt_tokens.dict", - "load_path": "vocabs/tgt_tokens.dict" + "name": "simple_vocab", + "default_token": "", + "special_tokens": ["", "", ""], + "save_path": "vocabs/kvret_tgt_tokens.dict", + "load_path": "vocabs/kvret_tgt_tokens.dict" }, { - "in": ["x_tokens"], - "in_y": ["y_tokens"], - "out": ["prediction_tokens"], + "id": "token_embedder", + "name": "fasttext", + "dim": 300, + "save_path": "embeddings/wiki.en.bin", + "load_path": "embeddings/wiki.en.bin" + }, + { + "in": ["x_tokens", "history_tokens", "kb_entries"], + "out": ["prediction_norm_tokens"], "main": true, "name": "seq2seq_go_bot", "start_of_sequence_token": "", "end_of_sequence_token": "", - "network": { - "name": "seq2seq_go_bot_nn", + "embedder": "#token_embedder", + "network_parameters": { "load_path": "seq2seq_go_bot/model", "save_path": "seq2seq_go_bot/model", - "learning_rate": 0.0009, + "learning_rate": 0.0002, + "dropout_rate": 0.2, + "state_dropout_rate": 0.07, + "beam_width": 1, "target_start_of_sequence_index": "#tgt_token_vocab.__getitem__('')", "target_end_of_sequence_index": "#tgt_token_vocab.__getitem__('')", "source_vocab_size": "#src_token_vocab.__len__()", "target_vocab_size": "#tgt_token_vocab.__len__()", - "hidden_size": 256 + "hidden_size": 256, + "kb_attention_hidden_sizes": [64, 32] }, + "debug": true, + "debug": false, "source_vocab": "#src_token_vocab", "target_vocab": "#tgt_token_vocab", - "debug": false + "knowledge_base_keys": "#kb.primary_keys" + }, + { + "name": "knowledge_base_entity_normalizer", + "denormalize": true, 
+ "in": ["prediction_norm_tokens", "kb_entries"], + "out": ["prediction_tokens"] }, { "ref": "stream_spacy", "in": ["prediction_tokens"], "out": ["prediction_text"] + }, + { + "ref": "dialog_history", + "in": ["dialog_id", "x_tokens"], + "out": ["history_tokens"] + }, + { + "ref": "dialog_history", + "in": ["dialog_id", "prediction_tokens"], + "out": ["history_tokens"] } ] }, - "train": { - "epochs": 200, - "batch_size": 16, - - "metrics": ["per_item_bleu", "per_item_accuracy"], - "validation_patience": 20, - "val_every_n_epochs": 1, - - "log_every_n_batches": -1, - "log_every_n_epochs": 1, - "show_examples": false - }, + "train": {}, "metadata": { "requirements": [ "../dp_requirements/tf.txt", @@ -93,7 +122,11 @@ }, "download": [ "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "http://files.deeppavlov.ai/deeppavlov_data/seq2seq_go_bot.tar.gz" + "http://files.deeppavlov.ai/deeppavlov_data/seq2seq_go_bot_v2.tar.gz", + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", + "subdir": "embeddings" + } ] } } diff --git a/deeppavlov/configs/squad/squad.json b/deeppavlov/configs/squad/squad.json index 30890e75a6..aaab86810d 100644 --- a/deeppavlov/configs/squad/squad.json +++ b/deeppavlov/configs/squad/squad.json @@ -81,7 +81,7 @@ "load_path": "squad_model/model", "in": ["context_tokens_idxs", "context_chars_idxs", "question_tokens_idxs", "question_chars_idxs"], "in_y": ["ans_start", "ans_end"], - "out": ["ans_start_predicted", "ans_end_predicted"] + "out": ["ans_start_predicted", "ans_end_predicted", "logits"] }, { "name": "squad_ans_postprocessor", @@ -90,7 +90,7 @@ "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] } ], - "out": ["ans_predicted", "ans_start_predicted"] + "out": ["ans_predicted", "ans_start_predicted", "logits"] }, "train": { "show_examples": false, diff --git a/deeppavlov/configs/squad/squad_ru.json b/deeppavlov/configs/squad/squad_ru.json index 8b35ee4558..be535df161 100644 --- a/deeppavlov/configs/squad/squad_ru.json +++ b/deeppavlov/configs/squad/squad_ru.json @@ -82,7 +82,7 @@ "load_path": "squad_model_ru/model", "in": ["context_tokens_idxs", "context_chars_idxs", "question_tokens_idxs", "question_chars_idxs"], "in_y": ["ans_start", "ans_end"], - "out": ["ans_start_predicted", "ans_end_predicted"] + "out": ["ans_start_predicted", "ans_end_predicted", "logits"] }, { "name": "squad_ans_postprocessor", @@ -91,7 +91,7 @@ "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] } ], - "out": ["ans_predicted", "ans_start_predicted"] + "out": ["ans_predicted", "ans_start_predicted", "logits"] }, "train": { "show_examples": false, diff --git a/deeppavlov/configs/vectorizer/tfidf_vectorizer.json b/deeppavlov/configs/vectorizer/tfidf_vectorizer.json new file mode 100644 index 0000000000..5e662fda3a --- /dev/null +++ b/deeppavlov/configs/vectorizer/tfidf_vectorizer.json @@ -0,0 +1,47 @@ +{ + "dataset_reader": { + "name": "line_reader", + "data_path": "wiki/wikitext_ru/ru.wiki.train.txt" + }, + "dataset_iterator": { + "name": "data_learning_iterator" + }, + + "chainer": { + "in": "q", + "pipe": [ + { + "name": "ru_tokenizer", + "in": "q", + "lemmas": true, + "out": "q_token_lemmas" + }, + { + "id": "vectorizer", + "name": "tfidf_vectorizer", + "in": "q_token_lemmas", + "fit_on": ["q_token_lemmas"], + "save_path": "vectorizer/tfidf_vectorizer_ruwiki.pkl", + "out": "q_vect" + } + ], + "out": "q_vect" + }, + + "train": { + "validate_best": false, + "test_best": false + }, + "metadata": { + "download": [ + { + "url": 
"http://files.deeppavlov.ai/datasets/wikitext_ru.zip", + "subdir": "wiki" + } + ] + } + +} + + + diff --git a/deeppavlov/core/commands/infer.py b/deeppavlov/core/commands/infer.py index 12b4375c4a..0f9c8ffe2a 100644 --- a/deeppavlov/core/commands/infer.py +++ b/deeppavlov/core/commands/infer.py @@ -1,20 +1,19 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pathlib import Path +from typing import Optional from deeppavlov.core.commands.utils import set_deeppavlov_root, import_packages from deeppavlov.core.common.chainer import Chainer @@ -28,7 +27,9 @@ log = get_logger(__name__) -def build_model_from_config(config: [str, Path, dict], mode='infer', load_trained=False, as_component=False): +def build_model_from_config(config: [str, Path, dict], mode: str = 'infer', load_trained: bool = False, + as_component: bool = False) -> Chainer: + """Build and return the model described in corresponding configuration file.""" if isinstance(config, (str, Path)): config = read_json(config) set_deeppavlov_root(config) @@ -58,14 +59,16 @@ def build_model_from_config(config: [str, Path, dict], mode='infer', load_traine return model -def build_agent_from_config(config_path: str): +def build_agent_from_config(config_path: str) -> Agent: + """Build and return the agent described in corresponding configuration file.""" config = read_json(config_path) skill_configs = config['skills'] commutator_config = config['commutator'] return Agent(skill_configs, commutator_config) -def interact_agent(config_path): +def interact_agent(config_path: str) -> None: + """Start interaction with the agent described in corresponding configuration file.""" a = build_agent_from_config(config_path) commutator = from_params(a.commutator_config) @@ -89,7 +92,8 @@ def interact_agent(config_path): log.debug("Current history: {}".format(a.history)) -def interact_model(config_path): +def interact_model(config_path: str) -> None: + """Start interaction with the model described in corresponding configuration file.""" config = read_json(config_path) model = build_model_from_config(config) @@ -109,7 +113,8 @@ def interact_model(config_path): print('>>', *pred) -def predict_on_stream(config_path, batch_size=1, file_path=None): +def predict_on_stream(config_path: str, batch_size: int = 1, file_path: Optional[str] = None) -> None: + """Make a prediction with the component described in 
corresponding configuration file.""" import sys import json from itertools import islice diff --git a/deeppavlov/core/commands/train.py b/deeppavlov/core/commands/train.py index 072efb2d75..931fb15ee0 100644 --- a/deeppavlov/core/commands/train.py +++ b/deeppavlov/core/commands/train.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import datetime import importlib @@ -41,10 +39,8 @@ log = get_logger(__name__) -def prettify_metrics(metrics, precision=4): - """ - Prettifies the dictionary of metrics - """ +def prettify_metrics(metrics: dict, precision: int = 4) -> OrderedDict: + """Prettifies the dictionary of metrics.""" prettified_metrics = OrderedDict() for key, value in metrics: value = round(value, precision) @@ -65,8 +61,8 @@ def _fit_batches(model: Estimator, iterator: DataFittingIterator, train_config) return model -def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingIterator]): - +def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingIterator]) -> Chainer: + """Fit and return the chainer described in corresponding configuration dictionary.""" chainer_config: dict = config['chainer'] chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) for component_config in chainer_config['pipe']: @@ -96,7 +92,8 @@ def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingI return chainer -def train_evaluate_model_from_config(config: [str, Path, dict], to_train=True, to_validate=True) -> None: +def train_evaluate_model_from_config(config: [str, Path, dict], to_train: bool = True, to_validate: bool = True) -> None: + """Make training and evaluation of the model described in corresponding configuration file.""" if isinstance(config, (str, Path)): config = read_json(config) set_deeppavlov_root(config) diff --git a/deeppavlov/core/commands/utils.py b/deeppavlov/core/commands/utils.py index d7550f6fae..9f5489e517 100644 --- a/deeppavlov/core/commands/utils.py +++ b/deeppavlov/core/commands/utils.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
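A minimal usage sketch for the entry points annotated above (illustrative, not part of this diff): the config paths are files touched in this PR, while the query text and the choice of load_trained are assumptions.

from deeppavlov.core.commands.train import train_evaluate_model_from_config
from deeppavlov.core.commands.infer import build_model_from_config

# fit estimators described in a config, e.g. the new tfidf_vectorizer config added above
train_evaluate_model_from_config('deeppavlov/configs/vectorizer/tfidf_vectorizer.json')

# build a Chainer for inference; load_trained=True loads previously trained weights
model = build_model_from_config('deeppavlov/configs/classifiers/intents_dstc2.json',
                                load_trained=True)
print(model(['i want some cheap food in the north of town']))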
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pathlib import Path @@ -20,11 +18,10 @@ from deeppavlov.core.common import paths +import os -def set_deeppavlov_root(config: dict): - """ - Make a serialization user dir. - """ +def set_deeppavlov_root(config: dict) -> None: + """Make a serialization user dir.""" try: deeppavlov_root = Path(config['deeppavlov_root']) except KeyError: @@ -36,22 +33,35 @@ def set_deeppavlov_root(config: dict): def get_deeppavlov_root() -> Path: + """Return DeepPavlov root directory.""" if not paths.deeppavlov_root: set_deeppavlov_root({}) return paths.deeppavlov_root def expand_path(path: Union[str, Path]) -> Path: + """Make path expansion.""" return get_deeppavlov_root() / Path(path).expanduser() +def make_all_dirs(path: Union[str, Path]) -> None: + directory = os.path.dirname(path) + if not os.path.exists(directory): + os.makedirs(directory) + +def is_file_exist(path: Union[str, Path]): + if path is None: + return False + + return os.path.exists(expand_path(path)) + + def is_empty(d: Path) -> bool: - """ - Check if directory is empty. - """ + """Check if directory is empty.""" return not bool(list(d.iterdir())) -def import_packages(packages: list): +def import_packages(packages: list) -> None: + """Simple function to import packages from list.""" for package in packages: __import__(package) diff --git a/deeppavlov/core/common/attributes.py b/deeppavlov/core/common/attributes.py index a96f28a112..c46836432f 100644 --- a/deeppavlov/core/common/attributes.py +++ b/deeppavlov/core/common/attributes.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from functools import wraps diff --git a/deeppavlov/core/common/chainer.py b/deeppavlov/core/common/chainer.py index 147fdf6121..b7a06367e6 100644 --- a/deeppavlov/core/common/chainer.py +++ b/deeppavlov/core/common/chainer.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import inspect from deeppavlov.core.common.errors import ConfigError @@ -21,8 +20,12 @@ class Chainer(Component): - def __init__(self, in_x: [str, list]=None, out_params: [str, list]=None, in_y: [str, list]=None, - *args, as_component: bool=False, **kwargs): + """ + Builds an agent/component pipeline from heterogeneous components (Rule-based/ML/DL). It allows to train + and infer models in a pipeline as a whole. + """ + def __init__(self, in_x: [str, list] = None, out_params: [str, list] = None, in_y: [str, list] = None, + *args, as_component: bool = False, **kwargs): self.pipe = [] self.train_pipe = [] if isinstance(in_x, str): diff --git a/deeppavlov/core/common/check_gpu.py b/deeppavlov/core/common/check_gpu.py index 7ba07bdcc8..1150d6456e 100644 --- a/deeppavlov/core/common/check_gpu.py +++ b/deeppavlov/core/common/check_gpu.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import tensorflow as tf from tensorflow.python.client import device_lib diff --git a/deeppavlov/core/common/errors.py b/deeppavlov/core/common/errors.py index b54c92563f..d5d4ce23e2 100644 --- a/deeppavlov/core/common/errors.py +++ b/deeppavlov/core/common/errors.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging @@ -20,10 +18,7 @@ class ConfigError(Exception): - """ - Any configuration error. - """ - + """Any configuration error.""" def __init__(self, message): super(ConfigError, self).__init__() self.message = message diff --git a/deeppavlov/core/common/file.py b/deeppavlov/core/common/file.py index 536f36ef83..7a0c0ac7e8 100644 --- a/deeppavlov/core/common/file.py +++ b/deeppavlov/core/common/file.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. import json import pickle diff --git a/deeppavlov/core/common/log.py b/deeppavlov/core/common/log.py index 08f61f225b..ce45b4361b 100644 --- a/deeppavlov/core/common/log.py +++ b/deeppavlov/core/common/log.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pathlib import Path import json diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json index f2dfc39a54..9cc5fa6323 100644 --- a/deeppavlov/core/common/metrics_registry.json +++ b/deeppavlov/core/common/metrics_registry.json @@ -7,6 +7,7 @@ "classification_log_loss": "deeppavlov.metrics.log_loss:classification_log_loss", "classification_roc_auc": "deeppavlov.metrics.roc_auc_score:classification_roc_auc_score", "exact_match": "deeppavlov.metrics.squad_metrics:exact_match", + "google_bleu": "deeppavlov.metrics.bleu:google_bleu", "loss": "deeppavlov.models.ranking.metrics:triplet_loss", "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1", "per_item_accuracy": "deeppavlov.metrics.accuracy:per_item_accuracy", diff --git a/deeppavlov/core/common/metrics_registry.py b/deeppavlov/core/common/metrics_registry.py index add6be0c42..2a0a341fd6 100644 --- a/deeppavlov/core/common/metrics_registry.py +++ b/deeppavlov/core/common/metrics_registry.py @@ -1,6 +1,7 @@ import importlib from pathlib import Path import json +from typing import List, Callable, Any from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.log import get_logger @@ -16,7 +17,8 @@ _REGISTRY = {} -def fn_from_str(name: str) -> type: +def fn_from_str(name: str) -> Callable[..., Any]: + """Returns a function object with the name given in string.""" try: module_name, fn_name = name.split(':') except ValueError: @@ -26,7 +28,8 @@ def fn_from_str(name: str) -> type: return getattr(importlib.import_module(module_name), fn_name) -def register_metric(metric_name): +def register_metric(metric_name: str) -> Callable[..., Any]: + """Decorator for metric registration.""" def decorate(fn): fn_name = fn.__module__ + ':' + fn.__name__ if metric_name in _REGISTRY and _REGISTRY[metric_name] != fn_name: @@ -37,7 +40,8 @@ def decorate(fn): return decorate -def get_metrics_by_names(names: list): +def get_metrics_by_names(names: list) -> List[Callable[..., 
Any]]: + """Returns a list of metric callables with corresponding names.""" not_found = [name for name in names if name not in _REGISTRY] if not_found: raise ConfigError('Names {} are not registered as metrics'.format(not_found)) diff --git a/deeppavlov/core/common/params.py b/deeppavlov/core/common/params.py index 6b3e2e3ce1..3c73f2d98a 100644 --- a/deeppavlov/core/common/params.py +++ b/deeppavlov/core/common/params.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import importlib import inspect from typing import Dict @@ -57,7 +56,8 @@ def _init_param(param, mode): return param -def from_params(params: Dict, mode='infer', **kwargs) -> Component: +def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Component: + """Builds and returns the Component from corresponding dictionary of parameters.""" # what is passed in json: config_params = {k: _resolve(v) for k, v in params.items()} diff --git a/deeppavlov/core/common/params_search.py b/deeppavlov/core/common/params_search.py new file mode 100644 index 0000000000..43ef1dc394 --- /dev/null +++ b/deeppavlov/core/common/params_search.py @@ -0,0 +1,250 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from copy import deepcopy +import random +from typing import List, Generator, Tuple, Any + +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.log import get_logger + + +log = get_logger(__name__) + + +@register('params_search') +class ParamsSearch: + """ + Class determine the main operations for parameters search + like finding all changing parameters. 
+ + Args: + prefix: prefix to determine special keys like `PREFIX_range`, `PREFIX_bool`, `PREFIX_choice` + seed: random seed for initialization + **kwargs: basic config with parameters + + Attributes: + basic_config: dictionary with initial evolutionary config + prefix: prefix to determine special keys like `PREFIX_range`, `PREFIX_bool`, `PREFIX_choice` + paths_to_params: list of lists of keys and/or integers (for list) + with relative paths to searched parameters + n_params: number of searched parameters + eps: EPS value + paths_to_fiton_dicts: list of lists of keys and/or integers (for list) + with relative paths to dictionaries that can be "fitted on" + n_fiton_dicts: number of dictionaries that can be "fitted on" + """ + + def __init__(self, + prefix="search", + seed: int = None, + **kwargs): + """ + Initialize evolution with random population + """ + + self.basic_config = deepcopy(kwargs) + self.prefix = prefix + + self.paths_to_params = [] + for search_type in [prefix + "_range", prefix + "_choice", prefix + "_bool"]: + for path_ in self.find_model_path(self.basic_config, search_type): + self.paths_to_params.append(path_) + + self.n_params = len(self.paths_to_params) + + self.eps = 1e-6 + + self.paths_to_fiton_dicts = [] + for path_ in self.find_model_path(self.basic_config, "fit_on"): + self.paths_to_fiton_dicts.append(path_) + self.n_fiton_dicts = len(self.paths_to_fiton_dicts) + + if seed is None: + pass + else: + np.random.seed(seed) + random.seed(seed) + + def find_model_path(self, config: dict, key_model: str, path: list = []) -> Generator: + """ + Find paths to all dictionaries in config that contain key 'key_model' + + Args: + config: dictionary + key_model: key of sub-dictionary to be found + path: list of keys and/or integers (for list) with relative path (needed for recursion) + + Returns: + path in config -- list of keys (strings and integers) + """ + config_pointer = config + if type(config_pointer) is dict and key_model in config_pointer.keys(): + yield path + else: + if type(config_pointer) is dict: + for key in list(config_pointer.keys()): + for path_ in self.find_model_path(config_pointer[key], key_model, path + [key]): + yield path_ + elif type(config_pointer) is list: + for i in range(len(config_pointer)): + for path_ in self.find_model_path(config_pointer[i], key_model, path + [i]): + yield path_ + + @staticmethod + def insert_value_or_dict_into_config(config: dict, path: list, + value: [int, float, str, bool, list, dict, np.ndarray]) -> dict: + """ + Insert value to dictionary determined by path[:-1] in field with key path[-1] + + Args: + config: dictionary + path: list of keys and/or integers (for list) + value: value to be inserted + + Returns: + config with inserted value + """ + config_copy = deepcopy(config) + config_pointer = config_copy + for el in path[:-1]: + if type(config_pointer) is dict: + config_pointer = config_pointer.setdefault(el, {}) + elif type(config_pointer) is list: + config_pointer = config_pointer[el] + else: + pass + config_pointer[path[-1]] = value + return config_copy + + @staticmethod + def get_value_from_config(config: dict, path: list) -> Any: + """ + Return value of config element determined by path + + Args: + config: dictionary + path: list of keys and/or integers (for list) + + Returns: + value + """ + config_copy = deepcopy(config) + config_pointer = config_copy + for el in path[:-1]: + if type(config_pointer) is dict: + config_pointer = config_pointer.setdefault(el, {}) + elif type(config_pointer) is list: + config_pointer = 
config_pointer[el] + else: + pass + return config_pointer[path[-1]] + + def initialize_params_in_config(self, basic_config: dict, paths: List[list]) -> dict: + """ + Randomly initialize all the changable parameters in config + + Args: + basic_config: config where changable parameters are dictionaries with keys + `evolve_range`, `evolve_bool`, `evolve_choice` + paths: list of paths to changable parameters + + Returns: + config + """ + config = deepcopy(basic_config) + for path_ in paths: + param_name = path_[-1] + value = self.get_value_from_config(basic_config, path_) + if type(value) is dict: + if (value.get(self.prefix + "_choice") or + value.get(self.prefix + "_range") or + value.get(self.prefix + "_bool")): + config = self.insert_value_or_dict_into_config( + config, path_, + self.sample_params(**{param_name: deepcopy(value)})[param_name]) + + return config + + def sample_params(self, **params) -> dict: + """ + Sample parameters according to the given possible values + + Args: + **params: dictionary like {"param_0": {"evolve_range": [0, 10]}, + "param_1": {"evolve_range": [0, 10], "discrete": true}, + "param_2": {"evolve_range": [0, 1], "scale": "log"}, + "param_3": {"evolve_bool": true}, + "param_4": {"evolve_choice": [0, 1, 2, 3]}} + + Returns: + dictionary with randomly sampled parameters + """ + if not params: + return {} + else: + params_copy = deepcopy(params) + params_sample = dict() + for param, param_val in params_copy.items(): + if isinstance(param_val, dict): + if self.prefix + '_bool' in param_val and param_val[self.prefix + '_bool']: + sample = bool(random.choice([True, False])) + elif self.prefix + '_range' in param_val: + sample = self._sample_from_ranges(param_val) + elif self.prefix + '_choice' in param_val: + sample = random.choice(param_val[self.prefix + '_choice']) + else: + sample = param_val + params_sample[param] = sample + else: + params_sample[param] = params_copy[param] + return params_sample + + def _sample_from_ranges(self, opts: dict) -> [int, float]: + """ + Sample parameters from ranges + + Args: + opts: dictionary {"evolve_range": [0, 10]} or \ + {"evolve_range": [0, 10], "discrete": true} or \ + {"evolve_range": [0, 1], "scale": "log"} + + Returns: + random parameter value from range + """ + from_ = opts[self.prefix + '_range'][0] + to_ = opts[self.prefix + '_range'][1] + if opts.get('scale', None) == 'log': + sample = self._sample_log(from_, to_) + else: + sample = np.random.uniform(from_, to_) + if opts.get('discrete', False): + sample = int(np.round(sample)) + return sample + + @staticmethod + def _sample_log(from_: float = 0., to_: float = 1.) -> float: + """ + Sample parameters from ranges with log scale + + Args: + from_: lower boundary of values + to_: upper boundary of values + + Returns: + random parameters value from range with log scale + """ + sample = np.exp(np.random.uniform(np.log(from_), np.log(to_))) + return float(sample) diff --git a/deeppavlov/core/common/prints.py b/deeppavlov/core/common/prints.py index 254d7edf14..1ce3f52e33 100644 --- a/deeppavlov/core/common/prints.py +++ b/deeppavlov/core/common/prints.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
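A small sketch of how the ParamsSearch component defined above can be used (the config dictionary and parameter names are hypothetical): with the default prefix "search", tunable values are marked with "search_range", "search_choice" and "search_bool" keys.

from deeppavlov.core.common.params_search import ParamsSearch

basic_config = {
    "model": {
        "dropout_rate": {"search_range": [0.1, 0.9]},
        "dense_size": {"search_range": [50, 200], "discrete": True},
        "model_name": {"search_choice": ["cnn_model", "dcnn_model"]}
    }
}

search = ParamsSearch(prefix="search", seed=42, **basic_config)
# paths_to_params lists the path to every marked parameter, e.g. ["model", "dropout_rate"]
sampled_config = search.initialize_params_in_config(search.basic_config, search.paths_to_params)
print(sampled_config["model"]["dropout_rate"])  # a float sampled uniformly from [0.1, 0.9]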
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import sys diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index bcb2bd38fc..a5daeabfb6 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -8,20 +8,24 @@ "char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter", "char_vocab": "deeppavlov.core.data.simple_vocab:CharacterVocab", "conll2003_reader": "deeppavlov.dataset_readers.conll2003_reader:Conll2003DatasetReader", + "cos_sim_classifier": "deeppavlov.models.classifiers.cos_sim_classifier:CosineSimilarityClassifier", "data_fitting_iterator": "deeppavlov.core.data.data_fitting_iterator:DataFittingIterator", "data_learning_iterator": "deeppavlov.core.data.data_learning_iterator:DataLearningIterator", "default_vocab": "deeppavlov.core.data.vocab:DefaultVocabulary", "dialog_db_result_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDBResultDatasetIterator", "dialog_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDatasetIterator", + "dialog_state": "deeppavlov.models.seq2seq_go_bot.dialog_state:DialogState", "dialog_vocab": "deeppavlov.core.data.simple_vocab:DialogVocab", "dictionary_vectorizer": "deeppavlov.models.vectorizers.word_vectorizer:DictionaryVectorizer", "dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor", + "document_chunker": "deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker", "dstc2_intents_iterator": "deeppavlov.dataset_iterators.dstc2_intents_iterator:Dstc2IntentsDatasetIterator", "dstc2_ner_iterator": "deeppavlov.dataset_iterators.dstc2_ner_iterator:Dstc2NerDatasetIterator", "dstc2_reader": "deeppavlov.dataset_readers.dstc2_reader:DSTC2DatasetReader", "dstc_slotfilling": "deeppavlov.models.slotfill.slotfill:DstcSlotFillingNetwork", "elmo": "deeppavlov.models.embedders.elmo_embedder:ELMoEmbedder", "emb_mat_assembler": "deeppavlov.models.preprocessors.assemble_embeddings_matrix:EmbeddingsMatrixAssembler", + "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader", "fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder", "featurized_tracker": "deeppavlov.models.go_bot.tracker:FeaturizedTracker", "glove": "deeppavlov.models.embedders.glove_embedder:GloVeEmbedder", @@ -36,28 +40,32 @@ "kvret_dialog_iterator": "deeppavlov.dataset_iterators.kvret_dialog_iterator:KvretDialogDatasetIterator", "kvret_reader": "deeppavlov.dataset_readers.kvret_reader:KvretDatasetReader", 
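A short illustration (not part of the diff) of how entries in this registry file are resolved at run time through deeppavlov.core.common.registry; the names used are ones registered in this PR.

from deeppavlov.core.common.registry import get_model, list_models

FaqDatasetReader = get_model('faq_reader')   # resolves the registry entry above to its class
print('document_chunker' in list_models())   # True once this registry.json is installed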
"lazy_tokenizer": "deeppavlov.models.tokenizers.lazy_tokenizer:LazyTokenizer", + "line_reader": "deeppavlov.dataset_readers.line_reader:LineReader", + "logit_ranker": "deeppavlov.models.ranking.logit_ranker:LogitRanker", + "logreg_classifier": "deeppavlov.models.classifiers.logreg_classifier:LogregClassifier", "lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:LowercasePreprocessor", "mask": "deeppavlov.models.preprocessors.mask:Mask", "morpho_tagger": "deeppavlov.models.morpho_tagger.tagger:MorphoTaggerWrapper", "morphotagger_dataset": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator", "morphotagger_dataset_reader": "deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader", - "negative_sber_faq_reader": "deeppavlov.dataset_readers.negative_sber_faq_reader:SberFAQReader", "ner": "deeppavlov.models.ner.network:NerNetwork", "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer", "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer", "one_hotter": "deeppavlov.models.preprocessors.one_hotter:OneHotter", "ontonotes_reader": "deeppavlov.dataset_readers.ontonotes_reader:OntonotesReader", "params_evolution": "deeppavlov.models.evolution.evolution_param_generator:ParamsEvolution", + "params_search": "deeppavlov.core.common.params_search:ParamsSearch", "pymorphy_russian_lemmatizer": "deeppavlov.models.preprocessors.russian_lemmatizer:PymorphyRussianLemmatizer", "pymorphy_vectorizer": "deeppavlov.models.vectorizers.word_vectorizer:PymorphyVectorizer", - "random_emb_mat": "deeppavlov.models.preprocessors.assemble_embeddings_matrix:RandomEmbeddingsMatrix", + "random_emb_mat": "deeppavlov.models.preprocessors.random_embeddings_matrix:RandomEmbeddingsMatrix", "ranking_iterator": "deeppavlov.dataset_iterators.ranking_iterator:RankingIterator", "ranking_model": "deeppavlov.models.ranking.ranking_model:RankingModel", "ru_sent_tokenizer": "deeppavlov.models.tokenizers.ru_sent_tokenizer:RuSentTokenizer", "ru_tokenizer": "deeppavlov.models.tokenizers.ru_tokenizer:RussianTokenizer", "russian_words_vocab": "deeppavlov.vocabs.typos:RussianWordsVocab", "sanitizer": "deeppavlov.models.preprocessors.sanitizer:Sanitizer", - "sber_faq_reader": "deeppavlov.dataset_readers.sber_faq_reader:SberFAQReader", + "sentence2vector_w2v_avg": "deeppavlov.models.vectorizers.sentence2vector_w2v_avg:SentenceAvgW2vVectorizer", + "sentence2vector_w2v_tfidf": "deeppavlov.models.vectorizers.sentence2vector_w2v_tfidf:SentenceW2vVectorizerTfidfWeights", "seq2seq_go_bot": "deeppavlov.models.seq2seq_go_bot.bot:Seq2SeqGoalOrientedBot", "seq2seq_go_bot_nn": "deeppavlov.models.seq2seq_go_bot.network:Seq2SeqGoalOrientedBotNetwork", "simple_vocab": "deeppavlov.core.data.simple_vocab:SimpleVocabulary", @@ -77,14 +85,15 @@ "static_dictionary": "deeppavlov.vocabs.typos:StaticDictionary", "str_lower": "deeppavlov.models.preprocessors.str_lower:StrLower", "stream_spacy_tokenizer": "deeppavlov.models.tokenizers.spacy_tokenizer:StreamSpacyTokenizer", + "string_multiplier": "deeppavlov.models.preprocessors.odqa_preprocessors:StringMultiplier", "tag_output_prettifier": "deeppavlov.models.morpho_tagger.common:TagOutputPrettifier", "tfidf_ranker": "deeppavlov.models.ranking.tfidf_ranker:TfidfRanker", + "tfidf_vectorizer": "deeppavlov.models.vectorizers.tfidf_vectorizer:TfIdfVectorizer", "top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector", "typos_custom_reader": 
"deeppavlov.dataset_readers.typos_reader:TyposCustom", "typos_iterator": "deeppavlov.dataset_iterators.typos_iterator:TyposDatasetIterator", "typos_kartaslov_reader": "deeppavlov.dataset_readers.typos_reader:TyposKartaslov", "typos_wikipedia_reader": "deeppavlov.dataset_readers.typos_reader:TyposWikipedia", - "ubuntu_v2_reader": "deeppavlov.dataset_readers.ubuntu_v2_reader:UbuntuV2Reader", "wiki_sqlite_vocab": "deeppavlov.vocabs.wiki_sqlite:WikiSQLiteVocab", "wikitionary_100K_vocab": "deeppavlov.vocabs.typos:Wiki100KDictionary" -} \ No newline at end of file +} diff --git a/deeppavlov/core/common/registry.py b/deeppavlov/core/common/registry.py index 296d439553..631af21c96 100644 --- a/deeppavlov/core/common/registry.py +++ b/deeppavlov/core/common/registry.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import importlib import json from pathlib import Path @@ -32,6 +31,7 @@ def cls_from_str(name: str) -> type: + """Returns a class object with the name given as a string.""" try: module_name, cls_name = name.split(':') except ValueError: @@ -42,8 +42,10 @@ def cls_from_str(name: str) -> type: def register(name: str = None) -> type: - """Register model. If name is not passed, the model class name is converted to snake-case.""" - + """ + Register classes that could be initialized from JSON configuration file. + If name is not passed, the class name is converted to snake-case. 
+ """ def decorate(model_cls: type, reg_name: str = None) -> type: model_name = reg_name or short_name(model_cls) global _REGISTRY @@ -57,14 +59,17 @@ def decorate(model_cls: type, reg_name: str = None) -> type: def short_name(cls: type) -> str: + """Returns just a class name (without package and module specification).""" return cls.__name__.split('.')[-1] def get_model(name: str) -> type: + """Returns a registered class object with the name given in the string.""" if name not in _REGISTRY: raise ConfigError("Model {} is not registered.".format(name)) return cls_from_str(_REGISTRY[name]) def list_models() -> list: + """Returns a list of names of registered classes.""" return list(_REGISTRY) diff --git a/deeppavlov/core/data/data_fitting_iterator.py b/deeppavlov/core/data/data_fitting_iterator.py index 83e125dd96..d194e30a17 100644 --- a/deeppavlov/core/data/data_fitting_iterator.py +++ b/deeppavlov/core/data/data_fitting_iterator.py @@ -31,10 +31,10 @@ class DataFittingIterator: data: list of documents doc_ids: provided document ids seed: random seed for data shuffling - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching Attributes: - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching random: instance of :class:`Random` initialized with a seed data: list of documents doc_ids: provided by a user ids or generated automatically ids @@ -76,7 +76,7 @@ def gen_batches(self, batch_size: int, shuffle: bool = None) \ Args: batch_size: a number of samples in a single batch - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching Yields: generated tuple of documents and their ids diff --git a/deeppavlov/core/data/data_learning_iterator.py b/deeppavlov/core/data/data_learning_iterator.py index 131d6b3455..ff4085b852 100644 --- a/deeppavlov/core/data/data_learning_iterator.py +++ b/deeppavlov/core/data/data_learning_iterator.py @@ -25,10 +25,10 @@ class DataLearningIterator: Args: data: list of (x, y) pairs for every data type in ``'train'``, ``'valid'`` and ``'test'`` seed: random seed for data shuffling - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching Attributes: - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching random: instance of ``Random`` initialized with a seed """ def split(self, *args, **kwargs): diff --git a/deeppavlov/core/data/simple_vocab.py b/deeppavlov/core/data/simple_vocab.py index b8c8372775..ae56f29acc 100644 --- a/deeppavlov/core/data/simple_vocab.py +++ b/deeppavlov/core/data/simple_vocab.py @@ -32,14 +32,16 @@ class SimpleVocabulary(Estimator): """Implements simple vocabulary.""" def __init__(self, special_tokens=tuple(), + default_token=None, max_tokens=2**30, - min_freq=1, + min_freq=0, pad_with_zeros=False, unk_token=None, *args, **kwargs): super().__init__(**kwargs) self.special_tokens = special_tokens + self.default_token = default_token self._max_tokens = max_tokens self._min_freq = min_freq self._pad_with_zeros = pad_with_zeros @@ -48,9 +50,11 @@ def __init__(self, if self.load_path: self.load() - def fit(self, tokens): + def fit(self, *args): self.reset() - self.freqs = Counter(chain(*tokens)) + tokens = chain(*args) + # filter(None, <>) -- to filter empty tokens + self.freqs = Counter(filter(None, chain(*tokens))) for special_token in self.special_tokens: self._t2i[special_token] = self.count self._i2t.append(special_token) @@ -145,6 
+149,11 @@ def is_str_batch(self, batch): return False def reset(self): + # default index is the position of default_token + if self.default_token is not None: + default_ind = self.special_tokens.index(self.default_token) + else: + default_ind = 0 self.freqs = None unk_index = 0 if self.unk_token in self.special_tokens: @@ -153,6 +162,12 @@ def reset(self): self._i2t = [] self.count = 0 + @staticmethod + def is_empty(batch): + non_empty = [item for item in batch if len(item) > 0] + self._i2t = [] + self.count = 0 + @staticmethod def is_empty(batch): non_empty = [item for item in batch if len(item) > 0] @@ -162,7 +177,8 @@ def is_empty(batch): @register('char_vocab') class CharacterVocab(SimpleVocabulary): """Implements character vocabulary.""" - def fit(self, tokens): + def fit(self, *args): + tokens = chain(*args) chars = chain(*tokens) super().fit(chars) @@ -181,7 +197,8 @@ def __call__(self, batch, **kwargs): @register('dialog_vocab') class DialogVocab(SimpleVocabulary): """Implements dialog vocabulary.""" - def fit(self, utterances): + def fit(self, *args): + utterances = chain(*args) tokens = chain(*utterances) super().fit(tokens) diff --git a/deeppavlov/core/layers/tf_attention_mechanisms.py b/deeppavlov/core/layers/tf_attention_mechanisms.py index b22a3f1782..8ff1af33c7 100644 --- a/deeppavlov/core/layers/tf_attention_mechanisms.py +++ b/deeppavlov/core/layers/tf_attention_mechanisms.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import tensorflow as tf from tensorflow.contrib.layers import xavier_initializer as xav diff --git a/deeppavlov/core/layers/tf_csoftmax_attention.py b/deeppavlov/core/layers/tf_csoftmax_attention.py index 1f132864b1..f73bcc3079 100644 --- a/deeppavlov/core/layers/tf_csoftmax_attention.py +++ b/deeppavlov/core/layers/tf_csoftmax_attention.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import tensorflow as tf @@ -45,11 +44,10 @@ def loop(q_, mask, mass_, found_): p = q_list[1] * (1.0 - mass_) / tf.reduce_sum(q_list[1]) p_new = tf.dynamic_stitch(condition_indices, [q_list[0], p]) - # verification of the condition and modification of masks - less_mask = tf.cast(tf.less(u, p_new), tf.int32) # 0 when u bigger than p, 1 when u less than p + # condition verification and mask modification + less_mask = tf.cast(tf.less(u, p_new), tf.int32) # 0 when u is bigger than p, 1 when u is less than p condition_indices = tf.dynamic_partition(tf.range(tf.shape(p_new)[0]), less_mask, - 2) # 0 when u bigger - # than p, 1 when u less than p + 2) # 0 when u is bigger than p, 1 when u is less than p split_p_new = tf.dynamic_partition(p_new, less_mask, 2) split_u = tf.dynamic_partition(u, less_mask, 2) @@ -73,6 +71,7 @@ def loop(q_, mask, mass_, found_): return [csoft, mask_] + def csoftmax(tensor, inv_cumulative_att): """ It is a implementation of the constrained softmax (csoftmax). Based on the paper: @@ -90,6 +89,7 @@ def csoftmax(tensor, inv_cumulative_att): cs, _ = tf.map_fn(csoftmax_for_slice, merge_tensor, dtype=[tf.float32, tf.float32]) # [bs, L] return cs + def attention_gen_step(hidden_for_sketch, hidden_for_attn_alignment, sketch, key, cum_att): """ It is a implementation one step of block of the Luong et al. attention mechanism with general score and the constrained softmax (csoftmax). Based on the papers: @@ -138,6 +138,7 @@ def attention_gen_step(hidden_for_sketch, hidden_for_attn_alignment, sketch, key aligned_hidden_sketch = tf.squeeze(tf.matmul(t_hidden_for_attn_alignment,r_att),-1) return next_sketch, att, aligned_hidden_sketch + def attention_gen_block(hidden_for_sketch, hidden_for_attn_alignment, key, attention_depth): """ It is a implementation of the Luong et al. attention mechanism with general score and the constrained softmax (csoftmax). Based on the papers: @@ -217,6 +218,7 @@ def attention_bah_step(hidden_for_sketch, hidden_for_attn_alignment, sketch, cum aligned_hidden_sketch = tf.squeeze(tf.matmul(t_hidden_for_attn_alignment,r_att),-1) return next_sketch, att, aligned_hidden_sketch + def attention_bah_block(hidden_for_sketch, hidden_for_attn_alignment, attention_depth): """ It is a implementation of the Bahdanau et al. attention mechanism with concat score and the constrained softmax (csoftmax). 
Based on the papers: diff --git a/deeppavlov/core/layers/tf_layers.py b/deeppavlov/core/layers/tf_layers.py index cf8b9e1e11..0ef45fd21a 100644 --- a/deeppavlov/core/layers/tf_layers.py +++ b/deeppavlov/core/layers/tf_layers.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import tensorflow as tf from tensorflow.contrib.layers import xavier_initializer import numpy as np @@ -525,7 +524,7 @@ def cudnn_gru(units, n_hidden, n_layers=1, trainable_initial_states=False, Returns: h - all hidden states along T dimension, tf.Tensor with dimensionality [B x T x F] - h_last - last hidden state, tf.Tensor with dimensionality [B x (n_layers * H)] + h_last - last hidden state, tf.Tensor with dimensionality [B x H] """ with tf.variable_scope(name, reuse=reuse): gru = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=n_layers, @@ -541,7 +540,7 @@ def cudnn_gru(units, n_hidden, n_layers=1, trainable_initial_states=False, h, h_last = gru(tf.transpose(units, (1, 0, 2)), (initial_h, )) h = tf.transpose(h, (1, 0, 2)) - h_last = tf.reshape(h_last, shape=(-1, n_hidden)) + h_last = tf.squeeze(h_last, axis=0)[-1] # extract last layer state # Extract last states if they are provided if seq_lengths is not None: @@ -574,7 +573,7 @@ def cudnn_compatible_gru(units, n_hidden, n_layers=1, trainable_initial_states=F Returns: h - all hidden states along T dimension, tf.Tensor with dimensionality [B x T x F] - h_last - last hidden state, tf.Tensor with dimensionality [B x (n_layers * H)] + h_last - last hidden state, tf.Tensor with dimensionality [B x H] """ with tf.variable_scope(name, reuse=reuse): @@ -594,9 +593,10 @@ def single_cell(): return tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(n_hidden) h, h_last = tf.nn.dynamic_rnn(cell=cell, inputs=units, time_major=True, initial_state=tuple(tf.unstack(initial_h, axis=0))) - h = tf.transpose(h, (1, 0, 2)) - h_last = tf.reshape(tf.stack(h_last, axis=0), shape=(-1, n_hidden)) + + h_last = h_last[-1] # h_last is tuple: n_layers x batch_size x n_hidden + # Extract last states if they are provided if seq_lengths is not None: indices = tf.stack([tf.range(tf.shape(h)[0]), seq_lengths], axis=1) @@ -647,9 +647,9 @@ def cudnn_lstm(units, n_hidden, n_layers=1, trainable_initial_states=None, seq_l Returns: h - all hidden states along T dimension, tf.Tensor with dimensionality [B x T x F] - h_last - last hidden state, tf.Tensor with dimensionality [B x (n_layers * H)] + h_last - last hidden state, tf.Tensor with dimensionality [B x H] where H - number of hidden units - c_last - last cell state, tf.Tensor with dimensionality [B x (n_layers * H)] + c_last - last cell state, tf.Tensor with dimensionality [B x H] where H - number of hidden units """ with tf.variable_scope(name, reuse=reuse): @@ -668,8 +668,8 @@ def cudnn_lstm(units, n_hidden, n_layers=1, trainable_initial_states=None, seq_l h, (h_last, c_last) = lstm(tf.transpose(units, (1, 0, 2)), (initial_h, initial_c)) h = tf.transpose(h, (1, 0, 2)) - h_last = tf.reshape(h_last, shape=(-1, n_hidden)) - c_last = tf.reshape(c_last, shape=(-1, n_hidden)) + h_last = h_last[-1] + c_last = c_last[-1] # Extract last states if they are provided if seq_lengths is not None: @@ -681,7 +681,6 @@ def cudnn_lstm(units, n_hidden, n_layers=1, trainable_initial_states=None, seq_l def cudnn_compatible_lstm(units, n_hidden, n_layers=1, trainable_initial_states=None, seq_lengths=None, initial_h=None, initial_c=None, name='cudnn_lstm', reuse=False): - """ CuDNN Compatible LSTM implementation. 
It should be used to load models saved with CudnnLSTMCell to run on CPU. @@ -706,9 +705,9 @@ def cudnn_compatible_lstm(units, n_hidden, n_layers=1, trainable_initial_states= Returns: h - all hidden states along T dimension, tf.Tensor with dimensionality [B x T x F] - h_last - last hidden state, tf.Tensor with dimensionality [B x (n_layers * H)] + h_last - last hidden state, tf.Tensor with dimensionality [B x H] where H - number of hidden units - c_last - last cell state, tf.Tensor with dimensionality [B x (n_layers * H)] + c_last - last cell state, tf.Tensor with dimensionality [B x H] where H - number of hidden units """ @@ -736,12 +735,9 @@ def single_cell(): return tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(n_hidden) h, state = tf.nn.dynamic_rnn(cell=cell, inputs=units, time_major=True, initial_state=init) - h_last = tf.stack([state[i].h for i in range(n_layers)], axis=0) - h_last = tf.reshape(h_last, shape=(-1, n_hidden)) - c_last = tf.stack([state[i].c for i in range(n_layers)], axis=0) - c_last = tf.reshape(c_last, shape=(-1, n_hidden)) - h = tf.transpose(h, (1, 0, 2)) + h_last = state[-1].h + c_last = state[-1].c # Extract last states if they are provided if seq_lengths is not None: @@ -831,26 +827,26 @@ def cudnn_bi_lstm(units, reuse=False): """ Fast CuDNN Bi-LSTM implementation - Args: - units: tf.Tensor with dimensions [B x T x F], where - B - batch size - T - number of tokens - F - features - n_hidden: dimensionality of hidden state - seq_lengths: number of tokens in each sample in the batch - n_layers: number of layers - trainable_initial_states: whether to create a special trainable variable - to initialize the hidden states of the network or use just zeros - name: name of the variable scope to use - reuse:whether to reuse already initialized variable + Args: + units: tf.Tensor with dimensions [B x T x F], where + B - batch size + T - number of tokens + F - features + n_hidden: dimensionality of hidden state + seq_lengths: number of tokens in each sample in the batch + n_layers: number of layers + trainable_initial_states: whether to create a special trainable variable + to initialize the hidden states of the network or use just zeros + name: name of the variable scope to use + reuse:whether to reuse already initialized variable - Returns: - h - all hidden states along T dimension, - tf.Tensor with dimensionality [B x T x F] - h_last - last hidden state, tf.Tensor with dimensionality [B x H * 2] - where H - number of hidden units - c_last - last cell state, tf.Tensor with dimensionality [B x H * 2] - where H - number of hidden units + Returns: + h - all hidden states along T dimension, + tf.Tensor with dimensionality [B x T x F] + h_last - last hidden state, tf.Tensor with dimensionality [B x H * 2] + where H - number of hidden units + c_last - last cell state, tf.Tensor with dimensionality [B x H * 2] + where H - number of hidden units """ with tf.variable_scope(name, reuse=reuse): if seq_lengths is None: diff --git a/deeppavlov/core/models/component.py b/deeppavlov/core/models/component.py index 5d5c74a4cc..f07c19cda9 100644 --- a/deeppavlov/core/models/component.py +++ b/deeppavlov/core/models/component.py @@ -16,7 +16,7 @@ class Component(metaclass=ABCMeta): - + """Abstract class for all callables that could be used in Chainer's pipe.""" @abstractmethod def __call__(self, *args, **kwargs): pass diff --git a/deeppavlov/core/models/estimator.py b/deeppavlov/core/models/estimator.py index bdd697d97f..9cccd305d5 100644 --- a/deeppavlov/core/models/estimator.py +++ 
b/deeppavlov/core/models/estimator.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" from abc import abstractmethod from .component import Component @@ -20,7 +19,7 @@ class Estimator(Component, Serializable): - + """Abstract class for components that could be fitted on the data as a whole.""" @abstractmethod def fit(self, *args, **kwargs): pass diff --git a/deeppavlov/core/models/keras_model.py b/deeppavlov/core/models/keras_model.py index 5564b591cd..3e03b3203e 100644 --- a/deeppavlov/core/models/keras_model.py +++ b/deeppavlov/core/models/keras_model.py @@ -1,18 +1,16 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from abc import abstractmethod from pathlib import Path @@ -21,17 +19,15 @@ import tensorflow as tf import keras.metrics import keras.optimizers -from typing import Dict from overrides import overrides -from .tf_backend import TfModelMeta from keras import backend as K from keras.models import Model -from keras.layers import Dense, Input from deeppavlov.core.models.nn_model import NNModel from deeppavlov.core.common.file import save_json, read_json from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.log import get_logger +from .tf_backend import TfModelMeta log = get_logger(__name__) @@ -39,7 +35,8 @@ class KerasModel(NNModel, metaclass=TfModelMeta): """ - Class builds keras model with tensorflow backend + Builds Keras model with TensorFlow backend. + Attributes: opt: dictionary with all model parameters model: keras model itself @@ -50,7 +47,7 @@ class KerasModel(NNModel, metaclass=TfModelMeta): optimizer: keras.optimizers instance """ - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: """ Initialize model using parameters from opt Args: @@ -86,7 +83,7 @@ def _config_session(self): def init_model_from_scratch(self, model_name: str, optimizer_name: str, loss_name: str, - lear_rate: float = 0.01, lear_rate_decay: float = 0.): + lear_rate: float = 0.01, lear_rate_decay: float = 0.) -> Model: """ Initialize model from scratch with given params Args: @@ -131,7 +128,7 @@ def init_model_from_scratch(self, model_name: str, optimizer_name: str, @overrides def load(self, model_name: str, optimizer_name: str, loss_name: str, - lear_rate: float = 0.01, lear_rate_decay: float = 0.): + lear_rate: float = 0.01, lear_rate_decay: float = 0.) -> Model: """ Initialize model from saved params and weights Args: @@ -197,7 +194,7 @@ def load(self, model_name: str, optimizer_name: str, loss_name: str, return self.init_model_from_scratch(model_name, optimizer_name, loss_name, lear_rate, lear_rate_decay) @overrides - def save(self, fname: str = None): + def save(self, fname: str = None) -> None: """ Save the model parameters into <>_opt.json (or <>_opt.json) and model weights into <>.h5 (or <>.h5) @@ -230,7 +227,6 @@ def save(self, fname: str = None): if self.opt.get("save_path") != self.opt.get("load_path"): self.opt["load_path"] = str(self.opt["save_path"]) save_json(self.opt, opt_path) - return True @abstractmethod def reset(self): diff --git a/deeppavlov/core/models/nn_model.py b/deeppavlov/core/models/nn_model.py index d97485675f..94bd4cf4ee 100644 --- a/deeppavlov/core/models/nn_model.py +++ b/deeppavlov/core/models/nn_model.py @@ -1,18 +1,17 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" from abc import abstractmethod from .component import Component @@ -20,7 +19,7 @@ class NNModel(Component, Serializable): - + """Abstract class for deep learning components.""" @abstractmethod def train_on_batch(self, x: list, y: list): pass diff --git a/deeppavlov/core/models/serializable.py b/deeppavlov/core/models/serializable.py index 9cac4cae75..ebce827898 100644 --- a/deeppavlov/core/models/serializable.py +++ b/deeppavlov/core/models/serializable.py @@ -1,40 +1,35 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 +from abc import ABCMeta, abstractmethod +from typing import Union, Optional +from pathlib import Path -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.log import get_logger -from abc import ABCMeta, abstractmethod -""" -:class:`deeppavlov.models.model.Serializable` is an abstract base class that expresses the interface -for all models that can serialize data to a path. -""" log = get_logger(__name__) class Serializable(metaclass=ABCMeta): """ - :attr: `_ser_dir` is a name of a serialization dir, can be default or set in json config - :attr: `_ser_file` is a name of a serialization file (usually binary model file), - can be default or set in json config - :attr: `ser_path` is a path to model serialization dir or file (it depends on the model type). - It is always an empty string and is ignored if it is not set in json config. + :class:`deeppavlov.models.model.serializable.Serializable` is an abstract base class that expresses the interface + for all models that can serialize data to a path. 
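The one-line docstrings added above spell out how the abstract base classes divide responsibilities. A rough, library-independent sketch of that layering (names and signatures follow the docstrings in this diff; the real classes live in `deeppavlov.core.models`):

```python
# Library-independent sketch of the interface layering described by the new docstrings.
from abc import ABCMeta, abstractmethod

class Component(metaclass=ABCMeta):            # any callable element of the Chainer pipe
    @abstractmethod
    def __call__(self, *args, **kwargs): ...

class Serializable(metaclass=ABCMeta):         # knows where to save/load itself
    def __init__(self, save_path=None, load_path=None, mode='infer', **kwargs):
        self.save_path, self.load_path = save_path, load_path

class Estimator(Component, Serializable):      # fitted on the data as a whole
    @abstractmethod
    def fit(self, *args, **kwargs): ...

class NNModel(Component, Serializable):        # trained batch by batch
    @abstractmethod
    def train_on_batch(self, x: list, y: list): ...
```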
""" - - def __init__(self, save_path, load_path=None, mode='infer', *args, **kwargs): + def __init__(self, save_path: Union[str, Path], load_path: Optional[Union[str, Path]] = None, mode: str = 'infer', + *args, **kwargs) -> None: if save_path: self.save_path = expand_path(save_path) diff --git a/deeppavlov/core/models/tf_backend.py b/deeppavlov/core/models/tf_backend.py index 6479248d3e..9897a75e33 100644 --- a/deeppavlov/core/models/tf_backend.py +++ b/deeppavlov/core/models/tf_backend.py @@ -1,24 +1,22 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import tensorflow as tf from abc import ABCMeta from functools import wraps from six import with_metaclass +import tensorflow as tf def _graph_wrap(func, graph): @@ -30,6 +28,7 @@ def _wrapped(*args, **kwargs): class TfModelMeta(with_metaclass(type, ABCMeta)): + """Metaclass that helps all child classes to have their own graph.""" def __call__(cls, *args, **kwargs): from .keras_model import KerasModel if issubclass(cls, KerasModel): diff --git a/deeppavlov/core/models/tf_model.py b/deeppavlov/core/models/tf_model.py index f4ccf3b389..094b1fe794 100644 --- a/deeppavlov/core/models/tf_model.py +++ b/deeppavlov/core/models/tf_model.py @@ -1,21 +1,21 @@ -""" -Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -""" from collections import defaultdict -import numpy as np +from typing import Iterable, Optional +import numpy as np import tensorflow as tf from tensorflow.python.ops import variables @@ -23,17 +23,19 @@ from deeppavlov.core.common.log import get_logger from .tf_backend import TfModelMeta + log = get_logger(__name__) class TFModel(NNModel, metaclass=TfModelMeta): - def __init__(self, *args, **kwargs): + """Parent class for all components using TensorFlow.""" + def __init__(self, *args, **kwargs) -> None: if not hasattr(self, 'sess'): - raise RuntimeError('Your tensorflow model {} must' + raise RuntimeError('Your TensorFlow model {} must' ' have sess attribute!'.format(self.__class__.__name__)) super().__init__(*args, **kwargs) - def load(self, exclude_scopes=['Optimizer']): + def load(self, exclude_scopes: Optional[Iterable] = ('Optimizer',)) -> None: """Load model parameters from self.load_path""" path = str(self.load_path.resolve()) # Check presence of the model files @@ -44,7 +46,7 @@ def load(self, exclude_scopes=['Optimizer']): saver = tf.train.Saver(var_list) saver.restore(self.sess, path) - def save(self, exclude_scopes=['Optimizer']): + def save(self, exclude_scopes: Optional[Iterable] = ('Optimizer',)) -> None: """Save model parameters to self.save_path""" path = str(self.save_path.resolve()) log.info('[saving model to {}]'.format(path)) @@ -52,12 +54,14 @@ def save(self, exclude_scopes=['Optimizer']): saver = tf.train.Saver(var_list) saver.save(self.sess, path) - def _get_saveable_variables(self, exclude_scopes=[]): + @staticmethod + def _get_saveable_variables(exclude_scopes=tuple()): all_vars = variables._all_saveable_objects() vars_to_train = [var for var in all_vars if all(sc not in var.name for sc in exclude_scopes)] return vars_to_train - def _get_trainable_variables(self, exclude_scopes=[]): + @staticmethod + def _get_trainable_variables(exclude_scopes=tuple()): all_vars = tf.global_variables() vars_to_train = [var for var in all_vars if all(sc not in var.name for sc in exclude_scopes)] return vars_to_train @@ -101,11 +105,16 @@ def get_train_op(self, # For batch norm it is necessary to update running averages extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(extra_update_ops): + + def clip_if_not_none(grad): + if grad is not None: + return tf.clip_by_norm(grad, clip_norm) + opt = optimizer(learning_rate) grads_and_vars = opt.compute_gradients(loss, var_list=variables_to_train) if clip_norm is not None: - grads_and_vars = [(tf.clip_by_norm(grad, clip_norm), var) - for grad, var in grads_and_vars] # if grad is not None + grads_and_vars = [(clip_if_not_none(grad), var) + for grad, var in grads_and_vars] train_op = opt.apply_gradients(grads_and_vars) return train_op @@ -122,7 +131,7 @@ def print_number_of_parameters(): block_name = var.name.split('/')[0] number_of_parameters = np.prod(var.get_shape().as_list()) blocks[block_name] += number_of_parameters - for block_name in blocks: - log.info(block_name, blocks[block_name]) + for block_name, cnt in blocks.items(): + log.info("{} - {}.".format(block_name, cnt)) total_num_parameters = np.sum(list(blocks.values())) log.info('Total number of parameters equal {}'.format(total_num_parameters)) diff --git a/deeppavlov/dataset_iterators/kvret_dialog_iterator.py b/deeppavlov/dataset_iterators/kvret_dialog_iterator.py index e321a4ea33..2943719948 100644 --- 
a/deeppavlov/dataset_iterators/kvret_dialog_iterator.py +++ b/deeppavlov/dataset_iterators/kvret_dialog_iterator.py @@ -38,15 +38,19 @@ def _dialogs(data): task = None for x, y in data: if x.get('episode_done'): - history = [] - dialogs.append((([], [], [], []), ([], []))) + #history = [] + history = "" + dialogs.append((([], [], [], [], []), ([], []))) task = y['task'] - history.append((x, y)) - x['history'] = history[:-1] + #history.append((x, y)) + history = history + ' ' + x['text'] + ' ' + y['text'] + #x['history'] = history[:-1] + x['history'] = history[:-len(x['text'])-len(y['text'])-2] dialogs[-1][0][0].append(x['text']) dialogs[-1][0][1].append(x['dialog_id']) - dialogs[-1][0][2].append(x.get('kb_columns', None)) - dialogs[-1][0][3].append(x.get('kb_items', None)) + dialogs[-1][0][2].append(x['history']) + dialogs[-1][0][3].append(x.get('kb_columns', None)) + dialogs[-1][0][4].append(x.get('kb_items', None)) dialogs[-1][1][0].append(y['text']) dialogs[-1][1][1].append(task) return dialogs @@ -57,11 +61,17 @@ def _utterances(data): history = [] for x, y in data: if x.get('episode_done'): - history = [] - history.append((x, y)) - x['history'] = history[:-1] - x_tuple = (x['text'], x['dialog_id'], x['kb_columns'], x['kb_items']) - y_tuple = (y['text'], y['task']) + # x_hist, y_hist = [], [] + history = "" + # x_hist.append(x['text']) + # y_hist.append(y['text']) + history = history + ' ' + x['text'] + ' ' + y['text'] + # x['x_hist'] = x_hist[:-1] + # x['y_hist'] = y_hist[:-1] + x['history'] = history[:-len(x['text'])-len(y['text'])-2] + x_tuple = (x['text'], x['dialog_id'], x['history'], + x['kb_columns'], x['kb_items']) + y_tuple = (y['text'], y['task']['intent']) utters.append((x_tuple, y_tuple)) return utters diff --git a/deeppavlov/dataset_iterators/morphotagger_iterator.py b/deeppavlov/dataset_iterators/morphotagger_iterator.py index de10b5f55a..c2efa77181 100644 --- a/deeppavlov/dataset_iterators/morphotagger_iterator.py +++ b/deeppavlov/dataset_iterators/morphotagger_iterator.py @@ -14,35 +14,26 @@ import random import numpy as np +from typing import Tuple, List, Dict, Any, Iterator from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.models.preprocessors.capitalization import process_word -def process_word(word, to_lower=False, append_case=None): - if all(x.isupper() for x in word) and len(word) > 1: - uppercase = "" - elif word[0].isupper(): - uppercase = "" - else: - uppercase = None - if to_lower: - word = word.lower() - if word.isdigit(): - answer = [""] - elif word.startswith("http://") or word.startswith("www."): - answer = [""] - else: - answer = list(word) - if to_lower and uppercase is not None: - if append_case == "first": - answer = [uppercase] + answer - elif append_case == "last": - answer = answer + [uppercase] - return tuple(answer) +def preprocess_data(data: List[Tuple[List[str], List[str]]], to_lower: bool = True, + append_case: str = "first") -> List[Tuple[List[Tuple[str]], List[str]]]: + """Processes all words in data using + :func:`~deeppavlov.dataset_iterators.morphotagger_iterator.process_word`. 
+ Args: + data: a list of pairs (words, tags), each pair corresponds to a single sentence + to_lower: whether to lowercase + append_case: whether to add case mark -def preprocess_data(data, to_lower=True, append_case="first"): + Returns: + a list of preprocessed sentences + """ new_data = [] for words, tags in data: new_words = [process_word(word, to_lower=to_lower, append_case=append_case) @@ -58,14 +49,21 @@ class MorphoTaggerDatasetIterator(DataLearningIterator): """ Iterates over data for Morphological Tagging. A subclass of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`. + + Args: + seed: random seed for data shuffling + shuffle: whether to shuffle data during batching + validation_split: the fraction of validation data + (is used only if there is no `valid` subset in `data`) """ - def __init__(self, data, seed=None, shuffle=True, - validation_split=0.2, bucket=True): - self.bucket = bucket + def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None, + shuffle: bool = True, validation_split: float = 0.2): self.validation_split = validation_split super().__init__(data, seed, shuffle) def split(self): + """Splits the `train` part to `train` and `valid`, if no `valid` part is specified. + """ if len(self.valid) == 0: if self.shuffle: random.shuffle(self.train) @@ -74,7 +72,19 @@ def split(self): return def gen_batches(self, batch_size: int, data_type: str = 'train', - shuffle: bool = None, return_indexes: bool = False): + shuffle: bool = None, return_indexes: bool = False) -> Iterator[tuple]: + """Generate batches of inputs and expected output to train neural networks + + Args: + batch_size: number of samples in batch + data_type: can be either 'train', 'test', or 'valid' + shuffle: whether to shuffle dataset before batching + return_indexes: whether to return indexes of batch elements in initial dataset + + Yields: + a tuple of a batch of inputs and a batch of expected outputs. + If `return_indexes` is True, also yields indexes of batch elements. + """ if shuffle is None: shuffle = self.shuffle data = self.data[data_type] diff --git a/deeppavlov/dataset_iterators/sqlite_iterator.py b/deeppavlov/dataset_iterators/sqlite_iterator.py index 0a77f3c9ff..9ea66be46b 100644 --- a/deeppavlov/dataset_iterators/sqlite_iterator.py +++ b/deeppavlov/dataset_iterators/sqlite_iterator.py @@ -39,7 +39,7 @@ class SQLiteDataIterator(DataFittingIterator): data_dir: a directory where to save downloaded DB to data_url: an URL where to download a DB from batch_size: a number of samples in a single batch - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching seed: random seed for data shuffling Attributes: @@ -48,7 +48,7 @@ class SQLiteDataIterator(DataFittingIterator): doc_ids: DB document ids doc2index: a dictionary of document indices and their titles batch_size: a number of samples in a single batch - shuffle: whether to shuffle data when batching + shuffle: whether to shuffle data during batching random: an instance of :class:`Random` class. """ diff --git a/deeppavlov/dataset_readers/faq_reader.py b/deeppavlov/dataset_readers/faq_reader.py new file mode 100644 index 0000000000..826fb40883 --- /dev/null +++ b/deeppavlov/dataset_readers/faq_reader.py @@ -0,0 +1,59 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from pandas import read_csv + +from deeppavlov.core.data.dataset_reader import DatasetReader +from deeppavlov.core.common.registry import register + + +@register('faq_reader') +class FaqDatasetReader(DatasetReader): + """Reader for FAQ dataset""" + + def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict: + """ + Read FAQ dataset from specified csv file or remote url + + Parameters: + data_path: path to csv file of FAQ + data_url: url to csv file of FAQ + x_col_name: name of Question column in csv file + y_col_name: name of Answer column in csv file + + Returns: + A dictionary containing training, validation and test parts of the dataset obtainable via + ``train``, ``valid`` and ``test`` keys. + """ + + if data_url is not None: + data = read_csv(data_url) + elif data_path is not None: + data = read_csv(data_path) + else: + raise ValueError("Please specify data_path or data_url parameter") + + x = data[x_col_name] + y = data[y_col_name] + + train_xy_tuples = [(x[i].strip(), y[i].strip()) for i in range(len(x))] + + dataset = dict() + dataset["train"] = train_xy_tuples + dataset["valid"] = [] + dataset["test"] = [] + + return dataset diff --git a/deeppavlov/dataset_readers/line_reader.py b/deeppavlov/dataset_readers/line_reader.py new file mode 100644 index 0000000000..166195ec09 --- /dev/null +++ b/deeppavlov/dataset_readers/line_reader.py @@ -0,0 +1,43 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from deeppavlov.core.data.dataset_reader import DatasetReader +from deeppavlov.core.common.registry import register + + +@register('line_reader') +class LineReader(DatasetReader): + """Read txt file by lines""" + + def read(self, data_path: str = None, *args, **kwargs) -> Dict: + """Read lines from txt file + + Args: + data_path: path to txt file + + Returns: + A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys.
+ """ + + with open(data_path) as f: + content = f.readlines() + + dataset = dict() + dataset["train"] = [(line, ) for line in content] + dataset["valid"] = [] + dataset["test"] = [] + + return dataset diff --git a/deeppavlov/dataset_readers/morphotagging_dataset_reader.py b/deeppavlov/dataset_readers/morphotagging_dataset_reader.py index 642657da07..a4140dcd24 100644 --- a/deeppavlov/dataset_readers/morphotagging_dataset_reader.py +++ b/deeppavlov/dataset_readers/morphotagging_dataset_reader.py @@ -32,7 +32,7 @@ def get_language(filepath: str) -> str: """ return filepath.split("-")[0] -def read_infile(infile: str, word_column: int = WORD_COLUMN, pos_column: int = POS_COLUMN, +def read_infile(infile: Union[Path, str], word_column: int = WORD_COLUMN, pos_column: int = POS_COLUMN, tag_column: int = TAG_COLUMN, max_sents: int = -1, read_only_words: bool = False) -> List[Tuple[List, Union[List, None]]]: """Reads input file in CONLL-U format diff --git a/deeppavlov/evolve.py b/deeppavlov/evolve.py index 548f7cd324..04bb88f1e4 100644 --- a/deeppavlov/evolve.py +++ b/deeppavlov/evolve.py @@ -294,10 +294,22 @@ def results_to_table(population, evolution, considered_metrics, result_file, res if len(reports) == 2 and "valid" in reports[0].keys() and "test" in reports[1].keys(): val_results = reports[0]["valid"]["metrics"] test_results = reports[1]["test"]["metrics"] + elif len(reports) == 2 and "valid" in reports[0].keys() and "valid" in reports[1].keys(): + val_results = reports[1]["valid"]["metrics"] + elif len(reports) == 2 and "test" in reports[0].keys() and "test" in reports[1].keys(): + val_results = reports[1]["test"]["metrics"] + elif len(reports) == 2 and "train" in reports[0].keys() and "valid" in reports[1].keys(): + val_results = reports[1]["valid"]["metrics"] + elif len(reports) == 2 and "train" in reports[0].keys() and "test" in reports[1].keys(): + val_results = reports[1]["test"]["metrics"] + elif len(reports) == 2 and "train" in reports[0].keys() and "train" in reports[1].keys(): + val_results = reports[1]["train"]["metrics"] elif len(reports) == 1 and "valid" in reports[0].keys(): val_results = reports[0]["valid"]["metrics"] elif len(reports) == 1 and "test" in reports[0].keys(): test_results = reports[0]["test"]["metrics"] + else: + raise ConfigError("Can not proceed output files: didn't find valid and/or test results") result_table_dict = {} for el in result_table_columns: diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py index f61c94677d..4308ee30f6 100644 --- a/deeppavlov/metrics/accuracy.py +++ b/deeppavlov/metrics/accuracy.py @@ -55,7 +55,7 @@ def sets_accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) - @register_metric('classification_accuracy') -def classification_accuracy(y_true: List[list], y_predicted: List[Tuple[np.ndarray, dict]]) -> float: +def classification_accuracy(y_true: List[list], y_predicted: List[Tuple[list, dict]]) -> float: """ Calculate accuracy in terms of sets coincidence for special case of predictions \ (from classification KerasIntentModel) diff --git a/deeppavlov/metrics/bleu.py b/deeppavlov/metrics/bleu.py index d4d7e49333..985b924982 100644 --- a/deeppavlov/metrics/bleu.py +++ b/deeppavlov/metrics/bleu.py @@ -13,35 +13,40 @@ # limitations under the License. 
import itertools -from nltk.translate.bleu_score import sentence_bleu +from nltk.translate.bleu_score import corpus_bleu +from deeppavlov.metrics.google_bleu import compute_bleu from deeppavlov.core.common.metrics_registry import register_metric @register_metric('bleu') def bleu(y_true, y_predicted): - examples_len = len(y_true) - bleu_list = (sentence_bleu([y2.lower().split()], y1.lower().split())\ - for y1, y2 in zip(y_true, y_predicted)) - return sum(bleu_list) / examples_len if examples_len else 0. + if isinstance(y_true[0], (tuple, list)): + y_true = (y[0] for y in y_true) + return corpus_bleu([[y_t.lower().split()] for y_t in y_true], + [y_p.lower().split() for y_p in y_predicted]) + + +@register_metric('google_bleu') +def google_bleu(y_true, y_predicted): + if isinstance(y_true[0], (tuple, list)): + y_true = (y[0] for y in y_true) + return compute_bleu(([y_t.lower().split()] for y_t in y_true), + (y_p.lower().split() for y_p in y_predicted))[0] + @register_metric('per_item_bleu') def per_item_bleu(y_true, y_predicted): - if isinstance(y_true[0], (tuple, list)): - y_true = map(lambda y: y[0], y_true) - y_true = list(itertools.chain(*y_true)) y_predicted = itertools.chain(*y_predicted) - examples_len = len(y_true) - bleu_list = (sentence_bleu([y2.lower().split()], y1.lower().split())\ - for y1, y2 in zip(y_true, y_predicted)) - return sum(bleu_list) / examples_len if examples_len else 0. + if isinstance(y_true[0][0], (tuple, list)): + y_true = (y[0] for y_list in y_true for y in y_list) + return corpus_bleu([[y_t.lower().split()] for y_t in y_true], + [y_p.lower().split() for y_p in y_predicted]) + @register_metric('per_item_dialog_bleu') def per_item_dialog_bleu(y_true, y_predicted): - y_true = [y['text'] for dialog in y_true for y in dialog] - y_predicted = itertools.chain(*y_predicted) - examples_len = len(y_true) - bleu_list = (sentence_bleu([y2.lower().split()], y1.lower().split())\ - for y1, y2 in zip(y_true, y_predicted)) - return sum(bleu_list) / examples_len if examples_len else 0. + y_true = (y['text'] for dialog in y_true for y in dialog) + return corpus_bleu([[y_t.lower().split()] for y_t in y_true], + [y_p.lower().split() for y_p in y_predicted]) diff --git a/deeppavlov/metrics/fmeasure_classification.py b/deeppavlov/metrics/fmeasure_classification.py index 28cd574518..e50d0ce56a 100644 --- a/deeppavlov/metrics/fmeasure_classification.py +++ b/deeppavlov/metrics/fmeasure_classification.py @@ -23,7 +23,7 @@ @register_metric('classification_f1') -def classification_fmeasure(y_true: List[list], y_predicted: List[Tuple[np.ndarray, dict]], average="macro") -> float: +def classification_fmeasure(y_true: List[list], y_predicted: List[Tuple[list, dict]], average="macro") -> float: """ Calculate F1-measure macro @@ -47,7 +47,7 @@ def classification_fmeasure(y_true: List[list], y_predicted: List[Tuple[np.ndarr @register_metric('classification_f1_weighted') -def classification_fmeasure_weighted(y_true: List[list], y_predicted: List[Tuple[np.ndarray, dict]], +def classification_fmeasure_weighted(y_true: List[list], y_predicted: List[Tuple[list, dict]], average="weighted") -> float: """ Calculate F1-measure weighted diff --git a/deeppavlov/metrics/google_bleu.py b/deeppavlov/metrics/google_bleu.py new file mode 100644 index 0000000000..9fe6466ad4 --- /dev/null +++ b/deeppavlov/metrics/google_bleu.py @@ -0,0 +1,112 @@ +# Copyright 2017 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Python implementation of BLEU and smooth-BLEU. + +This module provides a Python implementation of BLEU and smooth-BLEU. +Smooth BLEU is computed following the method outlined in the paper: +Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic +evaluation metrics for machine translation. COLING 2004. +""" + +import collections +import math + + +def _get_ngrams(segment, max_order): + """Extracts all n-grams upto a given maximum order from an input segment. + + Args: + segment: text segment from which n-grams will be extracted. + max_order: maximum length in tokens of the n-grams returned by this + methods. + + Returns: + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in range(1, max_order + 1): + for i in range(0, len(segment) - order + 1): + ngram = tuple(segment[i:i+order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, max_order=4, + smooth=False): + """Computes BLEU score of translated segments against one or more references. + + Args: + reference_corpus: list of lists of references for each translation. Each + reference should be tokenized into a list of tokens. + translation_corpus: list of translations to score. Each translation + should be tokenized into a list of tokens. + max_order: Maximum n-gram order to use when computing BLEU score. + smooth: Whether or not to apply Lin et al. 2004 smoothing. + + Returns: + 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram + precisions and brevity penalty. + """ + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + reference_length = 0 + translation_length = 0 + for (references, translation) in zip(reference_corpus, + translation_corpus): + reference_length += min(len(r) for r in references) + translation_length += len(translation) + + merged_ref_ngram_counts = collections.Counter() + for reference in references: + merged_ref_ngram_counts |= _get_ngrams(reference, max_order) + translation_ngram_counts = _get_ngrams(translation, max_order) + overlap = translation_ngram_counts & merged_ref_ngram_counts + for ngram in overlap: + matches_by_order[len(ngram)-1] += overlap[ngram] + for order in range(1, max_order+1): + possible_matches = len(translation) - order + 1 + if possible_matches > 0: + possible_matches_by_order[order-1] += possible_matches + + precisions = [0] * max_order + for i in range(0, max_order): + if smooth: + precisions[i] = ((matches_by_order[i] + 1.) / + (possible_matches_by_order[i] + 1.)) + else: + if possible_matches_by_order[i] > 0: + precisions[i] = (float(matches_by_order[i]) / + possible_matches_by_order[i]) + else: + precisions[i] = 0.0 + + if min(precisions) > 0: + p_log_sum = sum((1. 
/ max_order) * math.log(p) for p in precisions) + geo_mean = math.exp(p_log_sum) + else: + geo_mean = 0 + + ratio = float(translation_length) / reference_length + + if ratio > 1.0: + bp = 1. + else: + bp = math.exp(1 - 1. / ratio) + + bleu = geo_mean * bp + + return (bleu, precisions, bp, ratio, translation_length, reference_length) diff --git a/deeppavlov/metrics/log_loss.py b/deeppavlov/metrics/log_loss.py index 40e08f1248..804851d5c5 100644 --- a/deeppavlov/metrics/log_loss.py +++ b/deeppavlov/metrics/log_loss.py @@ -23,7 +23,7 @@ @register_metric('classification_log_loss') -def classification_log_loss(y_true: List[list], y_predicted: List[Tuple[np.ndarray, dict]]) -> float: +def classification_log_loss(y_true: List[list], y_predicted: List[Tuple[list, dict]]) -> float: """ Calculate log loss for classification module diff --git a/deeppavlov/metrics/roc_auc_score.py b/deeppavlov/metrics/roc_auc_score.py index ec94ca74ee..83d0f4b0e6 100644 --- a/deeppavlov/metrics/roc_auc_score.py +++ b/deeppavlov/metrics/roc_auc_score.py @@ -39,7 +39,7 @@ def roc_auc_score_np(y_true: [list, np.ndarray], y_pred: [list, np.ndarray]) -> @register_metric('classification_roc_auc') -def classification_roc_auc_score(y_true: List[list], y_predicted: List[Tuple[np.ndarray, dict]]) -> float: +def classification_roc_auc_score(y_true: List[list], y_predicted: List[Tuple[list, dict]]) -> float: """ Compute Area Under the Curve (AUC) from prediction scores. diff --git a/deeppavlov/metrics/squad_metrics.py b/deeppavlov/metrics/squad_metrics.py index 9f710d9b9f..9161db2384 100644 --- a/deeppavlov/metrics/squad_metrics.py +++ b/deeppavlov/metrics/squad_metrics.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Tuple import re import string @@ -21,39 +22,39 @@ @register_metric('exact_match') -def exact_match(y_true, y_predicted): +def exact_match(y_true: List[Tuple[List[str], List[int]]], y_predicted: List[Tuple[str, int, float]]): """ Calculates Exact Match score between y_true and y_predicted EM score uses the best matching y_true answer: if y_pred equal at least to one answer in y_true then EM = 1, else EM = 0 Args: y_true: list of tuples (y_true_text, y_true_start), y_true_text and y_true_start are lists of len num_answers - y_predicted: list of tuples (y_pred_text, y_pred_start), y_pred_text : str, y_pred_start : int + y_predicted: list of tuples (y_pred_text, y_pred_start, logit), y_pred_text : str, y_pred_start : int, logit: float Returns: exact match score : float """ EM_total = 0 - for (ground_truth, _), (prediction, _) in zip(y_true, y_predicted): + for (ground_truth, _), (prediction, *_) in zip(y_true, y_predicted): EMs = [int(normalize_answer(gt) == normalize_answer(prediction)) for gt in ground_truth] EM_total += max(EMs) return 100 * EM_total / len(y_true) if len(y_true) > 0 else 0 @register_metric('squad_f1') -def squad_f1(y_true, y_predicted): +def squad_f1(y_true: List[Tuple[List[str], List[int]]], y_predicted: List[Tuple[str, int, float]]): """ Calculates F-1 score between y_true and y_predicted F-1 score uses the best matching y_true answer Args: y_true: list of tuples (y_true_text, y_true_start), y_true_text and y_true_start are lists of len num_answers - y_predicted: list of tuples (y_pred_text, y_pred_start), y_pred_text : str, y_pred_start : int + y_predicted: list of tuples (y_pred_text, y_pred_start, logit), y_pred_text : str, y_pred_start : int, logit: float Returns: F-1 score : float """ f1_total = 0.0 - for (ground_truth, _), (prediction, _) in zip(y_true, y_predicted): + for (ground_truth, _), (prediction, *_) in zip(y_true, y_predicted): prediction_tokens = normalize_answer(prediction).split() f1s = [] for gt in ground_truth: diff --git a/deeppavlov/models/__init__.py b/deeppavlov/models/__init__.py index e69de29bb2..2094742013 100644 --- a/deeppavlov/models/__init__.py +++ b/deeppavlov/models/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nltk + +from deeppavlov.core.common.prints import RedirectedPrints + + +with RedirectedPrints(): + nltk.download('punkt') + nltk.download('stopwords') + nltk.download('perluniprops') + nltk.download('nonbreaking_prefixes') diff --git a/deeppavlov/models/classifiers/cos_sim_classifier.py b/deeppavlov/models/classifiers/cos_sim_classifier.py new file mode 100644 index 0000000000..46cedd3397 --- /dev/null +++ b/deeppavlov/models/classifiers/cos_sim_classifier.py @@ -0,0 +1,130 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Tuple, Union + +import numpy as np +from scipy.sparse.linalg import norm as sparse_norm +from scipy.sparse import vstack +from scipy.sparse import csr_matrix + +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.models.estimator import Estimator +from deeppavlov.core.common.file import save_pickle +from deeppavlov.core.common.file import load_pickle +from deeppavlov.core.commands.utils import expand_path, make_all_dirs +from deeppavlov.core.models.serializable import Serializable + +logger = get_logger(__name__) + + +@register("cos_sim_classifier") +class CosineSimilarityClassifier(Estimator, Serializable): + """ + Classifier based on cosine similarity between vectorized sentences + + Parameters: + save_path: path to save the model + load_path: path to load the model + + Returns: + None + """ + + def __init__(self, top_n: int = 1, save_path: str = None, load_path: str = None, **kwargs) -> None: + self.save_path = save_path + self.load_path = load_path + self.top_n = top_n + if kwargs['mode'] != 'train': + self.load() + + def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[int]]: + """Find the most similar answer for each input vectorized question + + Parameters: + q_vects: vectorized questions + + Returns: + Tuple of Answer and Score + """ + + if isinstance(q_vects[0], csr_matrix): + norm = sparse_norm(q_vects) * sparse_norm(self.x_train_features, axis=1) + cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense())/norm + elif isinstance(q_vects[0], np.ndarray): + q_vects = np.array(q_vects) + norm = np.linalg.norm(q_vects)*np.linalg.norm(self.x_train_features, axis=1) + cos_similarities = q_vects.dot(self.x_train_features.T)/norm + elif q_vects[0] is None: + cos_similarities = np.zeros(len(self.x_train_features)) + else: + raise NotImplementedError('Not implemented this type of vectors') + + # get cosine similarity for each class + y_labels = np.unique(self.y_train) + labels_scores = np.zeros((len(cos_similarities), len(y_labels))) + for i, label in enumerate(y_labels): + labels_scores[:, i] = np.max([cos_similarities[:, i] for i, value in enumerate(self.y_train) if value == label], axis=0) + + # normalize for each class + labels_scores = labels_scores/labels_scores.sum(axis=1, keepdims=True) + answer_ids = np.argsort(labels_scores)[:, -self.top_n:] + + # generate top_n answers and scores + answers = [] + scores = [] + for i in range(len(answer_ids)): + answers.append([y_labels[id] for id in answer_ids[i, ::-1]]) + scores.append([np.round(labels_scores[i, id], 2) for id in answer_ids[i, ::-1]]) + + return answers, scores + + def fit(self, x_train_vects: Tuple[Union[csr_matrix, List]], y_train: Tuple[str]) -> None: + """Train classifier + + Parameters: + x_train_vects: vectorized questions for train dataset + y_train: answers for train dataset + + Returns: + None + """ + if isinstance(x_train_vects, tuple): + if len(x_train_vects) != 0: + if isinstance(x_train_vects[0], csr_matrix): + self.x_train_features = vstack(list(x_train_vects)) +
elif isinstance(x_train_vects[0], np.ndarray): + self.x_train_features = np.vstack(list(x_train_vects)) + else: + raise NotImplementedError('Not implemented this type of vectors') + else: + raise ValueError("Train vectors can't be empty") + else: + self.x_train_features = x_train_vects + + self.y_train = list(y_train) + + def save(self) -> None: + """Save classifier parameters""" + logger.info("Saving faq_model to {}".format(self.save_path)) + path = expand_path(self.save_path) + make_all_dirs(path) + save_pickle((self.x_train_features, self.y_train), path) + + def load(self) -> None: + """Load classifier parameters""" + logger.info("Loading faq_model from {}".format(self.load_path)) + self.x_train_features, self.y_train = load_pickle(expand_path(self.load_path)) diff --git a/deeppavlov/models/classifiers/keras_classification_model.py b/deeppavlov/models/classifiers/keras_classification_model.py index 446088cf97..d34c03b25c 100644 --- a/deeppavlov/models/classifiers/keras_classification_model.py +++ b/deeppavlov/models/classifiers/keras_classification_model.py @@ -34,6 +34,7 @@ from deeppavlov.core.common.log import get_logger from deeppavlov.core.layers.keras_layers import additive_self_attention, multiplicative_self_attention +from keras import backend as K log = get_logger(__name__) @@ -47,6 +48,7 @@ class KerasClassificationModel(KerasModel): text_size: maximal length of text in tokens (words), longer texts are cutted, shorter ones are padded by zeros (pre-padding) + embedding_size: embedding_size from embedder in pipeline model_name: particular method of this class to initialize model configuration optimizer: function name from keras.optimizers loss: function name from keras.losses. @@ -62,8 +64,6 @@ class KerasClassificationModel(KerasModel): If `last_layer_activation` is `softmax` (not multi-label classification), assign to 1. classes: list of classes names presented in the dataset (in config it is determined as keys of vocab over `y`) - embedder: embedder - tokenizer: tokenizer Attributes: opt: dictionary with all model parameters @@ -79,7 +79,7 @@ class KerasClassificationModel(KerasModel): optimizer: keras.optimizers instance """ - def __init__(self, text_size: int, + def __init__(self, text_size: int, embedding_size: int, model_name: str, optimizer: str = "Adam", loss: str = "binary_crossentropy", lear_rate: float = 0.01, lear_rate_decay: float = 0., last_layer_activation="sigmoid", @@ -89,13 +89,11 @@ def __init__(self, text_size: int, Initialize and train vocabularies, initializes embedder, tokenizer, and then initialize model using parameters from opt dictionary (from config), if model is being initialized from saved. 
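A stripped-down NumPy sketch (dense vectors and made-up data) of the scoring idea in `CosineSimilarityClassifier.__call__` above: cosine similarity against every train question, a per-class maximum, normalization, then top-n selection:

```python
# Made-up data; dense-vector illustration of the per-class cosine-similarity scoring.
import numpy as np

x_train = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])  # vectorized train questions
y_train = np.array(["greet", "greet", "bye"])              # their answers/classes
q = np.array([[0.8, 0.2]])                                 # one vectorized query

sims = q @ x_train.T / (np.linalg.norm(q) * np.linalg.norm(x_train, axis=1))
labels = np.unique(y_train)
scores = np.array([[sims[0, y_train == lbl].max() for lbl in labels]])
scores = scores / scores.sum(axis=1, keepdims=True)        # normalize over classes

top = np.argsort(scores)[:, -1:]                            # top-1 class per query
print(labels[top[0, ::-1]], np.round(scores[0, top[0, ::-1]], 2))
```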
""" - super().__init__(text_size=text_size, model_name=model_name, + super().__init__(text_size=text_size, embedding_size=embedding_size, model_name=model_name, optimizer=optimizer, loss=loss, lear_rate=lear_rate, lear_rate_decay=lear_rate_decay, last_layer_activation=last_layer_activation, confident_threshold=confident_threshold, **kwargs) # self.opt = copy(kwargs) initialized in here - self.tokenizer = self.opt.pop('tokenizer') - self.fasttext_model = self.opt.pop('embedder') self.classes = list(np.sort(np.array(list(self.opt.get('classes'))))) self.opt['classes'] = self.classes @@ -103,10 +101,7 @@ def __init__(self, text_size: int, if self.n_classes == 0: ConfigError("Please, provide vocabulary with considered intents.") - self.opt['embedding_size'] = self.fasttext_model.dim - - if self.fasttext_model.load_path: - current_fasttext_md5 = md5_hashsum([self.fasttext_model.load_path]) + self.opt['embedding_size'] = embedding_size # Parameters required to init model params = {"model_name": self.opt.get('model_name'), @@ -115,28 +110,15 @@ def __init__(self, text_size: int, "lear_rate": self.opt.get('lear_rate'), "lear_rate_decay": self.opt.get('lear_rate_decay')} - self.model: Model = self.load(**params) - self._change_not_fixed_params(text_size=text_size, model_name=model_name, + self.model = self.load(**params) + self._change_not_fixed_params(text_size=text_size, embedding_size=embedding_size, model_name=model_name, optimizer=optimizer, loss=loss, lear_rate=lear_rate, lear_rate_decay=lear_rate_decay, last_layer_activation=last_layer_activation, confident_threshold=confident_threshold, **kwargs) - # Check if md5 hash sum of current loaded fasttext model - # is equal to saved - try: - self.opt['fasttext_md5'] - except KeyError: - self.opt['fasttext_md5'] = current_fasttext_md5 - else: - if self.opt['fasttext_md5'] != current_fasttext_md5: - raise ConfigError( - "Given fasttext model does NOT match fasttext model used previously to train loaded model") - - summary = ['Model was successfully initialized!', 'Model summary:'] - self.model.summary(print_fn=summary.append) - log.info('\n'.join(summary)) + print("Model was successfully initialized!\nModel summary:\n{}".format(self.model.summary())) def _change_not_fixed_params(self, **kwargs) -> None: """ @@ -167,9 +149,9 @@ def _change_not_fixed_params(self, **kwargs) -> None: self.opt[param] = kwargs.get(param) return - def texts2vec(self, sentences: List[List[str]]) -> np.ndarray: + def pad_texts(self, sentences: List[List[np.ndarray]]) -> np.ndarray: """ - Convert texts to vector representations using embedder and padding up to self.opt["text_size"] tokens + Cut and pad tokenized texts to self.opt["text_size"] tokens Args: sentences: list of lists of tokens @@ -178,60 +160,56 @@ def texts2vec(self, sentences: List[List[str]]) -> np.ndarray: array of embedded texts """ pad = np.zeros(self.opt['embedding_size']) - embeddings_batch = self.fasttext_model([sen[:self.opt['text_size']] for sen in sentences]) - embeddings_batch = [[pad] * (self.opt['text_size'] - len(tokens)) + tokens for tokens in embeddings_batch] - - embeddings_batch = np.asarray(embeddings_batch) - return embeddings_batch + cutted_batch = [sen[:self.opt['text_size']] for sen in sentences] + cutted_batch = [[pad] * (self.opt['text_size'] - len(tokens)) + list(tokens) for tokens in cutted_batch] + return np.asarray(cutted_batch) - def train_on_batch(self, texts: List[str], labels: list) -> [float, List[float]]: + def train_on_batch(self, texts: List[List[np.ndarray]], labels: list) 
-> [float, List[float]]: """ Train the model on the given batch Args: - texts: list of texts + texts: list of tokenized text samples labels: list of labels Returns: metrics values on the given batch """ - if isinstance(texts[0], str): - texts = self.tokenizer(list(texts)) - features = self.texts2vec(texts) + K.set_session(self.sess) + features = self.pad_texts(texts) onehot_labels = labels2onehot(labels, classes=self.classes) metrics_values = self.model.train_on_batch(features, onehot_labels) return metrics_values - def infer_on_batch(self, texts: List[str], labels: list = None) -> [float, List[float], np.ndarray]: + def infer_on_batch(self, texts: List[List[np.ndarray]], labels: list = None) -> [float, List[float], np.ndarray]: """ Infer the model on the given batch Args: - texts: list of texts + texts: list of tokenized text samples labels: list of labels Returns: metrics values on the given batch, if labels are given predictions, otherwise """ - if isinstance(texts[0], str): - texts = self.tokenizer(list(texts)) + K.set_session(self.sess) if labels: - features = self.texts2vec(texts) + features = self.pad_texts(texts) onehot_labels = labels2onehot(labels, classes=self.classes) metrics_values = self.model.test_on_batch(features, onehot_labels) return metrics_values else: - features = self.texts2vec(texts) + features = self.pad_texts(texts) predictions = self.model.predict(features) return predictions - def __call__(self, data: List[str], *args) -> Tuple[np.ndarray, List[dict]]: + def __call__(self, data: List[List[str]], *args) -> Tuple[List[list], List[dict]]: """ Infer on the given data Args: - data: list of sentences + data: list of tokenized text samples *args: additional arguments Returns: @@ -239,7 +217,7 @@ def __call__(self, data: List[str], *args) -> Tuple[np.ndarray, List[dict]]: vector of probabilities to belong with each class or list of labels sentence belongs with """ - preds = np.array(self.infer_on_batch(data)) + preds = np.array(self.infer_on_batch(data), dtype="float64") labels = proba2labels(preds, confident_threshold=self.opt['confident_threshold'], classes=self.classes) return labels, [dict(zip(self.classes, preds[i])) for i in range(preds.shape[0])] diff --git a/deeppavlov/models/classifiers/logreg_classifier.py b/deeppavlov/models/classifiers/logreg_classifier.py new file mode 100644 index 0000000000..8b45d7911b --- /dev/null +++ b/deeppavlov/models/classifiers/logreg_classifier.py @@ -0,0 +1,114 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Tuple, Union + +import numpy as np +from scipy.sparse import vstack +from scipy.sparse import csr_matrix +from sklearn.linear_model import LogisticRegression + +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.models.estimator import Estimator +from deeppavlov.core.common.file import save_pickle +from deeppavlov.core.common.file import load_pickle +from deeppavlov.core.commands.utils import expand_path, make_all_dirs +from deeppavlov.core.models.serializable import Serializable + +logger = get_logger(__name__) + + +@register("logreg_classifier") +class LogregClassifier(Estimator, Serializable): + """ + Logistic Regression Classifier + + Parameters: + top_n: how many top answers classifier'll return for input vectorized question + c: regularization strength in logistic regression model + penalty: regularization penalty type in logistic regression model + save_path: path to save the model + load_path: path to load the model + + Returns: + None + """ + def __init__(self, top_n: int = 1, c: int = 1, penalty: str = 'l2', save_path: str = None, load_path: str = None, **kwargs) -> None: + self.save_path = save_path + self.load_path = load_path + self.top_n = top_n + self.c = c + self.penalty = penalty + if kwargs['mode'] != 'train': + self.load() + + def __call__(self, q_vects: List) -> Tuple[List[str], List[int]]: + """Found most similar answer for input vectorized questions + + Parameters: + q_vects: vectorized questions + + Returns: + Tuple of Answer and Score + """ + + probs = self.logreg.predict_proba(q_vects) + answer_ids = np.argsort(probs)[:, -self.top_n:] + + answers = [] + scores = [] + for i in range(len(answer_ids)): + answers.append([self.logreg.classes_[id] for id in answer_ids[i, ::-1]]) + scores.append([np.round(probs[i, id], 2) for id in answer_ids[i, ::-1]]) + + return answers, scores + + def fit(self, x_train_vects: Tuple[Union[csr_matrix, List]], y_train: Tuple[str]) -> None: + """Train classifier + + Parameters: + x_train_vects: vectorized questions for train dataset + y_train: answers for train dataset + + Returns: + None + """ + if isinstance(x_train_vects, tuple): + if len(x_train_vects) != 0: + if isinstance(x_train_vects[0], csr_matrix): + x_train_features = vstack(list(x_train_vects)) + elif isinstance(x_train_vects[0], np.ndarray): + x_train_features = np.vstack(list(x_train_vects)) + else: + raise NotImplementedError('Not implemented this type of vectors') + else: + raise ValueError("Train vectors can't be empty") + else: + x_train_features = x_train_vects + + self.logreg = LogisticRegression(C=self.c, penalty=self.penalty) + self.logreg.fit(x_train_features, list(y_train)) + + def save(self) -> None: + """Save classifier parameters""" + logger.info("Saving faq_logreg_model to {}".format(self.save_path)) + path = expand_path(self.save_path) + make_all_dirs(path) + save_pickle(self.logreg, path) + + def load(self) -> None: + """Load classifier parameters""" + logger.info("Loading faq_logreg_model from {}".format(self.load_path)) + self.logreg = load_pickle(expand_path(self.load_path)) diff --git a/deeppavlov/models/classifiers/utils.py b/deeppavlov/models/classifiers/utils.py index ed4364cf64..02cbc158b1 100644 --- a/deeppavlov/models/classifiers/utils.py +++ b/deeppavlov/models/classifiers/utils.py @@ -49,7 +49,7 @@ def labels2onehot(labels: [list, np.ndarray], classes: [list, np.ndarray]) -> n return y -def proba2labels(proba: [list, np.ndarray], 
confident_threshold: float, classes: [list, np.ndarray]) -> np.ndarray: +def proba2labels(proba: [list, np.ndarray], confident_threshold: float, classes: [list, np.ndarray]) -> List[List]: """ Convert vectors of probabilities to labels using confident threshold (if probability to belong with the class is bigger than confident_threshold, sample belongs with the class; @@ -61,16 +61,16 @@ def proba2labels(proba: [list, np.ndarray], confident_threshold: float, classes: classes: array of classes' names Returns: - array of lists of labels for each sample + list of lists of labels for each sample """ y = [] for sample in proba: to_add = np.where(sample > confident_threshold)[0] if len(to_add) > 0: - y.append(np.array(classes)[to_add]) + y.append(np.array(classes)[to_add].tolist()) else: - y.append(np.array([np.array(classes)[np.argmax(sample)]])) - y = np.asarray(y) + y.append(np.array([np.array(classes)[np.argmax(sample)]]).tolist()) + return y diff --git a/deeppavlov/models/embedders/elmo_embedder.py b/deeppavlov/models/embedders/elmo_embedder.py index cd505c9826..5d646ddbb6 100644 --- a/deeppavlov/models/embedders/elmo_embedder.py +++ b/deeppavlov/models/embedders/elmo_embedder.py @@ -40,6 +40,7 @@ class ELMoEmbedder(Component): ``tenserflow_hub.load_module_spec`` by using `TensorFlow Hub `__. dim: Dimensionality of output token embeddings of ELMo model. pad_zero: Whether to use pad samples or not. + mean: Whether to return a mean ELMo embedding of tokens per sample. Examples: You can use ELMo models from DeepPavlov as usual `TensorFlow Hub Module @@ -67,11 +68,13 @@ class ELMoEmbedder(Component): """ - def __init__(self, spec: str, dim: int = 1024, pad_zero: bool = False, **kwargs) -> None: + def __init__(self, spec: str, dim: int = 1024, pad_zero: bool = False, mean: bool = False, + **kwargs) -> None: self.spec = spec if '://' in spec else str(expand_path(spec)) self.dim = dim self.pad_zero = pad_zero + self.mean = mean self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load() def _load(self): @@ -104,21 +107,27 @@ def _load(self): return elmo_outputs, sess, tokens_ph, tokens_length_ph @overrides - def __call__(self, batch: List[List[str]], mean: bool = False, + def __call__(self, batch: List[List[str]], *args, **kwargs) -> Union[List[np.ndarray], np.ndarray]: """ Embed sentences from a batch. Args: batch: A list of tokenized text samples. - mean: Whether to return a mean ELMo embedding of tokens per sample. Returns: A batch of ELMo embeddings. 
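A toy check (made-up classes and probabilities) of the thresholding rule in the `proba2labels` change above: classes whose probability exceeds `confident_threshold` are kept, otherwise the argmax class is used, and the result is now a plain list of lists:

```python
# Made-up classes and probabilities; mirrors the thresholding rule in proba2labels.
import numpy as np

classes = np.array(["food", "area", "pricerange"])
proba = np.array([[0.8, 0.6, 0.1],
                  [0.2, 0.3, 0.4]])
confident_threshold = 0.5

y = []
for sample in proba:
    to_add = np.where(sample > confident_threshold)[0]
    if len(to_add) > 0:
        y.append(classes[to_add].tolist())
    else:
        y.append(classes[[np.argmax(sample)]].tolist())
print(y)  # [['food', 'area'], ['pricerange']]
```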
""" - if not (batch and batch[0]): + if not batch: empty_vec = np.zeros(self.dim, dtype=np.float32) - return [empty_vec] if mean else [[empty_vec]] + return [empty_vec] if self.mean else [[empty_vec]] + + filled_batch = [] + for batch_line in batch: + batch_line = batch_line if batch_line else [''] + filled_batch.append(batch_line) + + batch = filled_batch tokens_length = [len(batch_line) for batch_line in batch] tokens_length_max = max(tokens_length) @@ -133,7 +142,7 @@ def __call__(self, batch: List[List[str]], mean: bool = False, } ) - if mean: + if self.mean: batch = elmo_outputs['default'] dim0, dim1 = batch.shape diff --git a/deeppavlov/models/evolution/Results_analysis.ipynb b/deeppavlov/models/evolution/Results_analysis.ipynb index 1fe799787e..9fd9f2615c 100644 --- a/deeppavlov/models/evolution/Results_analysis.ipynb +++ b/deeppavlov/models/evolution/Results_analysis.ipynb @@ -272,7 +272,7 @@ " evolution.get_value_from_config(params_dictionaries[i], param_path),\n", " c=colors[np.where(color_ids == i)[0][0]], alpha=0.5)\n", " elif param_dict.get(\"evolve_choice\"):\n", - " values = np.array(param_dict.get(\"values\"))\n", + " values = np.array(param_dict.get(\"evolve_choice\"))\n", " plt.scatter(i // POPULATION_SIZE, \n", " np.where(values == evolution.get_value_from_config(\n", " params_dictionaries[i], param_path))[0][0],\n", diff --git a/deeppavlov/models/evolution/evolution_param_generator.py b/deeppavlov/models/evolution/evolution_param_generator.py index 80241082be..be98da6385 100644 --- a/deeppavlov/models/evolution/evolution_param_generator.py +++ b/deeppavlov/models/evolution/evolution_param_generator.py @@ -15,19 +15,18 @@ import numpy as np from copy import deepcopy from pathlib import Path -import random from typing import List, Generator, Tuple, Any from deeppavlov.core.common.registry import register from deeppavlov.core.common.file import read_json from deeppavlov.core.common.log import get_logger - +from deeppavlov.core.common.params_search import ParamsSearch log = get_logger(__name__) @register('params_evolution') -class ParamsEvolution: +class ParamsEvolution(ParamsSearch): """ Class performs full evolutionary process (task scores -> max): 1. 
initializes random population @@ -49,11 +48,13 @@ class ParamsEvolution: seed: random seed for initialization train_partition: integer number of train data parts elitism_with_weights: whether to save elite models with weigths or without + prefix: prefix to determine special keys like `PREFIX_range`, `PREFIX_bool`, `PREFIX_choice` **kwargs: basic config with parameters Attributes: basic_config: dictionary with initial evolutionary config main_model_path: list of keys and/or integers (for list) with relative path to main model (subdictionary) + prefix: prefix to determine special keys like `PREFIX_range`, `PREFIX_bool`, `PREFIX_choice` population_size: number of individuums per generation p_crossover: probability to cross over for current replacement p_mutation: probability of mutation for current replacement @@ -62,9 +63,9 @@ class ParamsEvolution: elitism_with_weights: whether to save elite models with weigths or without n_saved_best_pretrained: number of saved models per current generation train_partition: integer number of train data parts - paths_to_evolving_params: list of lists of keys and/or integers (for list) + paths_to_params: list of lists of keys and/or integers (for list) with relative paths to evolving parameters - n_evolving_params: number of evolving parameters + n_params: number of evolving parameters evolution_model_id: identity number of model (the same for loaded pre-trained models) eps: EPS value paths_to_fiton_dicts: list of lists of keys and/or integers (for list) @@ -82,12 +83,13 @@ def __init__(self, seed:int = None, train_partition: int = 1, elitism_with_weights: bool = False, + prefix="evolve", **kwargs): """ Initialize evolution with random population """ + super().__init__(prefix=prefix, seed=seed, **kwargs) - self.basic_config = deepcopy(kwargs) self.main_model_path = list(self.find_model_path(self.basic_config, key_main_model))[0] log.info("Main model path in config: {}".format(self.main_model_path)) @@ -100,20 +102,7 @@ def __init__(self, self.n_saved_best_pretrained = 0 self.train_partition = train_partition - - self.paths_to_evolving_params = [] - for evolve_type in ["evolve_range", "evolve_choice", "evolve_bool"]: - for path_ in self.find_model_path(self.basic_config, evolve_type): - self.paths_to_evolving_params.append(path_) - - self.n_evolving_params = len(self.paths_to_evolving_params) self.evolution_model_id = 0 - self.eps = 1e-6 - - self.paths_to_fiton_dicts = [] - for path_ in self.find_model_path(self.basic_config, "fit_on"): - self.paths_to_fiton_dicts.append(path_) - self.n_fiton_dicts = len(self.paths_to_fiton_dicts) try: self.evolve_metric_optimization = self.get_value_from_config( @@ -122,112 +111,6 @@ def __init__(self, except: self.evolve_metric_optimization = "maximize" - if seed is None: - pass - else: - np.random.seed(seed) - random.seed(seed) - - def find_model_path(self, config: dict, key_model: str, path: list = []) -> Generator: - """ - Find path to dictionary in config that contains key 'key_model' - - Args: - config: dictionary - key_model: key of sub-dictionary to be found - path: list of keys and/or integers (for list) with relative path (needed for recursion) - - Returns: - path in config -- list of keys (strings and integers) - """ - config_pointer = config - if type(config_pointer) is dict and key_model in config_pointer.keys(): - # main model is an element of chainer.pipe list - # main model is a dictionary and has key key_main_model - yield path - else: - if type(config_pointer) is dict: - for key in 
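On the config side, the special keys that `ParamsEvolution` (now via `ParamsSearch`, with the default prefix `evolve`) looks for can be sketched as a Python fragment. The parameter names and ranges below are illustrative, and the shape of `evolve_choice` follows the notebook fix above, which reads the candidate values directly from that key.

```python
# Illustrative fragment of a model sub-config with evolving parameters (not from the repo).
model_params = {
    "learning_rate": {"evolve_range": [1e-4, 1e-1], "scale": "log"},  # sampled on a log scale
    "dense_size": {"evolve_range": [50, 200], "discrete": True},      # integer-valued
    "use_highway": {"evolve_bool": True},                             # True or False
    "model_name": {"evolve_choice": ["cnn_model", "bilstm_model"]},   # one of the listed values
}
```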
list(config_pointer.keys()): - for path_ in self.find_model_path(config_pointer[key], key_model, path + [key]): - yield path_ - elif type(config_pointer) is list: - for i in range(len(config_pointer)): - for path_ in self.find_model_path(config_pointer[i], key_model, path + [i]): - yield path_ - - @staticmethod - def insert_value_or_dict_into_config(config: dict, path: list, - value: [int, float, str, bool, list, dict, np.ndarray]) -> dict: - """ - Insert value to dictionary determined by path[:-1] in field with key path[-1] - - Args: - config: dictionary - path: list of keys and/or integers (for list) - value: value to be inserted - - Returns: - config with inserted value - """ - config_copy = deepcopy(config) - config_pointer = config_copy - for el in path[:-1]: - if type(config_pointer) is dict: - config_pointer = config_pointer.setdefault(el, {}) - elif type(config_pointer) is list: - config_pointer = config_pointer[el] - else: - pass - config_pointer[path[-1]] = value - return config_copy - - @staticmethod - def get_value_from_config(config: dict, path: list) -> Any: - """ - Return value of config element determined by path - - Args: - config: dictionary - path: list of keys and/or integers (for list) - - Returns: - value - """ - config_copy = deepcopy(config) - config_pointer = config_copy - for el in path[:-1]: - if type(config_pointer) is dict: - config_pointer = config_pointer.setdefault(el, {}) - elif type(config_pointer) is list: - config_pointer = config_pointer[el] - else: - pass - return config_pointer[path[-1]] - - def initialize_params_in_config(self, basic_config: dict, paths: List[list]) -> dict: - """ - Randomly initialize all the changable parameters in config - - Args: - basic_config: config where changable parameters are dictionaries with keys - `evolve_range`, `evolve_bool`, `evolve_choice` - paths: list of paths to changable parameters - - Returns: - config - """ - config = deepcopy(basic_config) - for path_ in paths: - param_name = path_[-1] - value = self.get_value_from_config(basic_config, path_) - if type(value) is dict: - if value.get("evolve_choice") or value.get("evolve_range") or value.get("evolve_bool"): - config = self.insert_value_or_dict_into_config( - config, path_, - self.sample_params(**{param_name: deepcopy(value)})[param_name]) - - return config - def first_generation(self, iteration: int = 0) -> List[dict]: """ Initialize first generation randomly according to the given constraints is self.params @@ -240,7 +123,7 @@ def first_generation(self, iteration: int = 0) -> List[dict]: """ population = [] for i in range(self.population_size): - population.append(self.initialize_params_in_config(self.basic_config, self.paths_to_evolving_params)) + population.append(self.initialize_params_in_config(self.basic_config, self.paths_to_params)) for which_path in ["save_path", "load_path"]: population[-1] = self.insert_value_or_dict_into_config( population[-1], self.main_model_path + [which_path], @@ -460,28 +343,28 @@ def crossover(self, population: List[dict], scores: List[float]) -> List[dict]: parents = population[np.where(rs[0] > intervals)[0][-1]], population[np.where(rs[1] > intervals)[0][-1]] if self.decision(self.p_crossover): - params_perm = np.random.permutation(self.n_evolving_params) + params_perm = np.random.permutation(self.n_params) curr_offsprings = [deepcopy(parents[0]), deepcopy(parents[1])] - part = int(self.crossover_power * self.n_evolving_params) + part = int(self.crossover_power * self.n_params) - for j in range(self.n_evolving_params - 
part, self.n_evolving_params): + for j in range(self.n_params - part, self.n_params): curr_offsprings[0] = self.insert_value_or_dict_into_config(curr_offsprings[0], - self.paths_to_evolving_params[ + self.paths_to_params[ params_perm[j]], self.get_value_from_config( parents[1], - self.paths_to_evolving_params[ + self.paths_to_params[ params_perm[j]])) curr_offsprings[1] = self.insert_value_or_dict_into_config(curr_offsprings[1], - self.paths_to_evolving_params[ + self.paths_to_params[ params_perm[j]], self.get_value_from_config( parents[0], - self.paths_to_evolving_params[ + self.paths_to_params[ params_perm[j]])) offsprings.append(deepcopy(curr_offsprings[0])) else: @@ -503,7 +386,7 @@ def mutation(self, population: List[dict]) -> List[dict]: for individuum in population: mutated_individuum = deepcopy(individuum) - for path_ in self.paths_to_evolving_params: + for path_ in self.paths_to_params: param_value = self.get_value_from_config(individuum, path_) mutated_individuum = self.insert_value_or_dict_into_config( mutated_individuum, path_, @@ -570,72 +453,3 @@ def decision(self, probability: float = 1.) -> bool: return True else: return False - - def sample_params(self, **params) -> dict: - """ - Sample parameters according to the given possible values - - Args: - **params: dictionary like {"param_0": {"evolve_range": [0, 10]}, - "param_1": {"evolve_range": [0, 10], "discrete": true}, - "param_2": {"evolve_range": [0, 1], "scale": "log"}, - "param_3": {"evolve_bool": true}, - "param_4": [0, 1, 2, 3]} - - Returns: - random parameter value - """ - if not params: - return {} - else: - params_copy = deepcopy(params) - params_sample = dict() - for param, param_val in params_copy.items(): - if isinstance(param_val, dict): - if 'evolve_bool' in param_val and param_val['evolve_bool']: - sample = bool(random.choice([True, False])) - elif 'evolve_range' in param_val: - sample = self._sample_from_ranges(param_val) - elif 'evolve_choice' in param_val: - sample = random.choice(param_val['values']) - params_sample[param] = sample - else: - params_sample[param] = params_copy[param] - return params_sample - - def _sample_from_ranges(self, opts: dict) -> [int, float]: - """ - Sample parameters from ranges - - Args: - opts: dictionary {"param_0": {"evolve_range": [0, 10]}, - "param_1": {"evolve_range": [0, 10], "discrete": true}, - "param_2": {"evolve_range": [0, 1], "scale": "log"}} - - Returns: - random parameter value from range - """ - from_ = opts['evolve_range'][0] - to_ = opts['evolve_range'][1] - if opts.get('scale', None) == 'log': - sample = self._sample_log(from_, to_) - else: - sample = np.random.uniform(from_, to_) - if opts.get('discrete', False): - sample = int(np.round(sample)) - return sample - - @staticmethod - def _sample_log(from_: float = 0., to_: float = 1.) 
-> float: - """ - Sample parameters from ranges with log scale - - Args: - from_: lower boundary of values - to_: upper boundary of values - - Returns: - random parameters value from range with log scale - """ - sample = np.exp(np.random.uniform(np.log(from_), np.log(to_))) - return float(sample) diff --git a/deeppavlov/models/go_bot/bot.py b/deeppavlov/models/go_bot/bot.py index 8ba3c3a3c2..6606d70b9a 100644 --- a/deeppavlov/models/go_bot/bot.py +++ b/deeppavlov/models/go_bot/bot.py @@ -197,7 +197,7 @@ def _encode_context(self, context, db_result=None): # Intent features intent_features = [] if callable(self.intent_classifier): - intent, intent_probs = self.intent_classifier([tokens]) + intent, intent_probs = self.intent_classifier([context]) intent_features = np.array([intent_probs[0][i] for i in self.intents], dtype=np.float32) if self.debug: diff --git a/deeppavlov/models/go_bot/network.py b/deeppavlov/models/go_bot/network.py index ce638e8e1f..618f0c5703 100644 --- a/deeppavlov/models/go_bot/network.py +++ b/deeppavlov/models/go_bot/network.py @@ -46,19 +46,20 @@ class GoalOrientedBotNetwork(TFModel): hidden_size: size of rnn hidden layer. action_size: size of rnn output (equals to number of bot actions). obs_size: input features' size (must be equal to sum of output sizes of - ``bow_embedder``, ``embedder``, ``intent_classifier``, ``tracker.num_features`` - plus size of context features(=6) and ``action_size``). + ``bow_embedder``, ``embedder``, ``intent_classifier``, + ``tracker.num_features`` plus size of context features(=6) and + ``action_size``). learning_rate: learning rate during training. end_learning_rate: if set, learning rate starts from ``learning rate`` value and decays polynomially to the value of ``end_learning_rate``. decay_steps: number of steps for learning rate to decay. decay_power: power used to calculate learning rate decay for polynomial strategy. - dropout_rate: Probability of dropping out. Default: ``0.0``. + dropout_rate: probability of weights dropping out. l2_reg_coef: l2 regularization weight (applied to input and output layer). dense_size: rnn input size. optimizer: one of tf.train.Optimizer subclasses as a string. attention_mechanism: describes attention applied to embeddings of input tokens. - + * **type** – type of attention mechanism, possible values are ``'general'``, ``'bahdanau'``, ``'light_general'``, ``'light_bahdanau'``, ``'cs_general'`` and ``'cs_bahdanau'``. * **hidden_size** – attention hidden state size. * **max_num_tokens** – maximum number of input tokens. diff --git a/deeppavlov/models/morpho_tagger/cells.py b/deeppavlov/models/morpho_tagger/cells.py index 5fdceaaba4..dee161bdfb 100644 --- a/deeppavlov/models/morpho_tagger/cells.py +++ b/deeppavlov/models/morpho_tagger/cells.py @@ -160,7 +160,7 @@ def TemporalDropout(inputs, dropout=0.0): def positions_func(inputs, pad=0): """ A layer filling i-th column of a 2D tensor with - 1+ln(1+i) when it contaings a meaningful symbol + 1+ln(1+i) when it contains a meaningful symbol and with 0 when it contains PAD """ position_inputs = kb.cumsum(kb.ones_like(inputs, dtype="float32"), axis=1) diff --git a/deeppavlov/models/morpho_tagger/network.py b/deeppavlov/models/morpho_tagger/network.py index debce29a6c..62240539b5 100644 --- a/deeppavlov/models/morpho_tagger/network.py +++ b/deeppavlov/models/morpho_tagger/network.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
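The `obs_size` constraint in the go_bot network docstring is easiest to check with concrete numbers; the sizes below are invented purely to illustrate the sum.

```python
# Illustrative obs_size check: all numbers are made up.
bow_size = 0                 # bow_embedder output (disabled here)
embedding_size = 100         # embedder output
n_intents = 28               # intent_classifier output
tracker_num_features = 18    # tracker.num_features
context_features = 6         # fixed number of context features
action_size = 45             # number of bot actions

obs_size = (bow_size + embedding_size + n_intents
            + tracker_num_features + context_features + action_size)
print(obs_size)              # 197
```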
-from typing import List +from typing import List, Union, Tuple, Iterable import keras.layers as kl import keras.optimizers as ko @@ -30,22 +30,54 @@ class CharacterTagger: + """A class for character-based neural morphological tagger + + Parameters: + symbols: character vocabulary + tags: morphological tags vocabulary + word_rnn: the type of character-level network (only `cnn` implemented) + char_embeddings_size: the size of character embeddings + char_conv_layers: the number of convolutional layers on character level + char_window_size: the width of convolutional filter (filters) + char_filters: the number of convolutional filters for each window width + char_filter_multiple: the ratio between filters number and window width + char_highway_layers: the number of highway layers on character level + conv_dropout: the ratio of dropout between convolutional layers + highway_dropout: the ratio of dropout between highway layers, + intermediate_dropout: the ratio of dropout between convolutional + and highway layers on character level + lstm_dropout: dropout ratio in word-level LSTM + word_vectorizers: list of parameters for additional word-level vectorizers, + for each vectorizer it stores a pair of vectorizer dimension and + the dimension of the corresponding word embedding + word_lstm_layers: the number of word-level LSTM layers + word_lstm_units: hidden dimensions of word-level LSTMs + word_dropout: the ratio of dropout before word level (it is applied to word embeddings) + regularizer: l2 regularization parameter + verbose: the level of verbosity """ - A class for character-based neural morphological tagger - """ - def __init__(self, symbols: DefaultVocabulary, tags: DefaultVocabulary, - reverse=False, word_rnn="cnn", - char_embeddings_size=16, char_conv_layers=1, - char_window_size=5, char_filters=None, - char_filter_multiple=25, char_highway_layers=1, - conv_dropout=0.0, highway_dropout=0.0, - intermediate_dropout=0.0, lstm_dropout=0.0, - word_vectorizers=None, - word_lstm_layers=1, word_lstm_units=128, - word_dropout=0.0, regularizer=None, verbose=1): + def __init__(self, + symbols: DefaultVocabulary, + tags: DefaultVocabulary, + word_rnn: str = "cnn", + char_embeddings_size: int = 16, + char_conv_layers: int = 1, + char_window_size: Union[int, List[int]] = 5, + char_filters: Union[int, List[int]] = None, + char_filter_multiple: int = 25, + char_highway_layers: int = 1, + conv_dropout: float = 0.0, + highway_dropout: float = 0.0, + intermediate_dropout: float = 0.0, + lstm_dropout: float = 0.0, + word_vectorizers: List[Tuple[int, int]] = None, + word_lstm_layers: int = 1, + word_lstm_units: Union[int, List[int]] = 128, + word_dropout: float = 0.0, + regularizer: float = None, + verbose: int = 1): self.symbols = symbols self.tags = tags - self.reverse = reverse self.word_rnn = word_rnn self.char_embeddings_size = char_embeddings_size self.char_conv_layers = char_conv_layers @@ -63,11 +95,10 @@ def __init__(self, symbols: DefaultVocabulary, tags: DefaultVocabulary, self.word_lstm_units = word_lstm_units self.regularizer = regularizer self.verbose = verbose - self.initialize() - log.info("{} symbols, {} tags in CharacterTagger".format(self.symbols_number_, self.tags_number_)) + self._initialize() self.build() - def initialize(self): + def _initialize(self): if isinstance(self.char_window_size, int): self.char_window_size = [self.char_window_size] if self.char_filters is None or isinstance(self.char_filters, int): @@ -82,19 +113,27 @@ def initialize(self): self.word_vectorizers = [] if 
self.regularizer is not None: self.regularizer = kreg.l2(self.regularizer) + if self.verbose > 0: + log.info("{} symbols, {} tags in CharacterTagger".format(self.symbols_number_, self.tags_number_)) @property - def symbols_number_(self): + def symbols_number_(self) -> int: + """Character vocabulary size + """ return len(self.symbols) @property - def tags_number_(self): + def tags_number_(self) -> int: + """Tag vocabulary size + """ return len(self.tags) def build(self): + """Builds the network using Keras. + """ word_inputs = kl.Input(shape=(None, MAX_WORD_LENGTH+2), dtype="int32") inputs = [word_inputs] - word_outputs = self.build_word_cnn(word_inputs) + word_outputs = self._build_word_cnn(word_inputs) if len(self.word_vectorizers) > 0: additional_word_inputs = [kl.Input(shape=(None, input_dim), dtype="float32") for input_dim, dense_dim in self.word_vectorizers] @@ -102,7 +141,7 @@ def build(self): additional_word_embeddings = [kl.Dense(dense_dim)(additional_word_inputs[i]) for i, (_, dense_dim) in enumerate(self.word_vectorizers)] word_outputs = kl.Concatenate()([word_outputs] + additional_word_embeddings) - outputs, lstm_outputs = self.build_basic_network(word_outputs) + outputs, lstm_outputs = self._build_basic_network(word_outputs) compile_args = {"optimizer": ko.nadam(lr=0.002, clipnorm=5.0), "loss": "categorical_crossentropy", "metrics": ["accuracy"]} self.model_ = Model(inputs, outputs) @@ -111,8 +150,9 @@ def build(self): self.model_.summary(print_fn=log.info) return self - def build_word_cnn(self, inputs): - # inputs = kl.Input(shape=(MAX_WORD_LENGTH,), dtype="int32") + def _build_word_cnn(self, inputs): + """Builds word-level network + """ inputs = kl.Lambda(kb.one_hot, arguments={"num_classes": self.symbols_number_}, output_shape=lambda x: tuple(x) + (self.symbols_number_,))(inputs) char_embeddings = kl.Dense(self.char_embeddings_size, use_bias=False)(inputs) @@ -147,7 +187,7 @@ def build_word_cnn(self, inputs): highway_output = Highway(activation="relu")(highway_input) return highway_output - def build_basic_network(self, word_outputs): + def _build_basic_network(self, word_outputs): """ Creates the basic network architecture, transforming word embeddings to intermediate outputs @@ -182,24 +222,29 @@ def _transform_batch(self, data, labels=None, transform_to_one_hot=True): else: return X - def train_on_batch(self, data, labels): - """ - Trains model on a single batch + def train_on_batch(self, data: List[Iterable], labels: Iterable[list]): + """Trains model on a single batch - data: a batch of word sequences - labels: a batch of correct tag sequences + Args: + data: a batch of word sequences + labels: a batch of correct tag sequences + Returns: + the trained model """ X, Y = self._transform_batch(data, labels) - # TO_DO: add weights to deal with padded instances return self.model_.train_on_batch(X, Y) - def predict_on_batch(self, data: [list, tuple], return_indexes=False): + def predict_on_batch(self, data: Union[list, tuple], + return_indexes: bool = False) -> List[List[str]]: """ Makes predictions on a single batch - data: a batch of word sequences, - ----------------------------------------- - answer: a batch of label sequences + Args: + data: a batch of word sequences together with additional inputs + return_indexes: whether to return tag indexes in vocabulary or tags themselves + + Returns: + a batch of label sequences """ X = self._transform_batch(data) objects_number, lengths = len(X[0]), [len(elem) for elem in data[0]] @@ -211,7 +256,17 @@ def predict_on_batch(self, 
data: [list, tuple], return_indexes=False): answer[i] = elem if return_indexes else self.tags.idxs2toks(elem) return answer - def _make_sent_vector(self, sent, bucket_length=None): + def _make_sent_vector(self, sent: List, bucket_length: int =None) -> np.array: + """Transforms a sentence to Numpy array, which will be the network input. + + Args: + sent: input sentence + bucket_length: the width of the bucket + + Returns: + A 3d array, answer[i][j][k] contains the index of k-th letter + in j-th word of i-th input sentence. + """ bucket_length = bucket_length or len(sent) answer = np.zeros(shape=(bucket_length, MAX_WORD_LENGTH+2), dtype=np.int32) for i, word in enumerate(sent): @@ -224,6 +279,15 @@ def _make_sent_vector(self, sent, bucket_length=None): return answer def _make_tags_vector(self, tags, bucket_length=None): + """Transforms a sentence of tags to Numpy array, which will be the network target. + + Args: + sent: input sentence of tags + bucket_length: the width of the bucket + + Returns: + A 2d array, answer[i][j] contains the index of j-th tag in i-th input sentence. + """ bucket_length = bucket_length or len(tags) answer = np.zeros(shape=(bucket_length,), dtype=np.int32) for i, tag in enumerate(tags): @@ -231,10 +295,17 @@ def _make_tags_vector(self, tags, bucket_length=None): return answer def save(self, outfile): - """ - outfile: file with model weights (other model components should be given in config) + """Saves model weights to a file + + Args: + outfile: file with model weights (other model components should be given in config) """ self.model_.save_weights(outfile) def load(self, infile): + """Loads model weights from a file + + Args: + infile: file to load model weights from + """ self.model_.load_weights(infile) diff --git a/deeppavlov/models/morpho_tagger/tagger.py b/deeppavlov/models/morpho_tagger/tagger.py index 19bf5fdaac..72d9fa4384 100644 --- a/deeppavlov/models/morpho_tagger/tagger.py +++ b/deeppavlov/models/morpho_tagger/tagger.py @@ -26,56 +26,46 @@ @register("morpho_tagger") class MorphoTaggerWrapper(NNModel): - """ - Initialize the Model and additional parent classes attributes. + """A wrapper over morphological tagger, implemented in + :class:~deeppavlov.models.morpho_tagger.network.CharacterTagger. + A subclass of :class:`~deeppavlov.core.models.nn_model.NNModel` Args: - **kwargs: a dictionary containing parameters for model and parameters for training it formed from json config - file part that correspond to your model. - - Todo: - Add detailed arguments description + save_path: the path where model is saved + load_path: the path from where model is loaded + mode: usage mode + **kwargs: a dictionary containing model parameters specified in the main part + of json config that corresponds to the model """ - def __init__(self, **kwargs): - # Parameters for parent classes - save_path = kwargs.get('save_path', None) - load_path = kwargs.get('load_path', None) - train_now = kwargs.get('train_now', None) - mode = kwargs.get('mode', None) - - # Call parent constructors. Results in addition of attributes (save_path, - # load_path, train_now, mode to current instance) and creation of save_folder - # if it doesn't exist - super().__init__(save_path=save_path, load_path=load_path, - train_now=train_now, mode=mode) + def __init__(self, save_path: str = None, load_path: str = None, mode: str = None, **kwargs): + # Calls parent constructor. 
Results in creation of save_folder if it doesn't exist + super().__init__(save_path=save_path, load_path=load_path, mode=mode) # Dicts are mutable! To prevent changes in config dict outside this class # we use deepcopy opt = copy.deepcopy(kwargs) - # Find all input parameters of the network __init__ to pass them into network later + # Finds all input parameters of the network __init__ to pass them into network later network_parameter_names = list(inspect.signature(CharacterTagger.__init__).parameters) - # Fill all provided parameters from opt (opt is a dictionary formed from the model + # Fills all provided parameters from opt (opt is a dictionary formed from the model # json config file, except the "name" field) network_parameters = {par: opt[par] for par in network_parameter_names if par in opt} self._net = CharacterTagger(**network_parameters) - # Find all parameters for network train to pass them into train method later + # Finds all parameters for network train to pass them into train method later train_parameters_names = list(inspect.signature(self._net.train_on_batch).parameters) - # Fill all provided parameters from opt + # Fills all provided parameters from opt train_parameters = {par: opt[par] for par in train_parameters_names if par in opt} - self.train_parameters = train_parameters - self.opt = opt - # Try to load the model (if there are some model files the model will be loaded from them) + # Tries to load the model from model `load_path`, if it is available self.load() def load(self): - """Check existence of the model file, load the model if the file exists""" + """Checks existence of the model file, loads the model if the file exists""" # General way (load path from config assumed to be the path # to the file including extension of the file model) @@ -87,21 +77,19 @@ def load(self): self._net.load(path) def save(self): - """Save model to the save_path, provided in config. The directory is - already created by super().__init__ part in called in __init__ of this class""" + """Saves model to the save_path, provided in config. The directory is + already created by super().__init__, which is called in __init__ of this class""" path = str(self.save_path.absolute()) log.info('[saving model to {}]'.format(path)) self._net.save(path) def train_on_batch(self, *args): - """ Perform training of the network given the dataset data + """Trains the model on a single batch. Args: - x: an x batch - y: an y batch - - Returns: - + *args: the list of network inputs. + Last element of `args` is the batch of targets, + all previous elements are training data batches """ *data, labels = args self._net.train_on_batch(data, labels) @@ -112,9 +100,6 @@ def __call__(self, *x_batch, **kwargs): Args: instance: a batch to predict answers on - """ - # if len(args) > 0: - # x_batch = [x_batch] + list(args) return self._net.predict_on_batch(x_batch, **kwargs) diff --git a/deeppavlov/models/preprocessors/assemble_embeddings_matrix.py b/deeppavlov/models/preprocessors/assemble_embeddings_matrix.py index cebc33571b..c1f0a8648d 100644 --- a/deeppavlov/models/preprocessors/assemble_embeddings_matrix.py +++ b/deeppavlov/models/preprocessors/assemble_embeddings_matrix.py @@ -1,12 +1,62 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
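The wrapper's trick of filtering the config dict by the network constructor's signature is generic enough to show on a toy function (the function and dict below are illustrative stand-ins, not repo code):

```python
import inspect

def toy_network(hidden_size: int = 128, dropout: float = 0.1):
    """Stand-in for CharacterTagger.__init__ in this illustration."""
    return hidden_size, dropout

opt = {"hidden_size": 256, "dropout": 0.3, "learning_rate": 1e-3, "name": "morpho_tagger"}
accepted = list(inspect.signature(toy_network).parameters)
network_parameters = {par: opt[par] for par in accepted if par in opt}
print(network_parameters)    # {'hidden_size': 256, 'dropout': 0.3}
```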
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + import numpy as np -from deeppavlov.core.common.registry import register from sklearn.decomposition import PCA +from deeppavlov.core.common.registry import register +from deeppavlov.core.data.simple_vocab import SimpleVocabulary +from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder +from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder + @register('emb_mat_assembler') class EmbeddingsMatrixAssembler: - """Assembles matrix of embeddings obtained from some embedder.""" - def __init__(self, embedder, vocab, character_level=False, emb_dim=None, estimate_by_n=10000, *args, **kwargs): + """For a given Vocabulary assembles matrix of embeddings obtained from some `Embedder`. This + class also can assemble embeddins of characters using + + Args: + embedder: an instance of the class that convertes tokens to vectors. + For example :class:`~deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder` or + :class:`~deeppavlov.models.embedders.glove_embedder.GloVeEmbedder` + vocab: instance of :class:`~deeppavlov.core.data.SimpleVocab`. The matrix of embeddings + will be assembled relying on every token in the vocabulary. the indexing will match + vocabulary indexing. + character_level: whether to perform assembling on character level. This procedure will + assemble matrix with embeddings for every character using averaged embeddings of + words, that contain this character. + emb_dim: dimensionality of the resulting embeddings. If not `None` it should be less + or equal to the dimensionality of the embeddings provided by `Embedder`. The + reduction of dimensionality is performed by taking main components of PCA. + estimate_by_n: how much samples to use to estimate covariance matrix for PCA. + 10000 seems to be enough. + + Attributes: + dim: dimensionality of the embeddings (can be less than dimensionality of + embeddings produced by `Embedder`. 
+ """ + + def __init__(self, + embedder: Union[FasttextEmbedder, GloVeEmbedder], + vocab: SimpleVocabulary, + character_level: bool = False, + emb_dim: int = None, + estimate_by_n: int = 10000, + *args, + **kwargs) -> None: if emb_dim is None: emb_dim = embedder.dim self.emb_mat = np.zeros([len(vocab), emb_dim], dtype=np.float32) @@ -38,17 +88,9 @@ def __init__(self, embedder, vocab, character_level=False, emb_dim=None, estimat self.emb_mat[n] = pca(embedder([[token]])[0])[0] else: self.emb_mat[n] = embedder([[token]])[0][0] - except KeyError: self.emb_mat[n] = np.random.randn(emb_dim) * emb_std @property def dim(self): return self.emb_mat.shape[1] - - -@register('random_emb_mat') -class RandomEmbeddingsMatrix: - """Assembles matrix of random embeddings.""" - def __init__(self, vocab_len, emb_dim, *args, **kwargs): - self.emb_mat = np.random.randn(vocab_len, emb_dim).astype(np.float32) / np.sqrt(emb_dim) diff --git a/deeppavlov/models/preprocessors/capitalization.py b/deeppavlov/models/preprocessors/capitalization.py index 8d4f0eada0..9ddc94ccb6 100644 --- a/deeppavlov/models/preprocessors/capitalization.py +++ b/deeppavlov/models/preprocessors/capitalization.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np + import re +from typing import Union, Tuple, List + +import numpy as np from deeppavlov.core.models.component import Component from deeppavlov.core.data.utils import zero_pad @@ -23,13 +26,20 @@ @register('capitalization_featurizer') class CapitalizationPreprocessor(Component): """ - Featurizer useful for NER task. It detects following patterns: + Featurizer useful for NER task. It detects following patterns in the words: - no capitals - single capital single character - single capital multiple characters - all capitals multiple characters + + Args: + pad_zeros: whether to pad capitalization features batch with zeros up + to maximal length or not. + + Attributes: + dim: dimensionality of the feature vectors, produced by the featurizer """ - def __init__(self, pad_zeros=True, *args, **kwargs): + def __init__(self, pad_zeros: bool = True, *args, **kwargs) -> None: self.pad_zeros = pad_zeros self._num_of_features = 4 @@ -63,7 +73,20 @@ def __call__(self, tokens_batch, **kwargs): return cap_batch -def process_word(word, to_lower=False, append_case=None): +def process_word(word: str, to_lower: bool = False, + append_case: str = None) -> Tuple[str]: + """Converts word to a tuple of symbols, optionally converts it to lowercase + and adds capitalization label. + + Args: + word: input word + to_lower: whether to lowercase + append_case: whether to add case mark + ('' for first capital and '' for all caps) + + Returns: + a preprocessed word + """ if all(x.isupper() for x in word) and len(word) > 1: uppercase = "" elif word[0].isupper(): @@ -88,15 +111,18 @@ def process_word(word, to_lower=False, append_case=None): @register('lowercase_preprocessor') class LowercasePreprocessor(Component): + """A callable wrapper over :func:`process_word`. + Takes as input a batch of sentences and returns a batch of preprocessed sentences. 
+ """ - def __init__(self, to_lower=True, append_case="first", *args, **kwargs): + def __init__(self, to_lower: bool = True, append_case: str = "first", *args, **kwargs): self.to_lower = to_lower self.append_case = append_case - def __call__(self, tokens_batch, **kwargs): + def __call__(self, tokens_batch: List[Union[List[str], str]], **kwargs) -> List[List[Tuple[str]]]: answer = [] for elem in tokens_batch: if isinstance(elem, str): elem = [x for x in re.split("(\w+|[,.])", elem) if x.strip() != ""] answer.append([process_word(x, self.to_lower, self.append_case) for x in elem]) - return answer + return answer \ No newline at end of file diff --git a/deeppavlov/models/preprocessors/dirty_comments_preprocessor.py b/deeppavlov/models/preprocessors/dirty_comments_preprocessor.py index f6d02131d8..93aa58267c 100644 --- a/deeppavlov/models/preprocessors/dirty_comments_preprocessor.py +++ b/deeppavlov/models/preprocessors/dirty_comments_preprocessor.py @@ -69,7 +69,7 @@ def __call__(self, batch: List[str], **kwargs) -> List[str]: f = [re.sub('\?!+', ' ?! ', x) for x in f] f = [re.sub('\.\.+', '..', x) for x in f] - f = [re.sub(" [*$%&#@][*$%&#@]+", " xexp ", x) for x in f] + f = [re.sub("[*$%&#@()]", " ", x) for x in f] f = [re.sub(" [0-9]+ ", " DD ", x) for x in f] f = [re.sub("<\S*>", "", x) for x in f] f = [re.sub('\s+', ' ', x) for x in f] diff --git a/deeppavlov/models/preprocessors/odqa_preprocessors.py b/deeppavlov/models/preprocessors/odqa_preprocessors.py new file mode 100644 index 0000000000..7e442543dd --- /dev/null +++ b/deeppavlov/models/preprocessors/odqa_preprocessors.py @@ -0,0 +1,126 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Callable, Union +from itertools import chain + +from nltk import sent_tokenize + +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component + +logger = get_logger(__name__) + + +@register('document_chunker') +class DocumentChunker(Component): + """ Make chunks from a document or a list of documents. Don't tear up sentences if needed. 
+ + Args: + sentencize_fn: a function for sentence segmentation + keep_sentences: whether to tear up sentences between chunks or not + tokens_limit: a number of tokens in a single chunk (usually this number corresponds to the squad model limit) + flatten_result: whether to flatten the resulting list of lists of chunks + + Attributes: + keep_sentences: whether to tear up sentences between chunks or not + tokens_limit: a number of tokens in a single chunk + flatten_result: whether to flatten the resulting list of lists of chunks + + """ + + def __init__(self, sentencize_fn: Callable = sent_tokenize, keep_sentences: bool = True, + tokens_limit: int = 400, flatten_result: bool = False, *args, **kwargs): + self._sentencize_fn = sentencize_fn + self.keep_sentences = keep_sentences + self.tokens_limit = tokens_limit + self.flatten_result = flatten_result + + def __call__(self, batch_docs: List[Union[str, List[str]]]) -> List[Union[List[str], List[List[str]]]]: + """ Make chunks from a batch of documents. There can be several documents in each batch. + + Args: + batch_docs: a batch of documents / a batch of lists of documents + + Returns: + chunks of docs, flattened or not + + """ + + result = [] + + for docs in batch_docs: + batch_chunks = [] + if isinstance(docs, str): + docs = [docs] + for doc in docs: + doc_chunks = [] + if self.keep_sentences: + sentences = sent_tokenize(doc) + n_tokens = 0 + keep = [] + for s in sentences: + n_tokens += len(s.split()) + if n_tokens > self.tokens_limit: + if keep: + doc_chunks.append(' '.join(keep)) + n_tokens = 0 + keep.clear() + keep.append(s) + if keep: + doc_chunks.append(' '.join(keep)) + batch_chunks.append(doc_chunks) + else: + split_doc = doc.split() + doc_chunks = [split_doc[i:i + self.tokens_limit] for i in + range(0, len(split_doc), self.tokens_limit)] + batch_chunks.append(doc_chunks) + result.append(batch_chunks) + + if self.flatten_result: + if isinstance(result[0][0], list): + for i in range(len(result)): + flattened = list(chain.from_iterable(result[i])) + result[i] = flattened + + return result + + +@register('string_multiplier') +class StringMultiplier(Component): + """Make a list of strings from a provided string. A length of the resulting list equals a length + of a provided reference argument. + + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, batch_s: List[str], ref: List[str]) -> List[List[str]]: + """ Multiply each string in a provided batch of strings. + + Args: + batch_s: a batch of strings to be multiplied + ref: a reference to obtain a length of the resulting list + + Returns: + a multiplied s as list + + """ + res = [] + for s, r in zip(batch_s, ref): + res.append([s]*len(r)) + + return res diff --git a/deeppavlov/models/preprocessors/random_embeddings_matrix.py b/deeppavlov/models/preprocessors/random_embeddings_matrix.py new file mode 100644 index 0000000000..1fae19b372 --- /dev/null +++ b/deeppavlov/models/preprocessors/random_embeddings_matrix.py @@ -0,0 +1,36 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
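A quick sanity check of the new `DocumentChunker` on a toy document (requires the NLTK `punkt` data for `sent_tokenize`; the limit of 8 tokens is chosen only to force a split):

```python
from deeppavlov.models.preprocessors.odqa_preprocessors import DocumentChunker

chunker = DocumentChunker(keep_sentences=True, tokens_limit=8, flatten_result=True)
batch_docs = [["First sentence here. Second sentence is a bit longer. Third one."]]

print(chunker(batch_docs))
# [['First sentence here.', 'Second sentence is a bit longer. Third one.']]
```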
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from deeppavlov.core.common.registry import register + + +@register('random_emb_mat') +class RandomEmbeddingsMatrix: + """Assembles matrix of random embeddings. + + Args: + vocab_len: length of the vocabulary (number of tokens in it) + emb_dim: dimensionality of the embeddings + + Attributes: + dim: dimensionality of the embeddings + """ + def __init__(self, vocab_len: int, emb_dim: int, *args, **kwargs) -> None: + self.emb_mat = np.random.randn(vocab_len, emb_dim).astype(np.float32) / np.sqrt(emb_dim) + + @property + def dim(self): + return self.emb_mat.shape[1] diff --git a/deeppavlov/models/preprocessors/sanitizer.py b/deeppavlov/models/preprocessors/sanitizer.py index 6ffc7fd0cf..d4416cabe2 100644 --- a/deeppavlov/models/preprocessors/sanitizer.py +++ b/deeppavlov/models/preprocessors/sanitizer.py @@ -22,8 +22,17 @@ @register('sanitizer') class Sanitizer(Component): - """Remove all combining characters like diacritical marks from tokens""" - def __init__(self, diacritical=True, nums=False, *args, **kwargs): + """Remove all combining characters like diacritical marks from tokens + + Args: + diacritical: whether to remove diacritical signs or not + diacritical signs are something like hats and stress marks + nums: whether to replace all digits with 1 or not + """ + def __init__(self, + diacritical: bool = True, + nums: bool = False, + *args, **kwargs) -> None: self.diacritical = diacritical self.nums = nums self.combining_characters = dict.fromkeys([c for c in range(sys.maxunicode) diff --git a/deeppavlov/models/preprocessors/squad_preprocessor.py b/deeppavlov/models/preprocessors/squad_preprocessor.py index c8647f211f..24e6aa6af2 100644 --- a/deeppavlov/models/preprocessors/squad_preprocessor.py +++ b/deeppavlov/models/preprocessors/squad_preprocessor.py @@ -73,7 +73,6 @@ def __call__(self, contexts_raw: Tuple[str, ...], questions_raw: Tuple[str, ...] questions_tokens: batch of tokenized questions questions_chars: batch of tokenized and split on chars questions spans: batch of mapping tokens to position in context - """ contexts = [] contexts_tokens = [] diff --git a/deeppavlov/models/ranking/logit_ranker.py b/deeppavlov/models/ranking/logit_ranker.py new file mode 100644 index 0000000000..29ae867ffc --- /dev/null +++ b/deeppavlov/models/ranking/logit_ranker.py @@ -0,0 +1,60 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from operator import itemgetter + +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.models.estimator import Component + +logger = get_logger(__name__) + + +@register("logit_ranker") +class LogitRanker(Component): + """Select best answer using squad model logits. 
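The relocated `random_emb_mat` component is self-contained, so a two-line check shows its contract (the sizes are arbitrary):

```python
from deeppavlov.models.preprocessors.random_embeddings_matrix import RandomEmbeddingsMatrix

emb = RandomEmbeddingsMatrix(vocab_len=1000, emb_dim=100)
print(emb.emb_mat.shape, emb.dim)    # (1000, 100) 100
```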
Make several batches for a single batch, send each batch + to the squad model separately and get a single best answer for each batch. + + Args: + squad_model: a loaded squad model + + Attributes: + squad_model: a loaded squad model + + """ + + def __init__(self, squad_model, **kwargs): + self.squad_model = squad_model + + def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[str]]) -> List[str]: + """ + Sort obtained results from squad reader by logits and get the answer with a maximum logit. + + Args: + contexts_batch: a batch of contexts which should be treated as a single batch in the outer JSON config + questions_batch: a batch of questions which should be treated as a single batch in the outer JSON config + + Returns: + a batch of best answers + + """ + + batch_best_answers = [] + for contexts, questions in zip(contexts_batch, questions_batch): + results = zip(*self.squad_model(contexts, questions)) + best_answer = sorted(results, key=itemgetter(2), reverse=True)[0][0] + batch_best_answers.append(best_answer) + + return batch_best_answers diff --git a/deeppavlov/models/ranking/tfidf_ranker.py b/deeppavlov/models/ranking/tfidf_ranker.py index 98c3a314c5..ecb4bba970 100644 --- a/deeppavlov/models/ranking/tfidf_ranker.py +++ b/deeppavlov/models/ranking/tfidf_ranker.py @@ -12,21 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Dict, Any, Tuple +from typing import List, Any, Tuple import numpy as np -from deeppavlov.core.common.registry import register from deeppavlov.core.common.log import get_logger -from deeppavlov.core.models.estimator import Estimator +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.estimator import Component from deeppavlov.models.vectorizers.hashing_tfidf_vectorizer import HashingTfIdfVectorizer -from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator logger = get_logger(__name__) @register("tfidf_ranker") -class TfidfRanker(Estimator): +class TfidfRanker(Component): """Rank documents according to input strings. 
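`LogitRanker.__call__` only relies on the wrapped model returning per-context tuples whose third element is a logit, so its behaviour can be illustrated with a stub in place of a real squad reader (everything below is a toy stand-in):

```python
from deeppavlov.models.ranking.logit_ranker import LogitRanker

def fake_squad_model(contexts, questions):
    # Stub imitating the squad component's (answers, positions, logits) output.
    answers = ["42", "a blue whale"]
    positions = [0, 5]
    logits = [0.3, 2.7]
    return answers, positions, logits

ranker = LogitRanker(fake_squad_model)
print(ranker([["context one", "context two"]],
             [["what is the answer?", "what is the answer?"]]))
# ['a blue whale']  -- the answer with the highest logit
```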
Args: @@ -39,56 +38,17 @@ class TfidfRanker(Estimator): top_n: a number of doc ids to return vectorizer: an instance of vectorizer class active: whether to return a number specified by :attr:`top_n` or all ids - tfidf_matrix: a loaded tfidf matrix - ngram_range: ngram range used when tfidf matrix was created - hash_size: hash size of the tfidf matrix - term_freqs: a dictionary with tfidf terms and their frequences - doc_index: a dictionary of doc ids and corresponding doc titles index2doc: inverted :attr:`doc_index` iterator: a dataset iterator used for generating batches while fitting the vectorizer """ - def get_main_component(self) -> 'TfidfRanker': - """Temporary stub to run REST API - - Returns: - self - """ - return self - def __init__(self, vectorizer: HashingTfIdfVectorizer, top_n=5, active: bool = True, **kwargs): self.top_n = top_n self.vectorizer = vectorizer self.active = active - if kwargs['mode'] != 'train': - if self.vectorizer.load_path.exists(): - self.tfidf_matrix, opts = self.vectorizer.load() - self.ngram_range = opts['ngram_range'] - self.hash_size = opts['hash_size'] - self.term_freqs = opts['term_freqs'].squeeze() - self.doc_index = opts['doc_index'] - - self.vectorizer.doc_index = self.doc_index - self.vectorizer.term_freqs = self.term_freqs - self.vectorizer.hash_size = self.hash_size - - self.index2doc = self.get_index2doc() - else: - self.iterator = None - logger.warning("TfidfRanker load_path doesn't exist, is waiting for training.") - - def get_index2doc(self) -> Dict[Any, int]: - """Invert doc_index. - - Returns: - inverted doc_index dict - - """ - return dict(zip(self.doc_index.values(), self.doc_index.keys())) - def __call__(self, questions: List[str]) -> Tuple[List[Any], List[float]]: """Rank documents and return top n document titles with scores. @@ -104,14 +64,14 @@ def __call__(self, questions: List[str]) -> Tuple[List[Any], List[float]]: q_tfidfs = self.vectorizer(questions) for q_tfidf in q_tfidfs: - scores = q_tfidf * self.tfidf_matrix + scores = q_tfidf * self.vectorizer.tfidf_matrix scores = np.squeeze( scores.toarray() + 0.0001) # add a small value to eliminate zero scores if self.active: thresh = self.top_n else: - thresh = len(self.doc_index) + thresh = len(self.vectorizer.doc_index) if thresh >= len(scores): o = np.argpartition(-scores, len(scores) - 1)[0:thresh] @@ -120,50 +80,8 @@ def __call__(self, questions: List[str]) -> Tuple[List[Any], List[float]]: o_sort = o[np.argsort(-scores[o])] doc_scores = scores[o_sort] - doc_ids = [self.index2doc[i] for i in o_sort] + doc_ids = [self.vectorizer.index2doc[i] for i in o_sort] batch_doc_ids.append(doc_ids) batch_docs_scores.append(doc_scores) return batch_doc_ids, batch_docs_scores - - def fit_batches(self, iterator: DataFittingIterator, batch_size: int) -> None: - """Generate a batch to be fit to a vectorizer. - - Args: - iterator: an instance of an iterator class - batch_size: a size of a generated batch - - Returns: - None - - """ - self.vectorizer.doc_index = iterator.doc2index - for x, y in iterator.gen_batches(batch_size): - self.vectorizer.fit_batch(x, y) - - def fit(self) -> None: - """Pass method to :class:`Chainer`. - - Returns: - None - - """ - pass - - def save(self) -> None: - """Pass method to :attr:`vectorizer`. - - Returns: - None - - """ - self.vectorizer.save() - - def load(self) -> None: - """Pass method to :attr:`vectorizer`. 
- - Returns: - None - - """ - self.vectorizer.load() diff --git a/deeppavlov/models/seq2seq_go_bot/bot.py b/deeppavlov/models/seq2seq_go_bot/bot.py index 216b71e11d..573323d130 100644 --- a/deeppavlov/models/seq2seq_go_bot/bot.py +++ b/deeppavlov/models/seq2seq_go_bot/bot.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import itertools +import numpy as np +from typing import Dict +# import itertools from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.core.models.nn_model import NNModel from deeppavlov.core.common.log import get_logger +from deeppavlov.models.seq2seq_go_bot.network import Seq2SeqGoalOrientedBotNetwork log = get_logger(__name__) @@ -26,41 +29,83 @@ @register("seq2seq_go_bot") class Seq2SeqGoalOrientedBot(NNModel): """ - A goal-oriented bot based on a sequence-to-sequence rnn. For implementation details see :class:`~deeppavlov.models.seq2seq_go_bot.network.Seq2SeqGoalOrientedBotNetwork`. Pretrained for :class:`~deeppavlov.dataset_readers.kvret_reader.KvretDatasetReader` dataset. + A goal-oriented bot based on a sequence-to-sequence rnn. For implementation details see + :class:`~deeppavlov.models.seq2seq_go_bot.network.Seq2SeqGoalOrientedBotNetwork`. + Pretrained for :class:`~deeppavlov.dataset_readers.kvret_reader.KvretDatasetReader` dataset. Parameters: - network: object of :class:`~deeppavlov.models.seq2seq_go_bot.network.Seq2SeqGoalOrientedBotNetwork` class. + network_parameters: parameters passed to object of + :class:`~deeppavlov.models.seq2seq_go_bot.network.Seq2SeqGoalOrientedBotNetwork` class. + embedder: word embeddings model, see + :doc:`deeppavlov.models.embedders `. source_vocab: vocabulary of input tokens. target_vocab: vocabulary of bot response tokens. start_of_sequence_token: token that defines start of input sequence. - end_of_sequence_token: token that defines end of input sequence and start of output sequence. + end_of_sequence_token: token that defines end of input sequence and start of + output sequence. debug: whether to display debug output. - **kwargs: parameters passed to parent :class:`~deeppavlov.core.models.nn_model.NNModel` class. + **kwargs: parameters passed to parent + :class:`~deeppavlov.core.models.nn_model.NNModel` class. 
""" def __init__(self, - network: Component, + network_parameters: Dict, + embedder: Component, source_vocab: Component, target_vocab: Component, start_of_sequence_token: str, end_of_sequence_token: str, + knowledge_base_keys, debug: bool = False, save_path: str = None, **kwargs) -> None: super().__init__(save_path=save_path, **kwargs) - self.network = network + self.embedder = embedder + self.embedding_size = embedder.dim self.src_vocab = source_vocab self.tgt_vocab = target_vocab + self.tgt_vocab_size = len(target_vocab) + self.kb_keys = knowledge_base_keys + self.kb_size = len(self.kb_keys) self.sos_token = start_of_sequence_token self.eos_token = end_of_sequence_token self.debug = debug - def train_on_batch(self, *batch): - b_enc_ins, b_src_lens = [], [] - b_dec_ins, b_dec_outs, b_tgt_lens, b_tgt_weights = [], [], [], [] - for x_tokens, dialog_id, y_tokens in zip(*batch): + self.network = self._init_network(network_parameters) + + def _init_network(self, params): + if 'target_start_of_sequence_index' not in params: + params['target_start_of_sequence_index'] = self.tgt_vocab[self.sos_token] + if 'target_end_of_sequence_index' not in params: + params['target_end_of_sequence_index'] = self.tgt_vocab[self.eos_token] + if 'source_vocab_size' not in params: + params['source_vocab_size'] = len(self.src_vocab) + if 'target_vocab_size' not in params: + params['target_vocab_soze'] = len(self.tgt_vocab) + # contruct matrix of knowledge bases values embeddings + params['knowledge_base_entry_embeddings'] = \ + [self._embed_kb_key(val) for val in self.kb_keys] + # contrcust matrix of decoder input token embeddings (zeros for sos_token) + dec_embs = self.embedder([[self.tgt_vocab[idx] + for idx in range(self.tgt_vocab_size)]])[0] + dec_embs[self.tgt_vocab[self.sos_token]][:] = 0. 
+ params['decoder_embeddings'] = dec_embs + return Seq2SeqGoalOrientedBotNetwork(**params) + + def _embed_kb_key(self, key): +# TODO: fasttext embedder to work with tokens + emb = np.array(self.embedder([key.split('_')], mean=True)[0]) + if self.debug: + log.debug("embedding key tokens='{}', embedding shape = {}" + .format(key.split('_'), emb.shape)) + return emb - enc_in = self._encode_context(x_tokens, dialog_id) + def train_on_batch(self, utters, history_list, kb_entry_list, responses): + b_enc_ins, b_src_lens = [], [] + b_dec_ins, b_dec_outs, b_tgt_lens = [], [], [] + for x_tokens, history, y_tokens in zip(utters, history_list, responses): + x_tokens = history + x_tokens + enc_in = self._encode_context(x_tokens) b_enc_ins.append(enc_in) b_src_lens.append(len(enc_in)) @@ -68,62 +113,109 @@ def train_on_batch(self, *batch): b_dec_ins.append(dec_in) b_dec_outs.append(dec_out) b_tgt_lens.append(len(dec_out)) - b_tgt_weights.append([1] * len(dec_out)) # Sequence padding + batch_size = len(b_enc_ins) max_src_len = max(b_src_lens) max_tgt_len = max(b_tgt_lens) - for i, (src_len, tgt_len) in enumerate(zip(b_src_lens, b_tgt_lens)): - src_padd_len = max_src_len - src_len - tgt_padd_len = max_tgt_len - tgt_len - b_enc_ins[i].extend([self.src_vocab[self.sos_token]] * src_padd_len) - b_dec_ins[i].extend([self.tgt_vocab[self.eos_token]] * tgt_padd_len) - b_dec_outs[i].extend([self.tgt_vocab[self.eos_token]] * tgt_padd_len) - b_tgt_weights[i].extend([0] * tgt_padd_len) - - self.network.train_on_batch(b_enc_ins, b_dec_ins, b_dec_outs, - b_src_lens, b_tgt_lens, b_tgt_weights) - - def _encode_context(self, tokens, dialog_id=None): + # b_enc_ins_np = self.src_vocab[self.sos_token] *\ + # np.ones((batch_size, max_src_len), dtype=np.float32) + b_enc_ins_np = np.zeros((batch_size, max_src_len, self.embedding_size), + dtype=np.float32) + b_dec_ins_np = self.tgt_vocab[self.eos_token] *\ + np.ones((batch_size, max_tgt_len), dtype=np.float32) + b_dec_outs_np = self.tgt_vocab[self.eos_token] *\ + np.ones((batch_size, max_tgt_len), dtype=np.float32) + b_tgt_weights_np = np.zeros((batch_size, max_tgt_len), dtype=np.float32) + b_kb_masks_np = np.zeros((batch_size, self.kb_size), np.float32) + for i, (src_len, tgt_len, kb_entries) in \ + enumerate(zip(b_src_lens, b_tgt_lens, kb_entry_list)): + b_enc_ins_np[i, :src_len] = b_enc_ins[i] + b_dec_ins_np[i, :tgt_len] = b_dec_ins[i] + b_dec_outs_np[i, :tgt_len] = b_dec_outs[i] + b_tgt_weights_np[i, :tgt_len] = 1. + if self.debug: + if len(kb_entries) != len(set([e[0] for e in kb_entries])): + log.debug("Duplicates in kb_entries = {}".format(kb_entries)) + for k, v in kb_entries: + b_kb_masks_np[i, self.kb_keys.index(k)] = 1. 
+ + """if self.debug: + log.debug("b_enc_ins = {}".format(b_enc_ins)) + log.debug("b_dec_ins = {}".format(b_dec_ins)) + log.debug("b_dec_outs = {}".format(b_dec_outs)) + log.debug("b_src_lens = {}".format(b_src_lens)) + log.debug("b_tgt_lens = {}".format(b_tgt_lens)) + log.debug("b_tgt_weights = {}".format(b_tgt_weights))""" + + self.network.train_on_batch(b_enc_ins_np, b_dec_ins_np, b_dec_outs_np, + b_src_lens, b_tgt_lens, b_tgt_weights_np, + b_kb_masks_np) + + def _encode_context(self, tokens): if self.debug: log.debug("Context tokens = \"{}\"".format(tokens)) - token_idxs = self.src_vocab(tokens) - return token_idxs + # token_idxs = self.src_vocab([tokens])[0] + # return token_idxs + return np.array(self.embedder([tokens])[0]) def _encode_response(self, tokens): if self.debug: log.debug("Response tokens = \"{}\"".format(tokens)) - token_idxs = self.tgt_vocab(tokens) + token_idxs = [] + for token in tokens: + if token in self.kb_keys: + token_idxs.append(self.tgt_vocab_size + self.kb_keys.index(token)) + else: + token_idxs.append(self.tgt_vocab[token]) + # token_idxs = self.tgt_vocab([tokens])[0] return ([self.tgt_vocab[self.sos_token]] + token_idxs, token_idxs + [self.tgt_vocab[self.eos_token]]) + def _decode_response(self, token_idxs): + def _idx2token(idxs): + for idx in idxs: + if idx < self.tgt_vocab_size: + token = self.tgt_vocab([[idx]])[0][0] + if token == self.eos_token: + break + yield token + else: + yield self.kb_keys[idx - self.tgt_vocab_size] + return [list(_idx2token(utter_idxs)) for utter_idxs in token_idxs] + def __call__(self, *batch): return self._infer_on_batch(*batch) - def _infer_on_batch(self, utters, dialog_ids=itertools.repeat(None)): - def _filter(tokens): - for t in tokens: - if t == self.eos_token: - break - yield t -# TODO: history as input + # def _infer_on_batch(self, utters, kb_entry_list=itertools.repeat([])): + def _infer_on_batch(self, utters, history_list, kb_entry_list): b_enc_ins, b_src_lens = [], [] if (len(utters) == 1) and not utters[0]: utters = [['hi']] - for utter, dialog_id in zip(utters, dialog_ids): - enc_in = self._encode_context(utter, dialog_id) + for utter, history in zip(utters, history_list): + utter = history + utter + enc_in = self._encode_context(utter) + b_enc_ins.append(enc_in) b_src_lens.append(len(enc_in)) # Sequence padding + batch_size = len(b_enc_ins) max_src_len = max(b_src_lens) - for i, src_len in enumerate(b_src_lens): - src_padd_len = max_src_len - src_len - b_enc_ins[i].extend([self.src_vocab[self.eos_token]] * src_padd_len) - - pred_idxs = self.network(b_enc_ins, b_src_lens) - preds = [list(_filter(self.tgt_vocab(utter_idxs))) - for utter_idxs in pred_idxs] + # b_enc_ins_np = self.src_vocab[self.sos_token] * \ + # p.ones((batch_size, max_src_len), dtype=np.float32) + b_enc_ins_np = np.zeros((batch_size, max_src_len, self.embedding_size), + dtype=np.float32) + b_kb_masks_np = np.zeros((batch_size, self.kb_size), dtype=np.float32) + for i, (src_len, kb_entries) in enumerate(zip(b_src_lens, kb_entry_list)): + b_enc_ins_np[i, :src_len] = b_enc_ins[i] + if self.debug: + log.debug("infer: kb_entries = {}".format(kb_entries)) + for k, v in kb_entries: + b_kb_masks_np[i, self.kb_keys.index(k)] = 1. 
+ + pred_idxs = self.network(b_enc_ins_np, b_src_lens, b_kb_masks_np) + preds = self._decode_response(pred_idxs) if self.debug: log.debug("Dialog prediction = \"{}\"".format(preds[-1])) return preds diff --git a/deeppavlov/models/seq2seq_go_bot/dialog_state.py b/deeppavlov/models/seq2seq_go_bot/dialog_state.py new file mode 100644 index 0000000000..49f8f0e463 --- /dev/null +++ b/deeppavlov/models/seq2seq_go_bot/dialog_state.py @@ -0,0 +1,33 @@ +""" +Copyright 2017 Neural Networks and Deep Learning lab, MIPT + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component + + +@register("dialog_state") +class DialogState(Component): + def __init__(self, *args, **kwargs): + self.states = {} + + def __call__(self, user_ids, utterances=None, *args, **kwargs): + if utterances is None: + return [self.states.get(u, []) for u in user_ids] + + for user, utter in zip(user_ids, utterances): + self.states[user] = self.states.get(user, []) + utter + return + diff --git a/deeppavlov/models/seq2seq_go_bot/kb.py b/deeppavlov/models/seq2seq_go_bot/kb.py index 4f9c2e46c3..1fe587b4d6 100644 --- a/deeppavlov/models/seq2seq_go_bot/kb.py +++ b/deeppavlov/models/seq2seq_go_bot/kb.py @@ -15,7 +15,7 @@ import itertools import json import re -from typing import Callable, List, Dict +from typing import Callable, List, Tuple from collections import defaultdict from deeppavlov.core.common.registry import register @@ -30,17 +30,18 @@ @register("knowledge_base") class KnowledgeBase(Estimator): """ - A custom dictionary that encodes knowledge facts from :class:`~deeppavlov.dataset_readers.kvret_reader.KvretDatasetReader` data. + A custom dictionary that encodes knowledge facts from + :class:`~deeppavlov.dataset_readers.kvret_reader.KvretDatasetReader` data. Example: .. code:: python >>> from models.seq2seq_go_bot.kb import KnowledgeBase >>> kb = KnowledgeBase(save_path="kb.json", load_path="kb.json") - >>> kb.fit(['person1'], [['name', 'hair', 'eyes']], [[{'name': 'Sasha', 'hair': 'long dark', 'eyes': 'light blue '}]]) + >>> kb.fit(['person1'], [['name', 'hair', 'eyes']], [[{'name': 'Sasha', 'hair': 'long dark', 'eyes': 'light blue '}]]) >>> kb(['person1']) - [[('sasha_hair', 'long dark'), ('sasha_eyes', 'light blue ')]] + [[('sasha_name', 'Sasha'), ('sasha_hair', 'long dark'), ('sasha_eyes', 'light blue ')]] >>> kb(['person_that_doesnt_exist']) [[]] @@ -48,8 +49,10 @@ class KnowledgeBase(Estimator): Parameters: save_path: path to save the dictionary with knowledge. load_path: path to load the json with knowledge. - tokenizer: tokenizer used to split entity values into tokens. - **kwargs: parameters passed to parent :class:`~deeppavlov.core.models.estimator.Estimator`. + tokenizer: tokenizer used to split entity values into tokens (inputs batch + of strings and outputs batch of lists of tokens). + **kwargs: parameters passed to parent + :class:`~deeppavlov.core.models.estimator.Estimator`. 
""" def __init__(self, save_path: str, @@ -61,6 +64,7 @@ def __init__(self, *args, **kwargs) self.tokenizer = tokenizer self.kb = defaultdict(lambda: []) + self.primary_keys = [] if self.load_path and self.load_path.is_file(): self.load() @@ -68,27 +72,39 @@ def fit(self, *args): self.reset() self._update(*args) - def _update(self, keys, kb_columns_list, kb_items_list): + def _update(self, keys, kb_columns_list, kb_items_list, update_primary_keys=True): for key, cols, items in zip(keys, kb_columns_list, kb_items_list): if (None not in (key, items, cols)) and (key not in self.kb): - kv_entry_list = (self._key_value_entries(item, cols) + kv_entry_list = (self._key_value_entries(item, cols, + update=update_primary_keys) for item in items) self.kb[key] = list(itertools.chain(*kv_entry_list)) - def _key_value_entries(self, kb_item, kb_columns): - first_key = re.sub('\s+', '_', kb_item[kb_columns[0]].lower().strip()) - for col in kb_columns[1:]: + def _key_value_entries(self, kb_item, kb_columns, update=True): + def _format(s): + return re.sub('\s+', '_', s.lower().strip()) + first_key = _format(kb_item[kb_columns[0]]) + for col in kb_columns: + key = first_key + '_' + _format(col) + if update and (key not in self.primary_keys): + self.primary_keys.append(key) if col in kb_item: - second_key = re.sub('\s+', '_', col.lower().strip()) - key = first_key + '_' + second_key if self.tokenizer is not None: yield (key, self.tokenizer([kb_item[col]])[0]) else: yield (key, kb_item[col]) - def __call__(self, keys: List[str]) -> List[str]: -# TODO: check if during validation kv is updated - return [self.kb[key] for key in keys] + def __call__(self, keys, kb_columns_list=None, kb_items_list=None): + if None not in (kb_columns_list, kb_items_list): + self._update(keys, kb_columns_list, kb_items_list, update_primary_keys=False) + res = [] + for key in keys: + res.append(self.kb[key]) + for k, value in res[-1]: + if k not in self.primary_keys: + raise ValueError("Primary key `{}` is not present in knowledge base" + .format(k)) + return res def __len__(self): return len(self.kb) @@ -98,86 +114,92 @@ def keys(self): def reset(self): self.kb = defaultdict(lambda: []) + self.primary_keys = [] def save(self): log.info("[saving knowledge base to {}]".format(self.save_path)) - with self.save_path.open('wt', encoding='utf8') as fp: - json.dump(self.kb, fp) + json.dump(self.kb, self.save_path.open('wt')) + json.dump(self.primary_keys, self.save_path.with_suffix('.keys.json').open('wt')) def load(self): log.info("[loading knowledge base from {}]".format(self.load_path)) - with self.load_path.open('rt', encoding='utf8') as fp: - self.kb.update(json.load(fp)) + self.kb.update(json.load(self.load_path.open('rt')), primary_keys=False) + self.primary_keys = json.load(self.load_path.with_suffix('.keys.json').open('rt')) @register("knowledge_base_entity_normalizer") class KnowledgeBaseEntityNormalizer(Component): """ - Uses instance of :class:`~deeppavlov.models.seq2seq_go_bot.kb.KnowledgeBase` to normalize or to undo normalization of entities in the input utterance. + Uses instance of :class:`~deeppavlov.models.seq2seq_go_bot.kb.KnowledgeBase` + to normalize or to undo normalization of entities in the input utterance. - To normalize is to substitute all mentions of database entities with their normalized form. + To normalize is to substitute all mentions of database entities with their + normalized form. - To undo normalization is to substitute all mentions of database normalized entities with their original form. 
+ To undo normalization is to substitute all mentions of database normalized entities + with their original form. Example: .. code:: python >>> from models.seq2seq_go_bot.kb import KnowledgeBase - >>> kb = KnowledgeBase(save_path="kb.json", load_path="kb.json") - >>> kb.fit(['person1'], [['name', 'hair', 'eyes']], [[{'name': 'Sasha', 'hair': 'long dark', 'eyes': 'light blue '}]]) + >>> kb = KnowledgeBase(save_path="kb.json", load_path="kb.json", tokenizer=lambda strings: [s.split() for s in strings]) + >>> kb.fit(['person1'], [['name', 'hair', 'eyes']], [[{'name': 'Sasha', 'hair': 'long dark', 'eyes': 'light blue '}]]) >>> kb(['person1']) - [[('sasha_hair', 'long dark'), ('sasha_eyes', 'light blue ')]] + [[('sasha_name', ['Sasha']), ('sasha_hair', ['long', 'dark']), ('sasha_eyes', ['light', 'blue'])]] >>> from models.seq2seq_go_bot.kb import KnowledgeBaseEntityNormalizer - >>> normalizer = KnowledgeBaseEntityNormalizer(kb=kb, denormalize=False) - >>> normalizer(['person1'], [["some", "guy", "with", "long", "dark", "hair", "said", "hi"]]) + >>> normalizer = KnowledgeBaseEntityNormalizer(denormalize=False, remove=False) + >>> normalizer([["some", "guy", "with", "long", "dark", "hair", "said", "hi"]], kb(['person1'])) [['some', 'guy', 'with', 'sasha_hair', 'hair', 'said', 'hi']] - >>> denormalizer = KnowledgeBaseEntityNormalizer(kb=kb, denormalize=True) - >>> denormalizer(['person1'], [['some', 'guy', 'with', 'sasha_hair', 'hair', 'said', 'hi']]) + >>> denormalizer = KnowledgeBaseEntityNormalizer(denormalize=True) + >>> denormalizer([['some', 'guy', 'with', 'sasha_hair', 'hair', 'said', 'hi']], kb(['person1'])) [['some', 'guy', 'with', 'long', 'dark', 'hair', 'said', 'hi']] + >>> remover = KnowledgeBaseEntityNormalizer(denormalize=False, remove=True) + >>> remover([["some", "guy", "with", "long", "dark", "hair", "said", "hi"]], kb(['person1'])) + [['some', 'guy', 'with', 'hair', 'said', 'hi']] + Parameters: - kb: knowledge base of type :class:`~deeppavlov.models.seq2seq_go_bot.KnowledgeBase`. - denormalize: flag indicates whether to normalize or to undo normalization ("denormalize"). - **kwargs: parameters passed to parent :class:`~deeppavlov.core.models.component.Component` class. + denormalize: flag that indicates whether to normalize or to undo normalization + ("denormalize"). + remove: flag that indicates whether to remove entities while normalizing + (``denormalize=False``). Is ignored for ``denormalize=True``. + **kwargs: parameters passed to parent + :class:`~deeppavlov.core.models.component.Component` class. 
""" - def __init__(self, kb: KnowledgeBase, denormalize: bool = False, **kwargs) -> None: - self.kb = kb + def __init__(self, + remove: bool = False, + denormalize: bool = False, + **kwargs): self.denormalize_flag = denormalize + self.remove = remove + + def normalize(self, tokens, entries): + for entity, ent_tokens in sorted(entries, key=lambda e: -len(e[1])): + ent_num_tokens = len(ent_tokens) + if ' '.join(ent_tokens).strip(): + for i in range(len(tokens)): + if tokens[i:i+ent_num_tokens] == ent_tokens: + if self.remove: + tokens = tokens[:i] + tokens[i+ent_num_tokens:] + else: + tokens = tokens[:i] + [entity] + tokens[i+ent_num_tokens:] + return tokens - def normalize(self, key, tokens): - utter = ' '.join(tokens) - for entity, value in self.kb([key])[0]: - # is value is tokens, get string - if isinstance(value, (list, tuple)): - value = ' '.join(value) - if value: - utter = utter.replace(value, entity) - else: - log.debug("Empty value for knowledge base entry with key = {}" - .format(key)) - return utter.split() - - def denormalize(self, key, tokens): - for entity, value in self.kb([key])[0]: - if entity in tokens: - entity_pos = tokens.index(entity) - # if value is string, split to tokens - if isinstance(value, str): - value = value.split() - tokens = tokens[:entity_pos] + value + tokens[entity_pos + 1:] + def denormalize(self, tokens, entries): + for entity, ent_tokens in entries: + while (entity in tokens): + ent_pos = tokens.index(entity) + tokens = tokens[:ent_pos] + ent_tokens + tokens[ent_pos+1:] return tokens def __call__(self, - keys: List[str], - values: List[str], - kb_columns_list: List[List[str]] = None, - kb_items_list: List[List[Dict[str, str]]] = None) -> List[List[str]]: - if None not in (kb_columns_list, kb_items_list): - self.kb._update(keys, kb_columns_list, kb_items_list) + tokens_list: List[List[str]], + entries_list: List[Tuple[str, List[str]]]) -> List[List[str]]: if self.denormalize_flag: - return [self.denormalize(key, val) for key, val in zip(keys, values)] - return [self.normalize(key, val) for key, val in zip(keys, values)] + return [self.denormalize(t, e) for t, e in zip(tokens_list, entries_list)] + return [self.normalize(t, e) for t, e in zip(tokens_list, entries_list)] diff --git a/deeppavlov/models/seq2seq_go_bot/kb_attn_layer.py b/deeppavlov/models/seq2seq_go_bot/kb_attn_layer.py new file mode 100644 index 0000000000..997dd59fa6 --- /dev/null +++ b/deeppavlov/models/seq2seq_go_bot/kb_attn_layer.py @@ -0,0 +1,180 @@ +""" +Copyright 2017 Neural Networks and Deep Learning lab, MIPT + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import tensorflow as tf +from tensorflow.python.layers import base +from tensorflow.python.ops import init_ops +from tensorflow.python.framework import tensor_shape + + +class KBAttention(base.Layer): +# TODO: update class doc + """Densely-connected layer class. + Arguments: + units: Integer or Long, dimensionality of the output space. + activation: Activation function (callable). Set it to None to maintain a + linear activation. 
+ use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + bias_initializer: Initializer function for the bias. + kernel_regularizer: Regularizer function for the weight matrix. + bias_regularizer: Regularizer function for the bias. + activity_regularizer: Regularizer function for the output. + kernel_constraint: An optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require reuse=True in such cases. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + Properties: + units: Python integer, dimensionality of the output space. + activation: Activation function (callable). + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: Initializer instance (or name) for the kernel matrix. + bias_initializer: Initializer instance (or name) for the bias. + kernel_regularizer: Regularizer instance for the kernel matrix (callable) + bias_regularizer: Regularizer instance for the bias (callable). + activity_regularizer: Regularizer instance for the output (callable) + kernel_constraint: Constraint function for the kernel matrix. + bias_constraint: Constraint function for the bias. + kernel: Weight matrix (TensorFlow variable or tensor). + bias: Bias vector, if applicable (TensorFlow variable or tensor). 
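+      Example:
+          A minimal construction sketch (sizes below are illustrative, not taken
+          from a real config):
+
+          .. code:: python
+
+              >>> kb_embeddings = tf.random_normal([10, 16])  # 10 KB entries, 16-dim
+              >>> kb_mask = tf.placeholder(tf.float32, [None, 10])
+              >>> layer = KBAttention(units=50, hidden_sizes=[32, 1],  # last size must be 1: one score per KB entry
+              ...                     kb_inputs=kb_embeddings, kb_mask=kb_mask,
+              ...                     activation=tf.nn.relu, use_bias=False)
+
+          Applied to decoder outputs of shape ``[batch, time, hidden]``, the layer
+          returns scores of shape ``[batch, time, units + number of KB entries]``,
+          i.e. ``[batch, time, 60]`` here.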
+ """ + + def __init__(self, units, hidden_sizes, + kb_inputs, + kb_mask, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=init_ops.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, + **kwargs): + super(KBAttention, self).__init__(trainable=trainable, name=name, + activity_regularizer=activity_regularizer, + *kwargs) + self.units = units + self.hidden_sizes = hidden_sizes + self.kb_inputs = kb_inputs + self.kb_mask = kb_mask + self.kb_input_shape = kb_inputs.get_shape().as_list() + self.dense_name = name or "mlp" + self.dense_params = { + "activation": activation, + "use_bias": use_bias, + "kernel_initializer": kernel_initializer, + "bias_initializer": bias_initializer, + "kernel_regularizer": kernel_regularizer, + "bias_regularizer": bias_regularizer, + "activity_regularizer": activity_regularizer, + "kernel_constraint": kernel_constraint, + "bias_constraint": bias_constraint, + "trainable": trainable, + "dtype": self.kb_inputs.dtype.base_dtype, + "_reuse": reuse + } + # print("KB shape =", self.kb_input_shape) + + def build(self, input_shape): + # if in_shape[:-1] != self.kb_inputs.shape +# TODO: check input shape + # print("in build") + in_shape = input_shape[:1].concatenate(self.kb_input_shape) + in_shape = in_shape[:-1].concatenate(in_shape[-1] + input_shape[-1]) + # print("first in_shape =", in_shape) + self.layers = [] + for i, size in enumerate(self.hidden_sizes): + name = self.dense_name + if name is not None: + name = name + '{:d}'.format(i) + layer = tf.layers.Dense(size, name=name, _scope=name, **self.dense_params) + layer.build(in_shape) + in_shape = layer.compute_output_shape(in_shape) + + self.layers.append(layer) + + # print("input_shape =", input_shape) + # print("last in_shape =", in_shape) + # in_shape = in_shape[:-2].concatenate(in_shape[-2] + input_shape[-1]) + # print("last in_shape =", in_shape) + self.output_layer = tf.layers.Dense(self.units, **self.dense_params) + self.output_layer.build(input_shape) + # print("build = True") + self.built = True + + def call(self, inputs): + # print("in call") +# TODO: check input dtype + + # Tile kb_inputs + kb_inputs = self.kb_inputs + for i in range(inputs.shape.ndims - 1): + kb_inputs = tf.expand_dims(kb_inputs, 0) + kb_inputs = tf.tile(kb_inputs, tf.concat((tf.shape(inputs)[:-1], [1, 1]), 0)) + + # Expand kb_mask + kb_mask = self.kb_mask + for i in range(inputs.shape.ndims - 2): + kb_mask = tf.expand_dims(kb_mask, 1) + kb_mask = tf.expand_dims(kb_mask, -1) + + # Tile inputs + kb_size = tf.shape(self.kb_inputs)[0] + tiling = tf.concat(([1] * (inputs.shape.ndims - 1), [kb_size], [1]), 0) + cell_inputs = tf.tile(tf.expand_dims(inputs, -2), tiling) + + outputs = tf.concat([kb_inputs, cell_inputs], -1) + outputs = tf.multiply(outputs, kb_mask) + for layer in self.layers: + outputs = layer.call(outputs) + # outputs = tf.Print(outputs, [outputs], "KB attention pre-last layer output =") + outputs = tf.squeeze(outputs, [-1]) + # print("inputs shape =", inputs.shape) + # print("outputs shape =", outputs.shape) + outputs = tf.concat([self.output_layer(inputs), outputs], -1) + # print("out of call") + return outputs + + def _compute_output_shape(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape) + input_shape = input_shape.with_rank_at_least(2) + if input_shape[-1].value is None: + raise ValueError( + 'The innermost dimension of 
input_shape must be defined, but saw: %s' % input_shape) + output_shape = input_shape[:-1].concatenate(self.units + self.kb_input_shape[0]) + # print("computed output shape is", output_shape) + return output_shape + + def compute_output_shape(self, input_shape): + return self._compute_output_shape(input_shape) diff --git a/deeppavlov/models/seq2seq_go_bot/network.py b/deeppavlov/models/seq2seq_go_bot/network.py index dde85879a6..5b2b0dc0b2 100644 --- a/deeppavlov/models/seq2seq_go_bot/network.py +++ b/deeppavlov/models/seq2seq_go_bot/network.py @@ -14,11 +14,15 @@ import json import tensorflow as tf +import numpy as np +from typing import List +import math from deeppavlov.core.common.registry import register from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.models.tf_model import TFModel from deeppavlov.core.common.log import get_logger +from deeppavlov.models.seq2seq_go_bot.kb_attn_layer import KBAttention log = get_logger(__name__) @@ -27,21 +31,40 @@ @register("seq2seq_go_bot_nn") class Seq2SeqGoalOrientedBotNetwork(TFModel): """ - The :class:`~deeppavlov.models.seq2seq_go_bot.bot.GoalOrientedBotNetwork` is a recurrent network that encodes user utterance and generates response in a sequence-to-sequence manner. + The :class:`~deeppavlov.models.seq2seq_go_bot.bot.GoalOrientedBotNetwork` + is a recurrent network that encodes user utterance and generates response + in a sequence-to-sequence manner. The network architecture is similar to https://arxiv.org/abs/1705.05414 . Parameters: hidden_size: RNN hidden layer size. - target_start_of_sequence_index: index of a start of sequence token during decoding. - target_end_of_sequence_index: index of an end of sequence token during decoding. source_vocab_size: size of a vocabulary of encoder tokens. target_vocab_size: size of a vocabulary of decoder tokens. - learning_rate: training learning rate. - **kwargs: parameters passed to a parent :class:`~deeppavlov.core.models.tf_model.TFModel` class. + target_start_of_sequence_index: index of a start of sequence token during + decoding. + target_end_of_sequence_index: index of an end of sequence token during decoding. + knowledge_base_entry_embeddings: matrix with embeddings of knowledge base entries, + size is (number of entries, embedding size). + kb_attention_hidden_sizes: list of sizes for attention hidden units. + decoder_embeddings: matrix with embeddings for decoder output tokens, size is + (`target_vocab_size` + number of knowledge base entries, embedding size). + beam_width: width of beam search decoding. + learning_rate: learning rate during training. + end_learning_rate: if set, learning rate starts from ``learning_rate`` value + and decays polynomially to the value of ``end_learning_rate``. + decay_steps: number of steps of learning rate decay. + decay_power: power used to calculate learning rate decay for polynomial strategy. + dropout_rate: probability of weights' dropout. + state_dropout_rate: probability of rnn state dropout. + optimizer: one of tf.train.Optimizer subclasses as a string. + **kwargs: parameters passed to a parent + :class:`~deeppavlov.core.models.tf_model.TFModel` class. 
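+    Example:
+        Illustrative parameter values (placeholders, not tuned recommendations);
+        vocabulary sizes, start/end of sequence indices and the embedding matrices
+        are filled in by the wrapping bot's ``_init_network()``:
+
+        .. code:: python
+
+            >>> network_parameters = {'hidden_size': 128,
+            ...                       'kb_attention_hidden_sizes': [64, 32],
+            ...                       'beam_width': 2,
+            ...                       'learning_rate': 3e-3,
+            ...                       'end_learning_rate': 6e-5,
+            ...                       'decay_steps': 1000,
+            ...                       'decay_power': 0.5,
+            ...                       'dropout_rate': 0.2,
+            ...                       'state_dropout_rate': 0.07,
+            ...                       'optimizer': 'AdamOptimizer'}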
""" - GRAPH_PARAMS = ['source_vocab_size', 'target_vocab_size', 'hidden_size'] + GRAPH_PARAMS = ['knowledge_base_size', 'source_vocab_size', + 'target_vocab_size', 'hidden_size', 'embedding_size', + 'kb_embedding_control_sum', 'kb_attention_hidden_sizes'] def __init__(self, hidden_size: int, @@ -49,8 +72,30 @@ def __init__(self, target_vocab_size: int, target_start_of_sequence_index: int, target_end_of_sequence_index: int, + knowledge_base_entry_embeddings: np.ndarray, + kb_attention_hidden_sizes: List[int], + decoder_embeddings: np.ndarray, learning_rate: float, + beam_width: int = 1, + end_learning_rate: float = None, + decay_steps: int = 1000, + decay_power: float = 1.0, + dropout_rate: float = 0.0, + state_dropout_rate: float = 0.0, + optimizer: str = 'AdamOptimizer', **kwargs) -> None: + end_learning_rate = end_learning_rate or learning_rate + + # initialize knowledge base embeddings + self.kb_embedding = np.array(knowledge_base_entry_embeddings) + log.debug("recieved knowledge_base_entry_embeddings with shape = {}" + .format(self.kb_embedding.shape)) + # initialize decoder embeddings + self.decoder_embedding = np.array(decoder_embeddings) + if self.kb_embedding.shape[1] != self.decoder_embedding.shape[1]: + raise ValueError("decoder embeddings should have the same dimension" + " as knowledge base entries' embeddings") + # specify model options self.opt = { 'hidden_size': hidden_size, @@ -58,20 +103,35 @@ def __init__(self, 'target_vocab_size': target_vocab_size, 'target_start_of_sequence_index': target_start_of_sequence_index, 'target_end_of_sequence_index': target_end_of_sequence_index, - 'learning_rate': learning_rate + 'kb_attention_hidden_sizes': kb_attention_hidden_sizes, + 'kb_embedding_control_sum': float(np.sum(self.kb_embedding)), + 'knowledge_base_size': self.kb_embedding.shape[0], + 'embedding_size': self.kb_embedding.shape[1], + 'learning_rate': learning_rate, + 'beam_width': beam_width, + 'end_learning_rate': end_learning_rate, + 'decay_steps': decay_steps, + 'decay_power': decay_power, + 'dropout_rate': dropout_rate, + 'state_dropout_rate': state_dropout_rate, + 'optimizer': optimizer } - # initialize parameters + + # initialize other parameters self._init_params() # build computational graph self._build_graph() # initialize session self.sess = tf.Session() + # from tensorflow.python import debug as tf_debug + # self.sess = tf_debug.TensorBoardDebugWrapperSession(self.sess, "vimary-pc:7019") + self.global_step = 0 self.sess.run(tf.global_variables_initializer()) super().__init__(**kwargs) - if tf.train.checkpoint_exists(str(self.save_path.resolve())): + if tf.train.checkpoint_exists(str(self.load_path.resolve())): log.info("[initializing `{}` from saved]".format(self.__class__.__name__)) self.load() else: @@ -84,6 +144,23 @@ def _init_params(self): self.tgt_sos_id = self.opt['target_start_of_sequence_index'] self.tgt_eos_id = self.opt['target_end_of_sequence_index'] self.learning_rate = self.opt['learning_rate'] + self.kb_attn_hidden_sizes = self.opt['kb_attention_hidden_sizes'] + self.embedding_size = self.opt['embedding_size'] + self.kb_size = self.opt['knowledge_base_size'] + self.beam_width = self.opt['beam_width'] + self.learning_rate = self.opt['learning_rate'] + self.end_learning_rate = self.opt['end_learning_rate'] + self.dropout_rate = self.opt['dropout_rate'] + self.state_dropout_rate = self.opt['state_dropout_rate'] + self.decay_steps = self.opt['decay_steps'] + self.decay_power = self.opt['decay_power'] + + self._optimizer = None + if hasattr(tf.train, 
self.opt['optimizer']): + self._optimizer = getattr(tf.train, self.opt['optimizer']) + if not issubclass(self._optimizer, tf.train.Optimizer): + raise ConfigError("`optimizer` parameter should be a name of" + " tf.train.Optimizer subclass") def _build_graph(self): @@ -98,130 +175,285 @@ def _build_graph(self): weights=_weights, reduction=tf.losses.Reduction.NONE) # normalize loss by batch_size + _loss_tensor = \ + tf.verify_tensor_all_finite(_loss_tensor, "Non finite values in loss tensor.") self._loss = tf.reduce_sum(_loss_tensor) / tf.cast(self._batch_size, tf.float32) - #self._loss = tf.reduce_mean(_loss_tensor, name='loss') + # self._loss = tf.reduce_mean(_loss_tensor, name='loss') # TODO: tune clip_norm self._train_op = \ - self.get_train_op(self._loss, self.learning_rate, clip_norm=10.) + self.get_train_op(self._loss, + learning_rate=self._learning_rate, + optimizer=self._optimizer, + clip_norm=2.) + # log.info("Trainable variables") + # for v in tf.trainable_variables(): + # log.info(v) + # self.print_number_of_parameters() def _add_placeholders(self): + self._dropout_keep_prob = tf.placeholder_with_default( + 1.0, shape=[], name='dropout_keep_prob') + self._state_dropout_keep_prob = tf.placeholder_with_default( + 1.0, shape=[], name='state_dropout_keep_prob') + self._learning_rate = tf.placeholder(tf.float32, + shape=[], + name='learning_rate') # _encoder_inputs: [batch_size, max_input_time] - self._encoder_inputs = tf.placeholder(tf.int32, - [None, None], + # _encoder_inputs: [batch_size, max_input_time, embedding_size] + self._encoder_inputs = tf.placeholder(tf.float32, + [None, None, self.embedding_size], name='encoder_inputs') self._batch_size = tf.shape(self._encoder_inputs)[0] # _decoder_inputs: [batch_size, max_output_time] - self._decoder_inputs = tf.placeholder(tf.int32, + self._decoder_inputs = tf.placeholder(tf.int32, [None, None], name='decoder_inputs') + # _decoder_embedding: [tgt_vocab_size + kb_size, embedding_size] + self._decoder_embedding = \ + tf.get_variable("decoder_embedding", + shape=(self.tgt_vocab_size + self.kb_size, + self.embedding_size), + dtype=tf.float32, + initializer=tf.constant_initializer(self.decoder_embedding), + trainable=False) # _decoder_outputs: [batch_size, max_output_time] - self._decoder_outputs = tf.placeholder(tf.int32, + self._decoder_outputs = tf.placeholder(tf.int32, [None, None], - name='decoder_inputs') -#TODO: compute sequence lengths on the go + name='decoder_outputs') + # _kb_embedding: [kb_size, embedding_size] +# TODO: try training embeddings + kb_W = np.array(self.kb_embedding)[:, :self.embedding_size] + self._kb_embedding = tf.get_variable("kb_embedding", + shape=(kb_W.shape[0], kb_W.shape[1]), + dtype=tf.float32, + initializer=tf.constant_initializer(kb_W), + trainable=True) + # _kb_mask: [batch_size, kb_size] + self._kb_mask = tf.placeholder(tf.float32, [None, None], name='kb_mask') + +# TODO: compute sequence lengths on the go # _src_sequence_lengths, _tgt_sequence_lengths: [batch_size] self._src_sequence_lengths = tf.placeholder(tf.int32, - [None], - name='input_sequence_lengths') + [None], + name='input_sequence_lengths') self._tgt_sequence_lengths = tf.placeholder(tf.int32, - [None], - name='output_sequence_lengths') + [None], + name='output_sequence_lengths') # _tgt_weights: [batch_size, max_output_time] self._tgt_weights = tf.placeholder(tf.int32, [None, None], name='target_weights') def _build_body(self): -#TODO: try learning embeddings - # Encoder embedding - #_encoder_embedding = tf.get_variable( - # 
"encoder_embedding", [self.src_vocab_size, self.embedding_size]) - #_encoder_emb_inp = tf.nn.embedding_lookup(_encoder_embedding, - # self._encoder_inputs) - _encoder_emb_inp = tf.one_hot(self._encoder_inputs, self.src_vocab_size) - - # Decoder embedding - #_decoder_embedding = tf.get_variable( - # "decoder_embedding", [self.tgt_vocab_size, self.embedding_size]) - #_decoder_emb_inp = tf.nn.embedding_lookup(_decoder_embedding, - # self._decoder_inputs) - _decoder_emb_inp = tf.one_hot(self._decoder_inputs, self.tgt_vocab_size) + self._build_encoder() + self._build_decoder() + return self._logits, self._predictions + def _build_encoder(self): with tf.variable_scope("Encoder"): - _encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size) + # Encoder embedding + # _encoder_embedding = tf.get_variable( + # "encoder_embedding", [self.src_vocab_size, self.embedding_size]) + # _encoder_emb_inp = tf.nn.embedding_lookup(_encoder_embedding, + # self._encoder_inputs) + # _encoder_emb_inp = tf.one_hot(self._encoder_inputs, self.src_vocab_size) + _encoder_emb_inp = self._encoder_inputs + + _encoder_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, + name='basic_lstm_cell') + _encoder_cell = tf.contrib.rnn.DropoutWrapper( + _encoder_cell, + input_size=self.embedding_size, + dtype=tf.float32, + input_keep_prob=self._dropout_keep_prob, + output_keep_prob=self._dropout_keep_prob, + state_keep_prob=self._state_dropout_keep_prob, + variational_recurrent=True) # Run Dynamic RNN - # _encoder_outputs: [max_time, batch_size, num_units] - # _encoder_state: [batch_size, num_units] + # _encoder_outputs: [max_time, batch_size, hidden_size] + # _encoder_state: [batch_size, hidden_size] # input_states? _encoder_outputs, _encoder_state = tf.nn.dynamic_rnn( _encoder_cell, _encoder_emb_inp, dtype=tf.float32, sequence_length=self._src_sequence_lengths, time_major=False) + self._encoder_outputs = _encoder_outputs + self._encoder_state = _encoder_state + + def _build_decoder(self): with tf.variable_scope("Decoder"): - _decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size) - # Helper - _helper = tf.contrib.seq2seq.TrainingHelper( - _decoder_emb_inp, self._tgt_sequence_lengths, time_major=False) + # Decoder embedding + # _decoder_embedding = tf.get_variable( + # "decoder_embedding", [self.tgt_vocab_size + self.kb_size, + # self.embedding_size]) + # _decoder_emb_inp = tf.one_hot(self._decoder_inputs, + # self.tgt_vocab_size + self.kb_size) + _decoder_emb_inp = tf.nn.embedding_lookup(self._decoder_embedding, + self._decoder_inputs) + + # Tiling outputs, states, sequence lengths + _tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch( + self._encoder_outputs, multiplier=self.beam_width) + _tiled_encoder_state = tf.contrib.seq2seq.tile_batch( + self._encoder_state, multiplier=self.beam_width) + _tiled_src_sequence_lengths = tf.contrib.seq2seq.tile_batch( + self._src_sequence_lengths, multiplier=self.beam_width) + + with tf.variable_scope("AttentionOverKB"): + _kb_attn_layer = KBAttention(self.tgt_vocab_size, + self.kb_attn_hidden_sizes + [1], + self._kb_embedding, + self._kb_mask, + activation=tf.nn.relu, + use_bias=False) # Output dense layer - _projection_layer = \ - tf.layers.Dense(self.tgt_vocab_size, use_bias=False) - # Decoder - _decoder = tf.contrib.seq2seq.BasicDecoder( - _decoder_cell, _helper, _encoder_state, - output_layer=_projection_layer) - # Dynamic decoding -# NOTE: pass extra arguments to dynamic_decode? 
-# TRY: impute_finished = True, - _outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(_decoder, - output_time_major=False) - _logits = _outputs.rnn_output - - with tf.variable_scope("DecoderOnInfer"): - _maximum_iterations = \ - tf.round(tf.reduce_max(self._src_sequence_lengths) * 2) - # Helper - _helper_infer = tf.contrib.seq2seq.GreedyEmbeddingHelper( - lambda d: tf.one_hot(d, self.tgt_vocab_size), - tf.fill([self._batch_size], self.tgt_sos_id), self.tgt_eos_id) - - # Decoder - _decoder_infer = tf.contrib.seq2seq.BasicDecoder( - _decoder_cell, _helper_infer, _encoder_state, - output_layer=_projection_layer) - # Dynamic decoding - _outputs_infer, _, _ = tf.contrib.seq2seq.dynamic_decode( - _decoder_infer, maximum_iterations=_maximum_iterations) - _predictions = _outputs_infer.sample_id - return _logits, _predictions - - def __call__(self, enc_inputs, src_seq_lengths, prob=False): + # _projection_layer = \ + # tf.layers.Dense(self.tgt_vocab_size, use_bias=False, _reuse=reuse) + + # Decoder Cell + _decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, + name='basic_lstm_cell') + _decoder_cell = tf.contrib.rnn.DropoutWrapper( + _decoder_cell, + input_size=self.embedding_size + self.hidden_size, + dtype=tf.float32, + input_keep_prob=self._dropout_keep_prob, + output_keep_prob=self._dropout_keep_prob, + state_keep_prob=self._state_dropout_keep_prob, + variational_recurrent=True) + + def build_dec_cell(enc_out, enc_seq_len, reuse=None): + with tf.variable_scope("dec_cell_attn", reuse=reuse): + # Create an attention mechanism + # _attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( + _attention_mechanism = tf.contrib.seq2seq.LuongAttention( + self.hidden_size, + memory=enc_out, + memory_sequence_length=enc_seq_len) + _cell = tf.contrib.seq2seq.AttentionWrapper( + _decoder_cell, + _attention_mechanism, + attention_layer_size=self.hidden_size) + return _cell + + # TRAIN MODE + _decoder_cell_tr = build_dec_cell(self._encoder_outputs, + self._src_sequence_lengths) + self._decoder_cell_tr = _decoder_cell_tr + # Train Helper to feed inputs for training: + # read inputs from dense ground truth vectors + _helper_tr = tf.contrib.seq2seq.TrainingHelper( + _decoder_emb_inp, self._tgt_sequence_lengths, time_major=False) + # Copy encoder hidden state to decoder inital state + _decoder_init_state = \ + _decoder_cell_tr.zero_state(self._batch_size, dtype=tf.float32)\ + .clone(cell_state=self._encoder_state) + _decoder_tr = \ + tf.contrib.seq2seq.BasicDecoder(_decoder_cell_tr, _helper_tr, + initial_state=_decoder_init_state, + output_layer=_kb_attn_layer) + # Wrap into variable scope to share attention parameters + # Required! + with tf.variable_scope('decode_with_shared_attention'): + _outputs_inf, _, _ = \ + tf.contrib.seq2seq.dynamic_decode(_decoder_tr, + impute_finished=False, + output_time_major=False) + # _logits = decode(_helper, "decode").beam_search_decoder_output.scores + _logits = _outputs_inf.rnn_output + + # INFER MODE + _decoder_cell_inf = build_dec_cell(_tiled_encoder_outputs, + _tiled_src_sequence_lengths, + reuse=True) + self._decoder_cell_inf = _decoder_cell_inf + # Infer Helper + _max_iters = tf.round(tf.reduce_max(self._src_sequence_lengths) * 2) + # NOTE: helper is not needed? 
+ # _helper_inf = tf.contrib.seq2seq.GreedyEmbeddingHelper( + # self._decoder_embedding, + # tf.fill([self._batch_size], self.tgt_sos_id), self.tgt_eos_id) + # lambda d: tf.one_hot(d, self.tgt_vocab_size + self.kb_size), + # Decoder Init State + _decoder_init_state = \ + _decoder_cell_inf.zero_state(tf.shape(_tiled_encoder_outputs)[0], + dtype=tf.float32)\ + .clone(cell_state=_tiled_encoder_state) + # Define a beam-search decoder + _start_tokens = tf.tile(tf.constant([self.tgt_sos_id], tf.int32), + [self._batch_size]) + # _start_tokens = tf.fill([self._batch_size], self.tgt_sos_id) + _decoder_inf = tf.contrib.seq2seq.BeamSearchDecoder( + cell=_decoder_cell_inf, + embedding=self._decoder_embedding, + start_tokens=_start_tokens, + end_token=self.tgt_eos_id, + initial_state=_decoder_init_state, + beam_width=self.beam_width, + output_layer=_kb_attn_layer, + length_penalty_weight=0.0) + + # Wrap into variable scope to share attention parameters + # Required! + with tf.variable_scope("decode_with_shared_attention", reuse=True): + # TODO: try impute_finished = True, + _outputs_inf, _, _ = \ + tf.contrib.seq2seq.dynamic_decode(_decoder_inf, + impute_finished=False, + maximum_iterations=_max_iters, + output_time_major=False) + _predictions = _outputs_inf.predicted_ids[:, :, 0] + # TODO: rm indexing + # _predictions = \ + # decode(_helper_infer, "decode", _max_iters, reuse=True).sample_id + self._logits = _logits + self._predictions = _predictions + + def __call__(self, enc_inputs, src_seq_lengths, kb_masks, prob=False): predictions = self.sess.run( self._predictions, feed_dict={ + self._dropout_keep_prob: 1., + self._state_dropout_keep_prob: 1., + self._learning_rate: 1., self._encoder_inputs: enc_inputs, - self._src_sequence_lengths: src_seq_lengths + self._src_sequence_lengths: src_seq_lengths, + self._kb_mask: kb_masks } ) # TODO: implement infer probabilities if prob: raise NotImplementedError("Probs not available for now.") return predictions - - def train_on_batch(self, enc_inputs, dec_inputs, dec_outputs, - src_seq_lengths, tgt_seq_lengths, tgt_weights): + + def train_on_batch(self, enc_inputs, dec_inputs, dec_outputs, + src_seq_lengths, tgt_seq_lengths, tgt_weights, kb_masks): _, loss_value = self.sess.run( - [ self._train_op, self._loss ], + [self._train_op, self._loss], feed_dict={ + self._dropout_keep_prob: 1 - self.dropout_rate, + self._state_dropout_keep_prob: 1 - self.state_dropout_rate, + self._learning_rate: self.get_learning_rate(), self._encoder_inputs: enc_inputs, self._decoder_inputs: dec_inputs, self._decoder_outputs: dec_outputs, self._src_sequence_lengths: src_seq_lengths, self._tgt_sequence_lengths: tgt_seq_lengths, - self._tgt_weights: tgt_weights + self._tgt_weights: tgt_weights, + self._kb_mask: kb_masks } ) return loss_value + def get_learning_rate(self): + # polynomial decay + global_step = min(self.global_step, self.decay_steps) + decayed_learning_rate = \ + (self.learning_rate - self.end_learning_rate) *\ + (1 - global_step / self.decay_steps) ** self.decay_power +\ + self.end_learning_rate + return decayed_learning_rate + def load(self, *args, **kwargs): self.load_params() super().load(*args, **kwargs) @@ -233,6 +465,9 @@ def load_params(self): params = json.load(fp) for p in self.GRAPH_PARAMS: if self.opt.get(p) != params.get(p): + if p in ('kb_embedding_control_sum') and\ + (math.abs(self.opt.get(p, 0.) 
- params.get(p, 0.)) < 1e-3): + continue raise ConfigError("`{}` parameter must be equal to saved model" " parameter value `{}`, but is equal to `{}`" .format(p, params.get(p), self.opt.get(p))) @@ -247,5 +482,11 @@ def save_params(self): with open(path, 'w', encoding='utf8') as fp: json.dump(self.opt, fp) + def process_event(self, event_name, data): + if event_name == 'after_epoch': + log.info("Updating global step, learning rate = {:.6f}." + .format(self.get_learning_rate())) + self.global_step += 1 + def shutdown(self): self.sess.close() diff --git a/deeppavlov/models/squad/squad.py b/deeppavlov/models/squad/squad.py index ea828d2582..1337266d9b 100644 --- a/deeppavlov/models/squad/squad.py +++ b/deeppavlov/models/squad/squad.py @@ -186,11 +186,13 @@ def _init_graph(self): logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask) with tf.variable_scope("predict"): + outer_logits = tf.exp(tf.expand_dims(logits1, axis=2) + tf.expand_dims(logits2, axis=1)) outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, tf.cast(tf.minimum(15, self.c_maxlen), tf.int64)) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) + self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1) loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(loss_1 + loss_2) @@ -231,8 +233,7 @@ def _init_optimizer(self): self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr_ph, epsilon=1e-6) grads = self.opt.compute_gradients(self.loss) gradients, variables = zip(*grads) - - capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) + capped_grads = [tf.clip_by_norm(g, self.grad_clip) for g in gradients] self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) def _build_feed_dict(self, c_tokens, c_chars, q_tokens, q_chars, y1=None, y2=None): @@ -278,7 +279,7 @@ def train_on_batch(self, c_tokens: np.ndarray, c_chars: np.ndarray, q_tokens: np return loss def __call__(self, c_tokens: np.ndarray, c_chars: np.ndarray, q_tokens: np.ndarray, q_chars: np.ndarray, - *args, **kwargs) -> Tuple[np.ndarray, np.ndarray]: + *args, **kwargs) -> Tuple[np.ndarray, np.ndarray, List[float]]: """ Predicts answer start and end positions by given context and question. @@ -289,7 +290,7 @@ def __call__(self, c_tokens: np.ndarray, c_chars: np.ndarray, q_tokens: np.ndarr q_chars: batch of tokenized questions, each token split on chars Returns: - answer_start and answer_end positions + answer_start, answer_end positions, answer logits which represent models confidence """ if any(np.sum(c_tokens, axis=-1) == 0) or any(np.sum(q_tokens, axis=-1) == 0): logger.info('SQuAD model: Warning! 
Empty question or context was found.') @@ -297,8 +298,8 @@ def __call__(self, c_tokens: np.ndarray, c_chars: np.ndarray, q_tokens: np.ndarr return noanswers, noanswers feed_dict = self._build_feed_dict(c_tokens, c_chars, q_tokens, q_chars) - yp1, yp2 = self.sess.run([self.yp1, self.yp2], feed_dict=feed_dict) - return yp1, yp2 + yp1, yp2, logits = self.sess.run([self.yp1, self.yp2, self.yp_logits], feed_dict=feed_dict) + return yp1, yp2, [float(logit) for logit in logits] def process_event(self, event_name: str, data) -> None: """ diff --git a/deeppavlov/models/tokenizers/nltk_tokenizer.py b/deeppavlov/models/tokenizers/nltk_tokenizer.py index 1cb8e77046..d2b68d96d5 100644 --- a/deeppavlov/models/tokenizers/nltk_tokenizer.py +++ b/deeppavlov/models/tokenizers/nltk_tokenizer.py @@ -15,18 +15,10 @@ import nltk from typing import List -from deeppavlov.core.common.prints import RedirectedPrints from deeppavlov.core.models.component import Component from deeppavlov.core.common.registry import register -with RedirectedPrints(): - nltk.download('punkt') - nltk.download('stopwords') - nltk.download('perluniprops') - nltk.download('nonbreaking_prefixes') - - @register("nltk_tokenizer") class NLTKTokenizer(Component): """Class for splitting texts on tokens using NLTK diff --git a/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py b/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py index 1c623ca7f9..6e9c66fc32 100644 --- a/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py +++ b/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py @@ -21,9 +21,11 @@ from sklearn.utils import murmurhash3_32 from deeppavlov.core.models.component import Component +from deeppavlov.core.models.estimator import Estimator from deeppavlov.core.models.serializable import Serializable from deeppavlov.core.common.log import get_logger from deeppavlov.core.common.registry import register +from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator logger = get_logger(__name__) @@ -44,13 +46,13 @@ def hash_(token: str, hash_size: int) -> int: @register('hashing_tfidf_vectorizer') -class HashingTfIdfVectorizer(Component, Serializable): - """Create a tfidf matrix from collection of documents. +class HashingTfIdfVectorizer(Estimator, Serializable): + """Create a tfidf matrix from collection of documents of size [n_documents X n_features(hash_size)]. Args: tokenizer: a tokenizer class hash_size: a hash size, power of two - doc_index: a dictinary of document ids and their titles + doc_index: a dictionary of document ids and their titles save_path: a path to **.npz** file where tfidf matrix is saved load_path: a path to **.npz** file where tfidf matrix is loaded from @@ -72,20 +74,29 @@ def __init__(self, tokenizer: Component, hash_size=2 ** 24, doc_index: Optional[ self.hash_size = hash_size self.tokenizer = tokenizer - self.term_freqs = None - self.doc_index = doc_index self.rows = [] self.cols = [] self.data = [] + if kwargs.get('mode', 'infer') == 'infer': + self.tfidf_matrix, opts = self.load() + self.ngram_range = opts['ngram_range'] + self.hash_size = opts['hash_size'] + self.term_freqs = opts['term_freqs'].squeeze() + self.doc_index = opts['doc_index'] + self.index2doc = self.get_index2doc() + else: + self.term_freqs = None + self.doc_index = doc_index + def __call__(self, questions: List[str]) -> Sparse: - """Transform input list of documents to a tfidf vectors. + """Transform input list of documents to tfidf vectors. 
Args: questions: a list of input strings - Return: - transformed documents as a csr_matrix + Returns: + transformed documents as a csr_matrix with shape [n_documents X :attr:`hash_size`] """ @@ -112,12 +123,21 @@ def __call__(self, questions: List[str]) -> Sparse: indptr = np.array([0, len(hashes_unique)]) sp_tfidf = Sparse((tfidf, hashes_unique, indptr), shape=(1, self.hash_size) - ) + ) sp_tfidfs.append(sp_tfidf) transformed = sp.sparse.vstack(sp_tfidfs) return transformed + def get_index2doc(self) -> Dict[Any, int]: + """Invert doc_index. + + Returns: + inverted doc_index dict + + """ + return dict(zip(self.doc_index.values(), self.doc_index.keys())) + def get_counts(self, docs: List[str], doc_ids: List[Any]) \ -> Generator[Tuple[KeysView, ValuesView, List[int]], Any, None]: """Get term counts for a list of documents. @@ -251,12 +271,36 @@ def load(self) -> Tuple[Sparse, Dict]: Returns: a tuple of tfidf matrix and csr data. + Raises: + FileNotFoundError if :attr:`load_path` doesn't exist. + Todo: * implement loading from URL """ + if not self.load_path.exists(): + raise FileNotFoundError("HashingTfIdfVectorizer path doesn't exist!") + logger.info("Loading tfidf matrix from {}".format(self.load_path)) loader = np.load(self.load_path) matrix = Sparse((loader['data'], loader['indices'], - loader['indptr']), shape=loader['shape']) + loader['indptr']), shape=loader['shape']) return matrix, loader['opts'].item(0) + + def fit_batches(self, iterator: DataFittingIterator, batch_size: int) -> None: + """Generate a batch to be fit to a vectorizer. + + Args: + iterator: an instance of an iterator class + batch_size: a size of a generated batch + + Returns: + None + + """ + self.doc_index = iterator.doc2index + for x, y in iterator.gen_batches(batch_size): + self.fit_batch(x, y) + + def fit(self): + pass diff --git a/deeppavlov/models/vectorizers/sentence2vector_w2v_avg.py b/deeppavlov/models/vectorizers/sentence2vector_w2v_avg.py new file mode 100644 index 0000000000..41927fd2e6 --- /dev/null +++ b/deeppavlov/models/vectorizers/sentence2vector_w2v_avg.py @@ -0,0 +1,49 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
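+# The vectorizer defined below builds one sentence vector as the plain average
+# of the per-token fasttext vectors it receives (uniform weights), e.g. for a
+# sentence of 4 tokens with 300-dim token vectors it returns a single 300-dim
+# vector (shapes are illustrative).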
+ +from typing import List + +import numpy as np + +from deeppavlov.core.models.component import Component +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.common.registry import register + +logger = get_logger(__name__) + + +@register('sentence2vector_w2v_avg') +class SentenceAvgW2vVectorizer(Component): + """Sentence vectorizer which produce one vector as average sum of words vectors in sentence""" + + def __init__(self, **kwargs) -> None: + pass + + def __call__(self, questions: List[str], tokens_fasttext_vectors: List) -> List: + """Vectorize list of sentences + + Parameters: + questions: list of questions/sentences + tokens_fasttext_vectors: fasttext vectors for sentences + + Returns: + List of vectorized sentences + """ + + questions_vectors = [] + for i, q in enumerate(questions): + q_weights = [1/len(questions[i])]*len(questions[i]) + questions_vectors.append(np.average(tokens_fasttext_vectors[i], weights=q_weights, axis=0)) + + return questions_vectors diff --git a/deeppavlov/models/vectorizers/sentence2vector_w2v_tfidf.py b/deeppavlov/models/vectorizers/sentence2vector_w2v_tfidf.py new file mode 100644 index 0000000000..2a787a900f --- /dev/null +++ b/deeppavlov/models/vectorizers/sentence2vector_w2v_tfidf.py @@ -0,0 +1,113 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
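+# The vectorizer defined below builds one sentence vector as the average of
+# per-token fasttext vectors weighted by tf-idf values from a fitted sklearn
+# TfidfVectorizer; tokens unseen during fit() get zero weight, and a sentence
+# whose tokens all have zero weight is vectorized as None.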
+ +from typing import List + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer + +from deeppavlov.core.models.estimator import Estimator +from deeppavlov.core.models.serializable import Serializable +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.file import save_pickle +from deeppavlov.core.common.file import load_pickle +from deeppavlov.core.commands.utils import expand_path, make_all_dirs, is_file_exist + +logger = get_logger(__name__) + + +@register('sentence2vector_w2v_tfidf') +class SentenceW2vVectorizerTfidfWeights(Estimator, Serializable): + """ + Sentence vectorizer which produce one vector as tf-idf weighted sum of words vectors in sentence + + Parameters: + save_path: path to save the model + load_path: path to load the model + + Returns: + None + """ + + def __init__(self, save_path: str = None, load_path: str = None, **kwargs) -> None: + self.save_path = save_path + self.load_path = load_path + + if is_file_exist(self.load_path): + self.load() + else: + if kwargs['mode'] != 'train': + self.load() + else: + self.vectorizer = TfidfVectorizer() + + def __call__(self, questions: List[str], tokens_fasttext_vectors: List) -> List: + """Vectorize list of sentences + + Parameters: + questions: list of questions/sentences + tokens_fasttext_vectors: fasttext vectors for sentences + + Returns: + List of vectorized sentences + """ + if isinstance(questions[0], list): + questions = [' '.join(x) for x in questions] + + q_vects = self.vectorizer.transform(questions) + questions_vectors = [] + for i, q in enumerate(questions): + q_weights = [] + for token in q.split(): + if token in self.token2idx: + tfidf_vector = q_vects[i, :] + q_weights.append(tfidf_vector[0, self.token2idx[token]]) + else: + q_weights.append(0) + if sum(q_weights) == 0: + questions_vectors.append(None) + else: + questions_vectors.append(np.average(tokens_fasttext_vectors[i], weights=q_weights, axis=0)) + + return questions_vectors + + def fit(self, x_train: List) -> None: + """Train tf-idf weights + + Parameters: + x_train: train sentences + + Returns: + None + """ + if isinstance(x_train[0], list): + x_train = [' '.join(x) for x in x_train] + + self.vectorizer = TfidfVectorizer() + self.vectorizer.fit(x_train) + self.token2idx = self.vectorizer.vocabulary_ + + def save(self) -> None: + """Save model""" + logger.info("Saving tfidf_vectorizer to {}".format(self.save_path)) + path = expand_path(self.save_path) + make_all_dirs(path) + save_pickle(self.vectorizer, path) + + def load(self) -> None: + """Load model""" + logger.info("Loading tfidf_vectorizer from {}".format(self.load_path)) + self.vectorizer = load_pickle(expand_path(self.load_path)) + self.token2idx = self.vectorizer.vocabulary_ diff --git a/deeppavlov/models/vectorizers/tfidf_vectorizer.py b/deeppavlov/models/vectorizers/tfidf_vectorizer.py new file mode 100644 index 0000000000..14ad2d3805 --- /dev/null +++ b/deeppavlov/models/vectorizers/tfidf_vectorizer.py @@ -0,0 +1,100 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from scipy.sparse import csr_matrix +from sklearn.feature_extraction.text import TfidfVectorizer + +from deeppavlov.core.models.estimator import Estimator +from deeppavlov.core.models.serializable import Serializable +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.file import save_pickle +from deeppavlov.core.common.file import load_pickle +from deeppavlov.core.commands.utils import expand_path, make_all_dirs, is_file_exist + + +TOKENIZER = None +logger = get_logger(__name__) + + +@register('tfidf_vectorizer') +class TfIdfVectorizer(Estimator, Serializable): + """ + Sentence vectorizer which produce sparse vector with TF-IDF values for each word in sentence + + Parameters: + save_path: path to save the model + load_path: path to load the model + + Returns: + None + """ + + def __init__(self, save_path: str = None, load_path: str = None, **kwargs) -> None: + self.save_path = save_path + self.load_path = load_path + + if is_file_exist(self.load_path): + self.load() + else: + if kwargs['mode'] == 'train': + self.vectorizer = TfidfVectorizer() + else: + self.load() + + def __call__(self, questions: List[str]) -> csr_matrix: + """ + Vectorize sentence into TF-IDF values + + Parameters: + questions: list of sentences + + Returns: + list of vectorized sentences + """ + if isinstance(questions[0], list): + questions = [' '.join(q) for q in questions] + + q_vects = self.vectorizer.transform(questions) + return q_vects + + def fit(self, x_train: List[str]) -> None: + """ + Train TF-IDF vectorizer + + Parameters: + x_train: list of sentences for train + + Returns: + None + """ + if isinstance(x_train[0], list): + x_train = [' '.join(q) for q in x_train] + + self.vectorizer = TfidfVectorizer() + self.vectorizer.fit(x_train) + + def save(self) -> None: + """Save TF-IDF vectorizer""" + path = expand_path(self.save_path) + make_all_dirs(path) + logger.info("Saving tfidf_vectorizer to {}".format(path)) + save_pickle(self.vectorizer, path) + + def load(self) -> None: + """Load TF-IDF vectorizer""" + logger.info("Loading tfidf_vectorizer from {}".format(expand_path(self.load_path))) + self.vectorizer = load_pickle(expand_path(self.load_path)) diff --git a/deeppavlov/run_model.py b/deeppavlov/run_model.py index 906b1cb522..71e6867960 100644 --- a/deeppavlov/run_model.py +++ b/deeppavlov/run_model.py @@ -19,8 +19,8 @@ from deeppavlov.core.commands.infer import interact_model -# PIPELINE_CONFIG_PATH = 'configs/intents/intents_dstc2.json' -# PIPELINE_CONFIG_PATH = 'configs/intents/intents_snips.json' +# PIPELINE_CONFIG_PATH = 'configs/classifiers/intents_dstc2.json' +# PIPELINE_CONFIG_PATH = 'configs/classifiers/intents_snips.json' # PIPELINE_CONFIG_PATH = 'configs/ner/ner_dstc2.json' # PIPELINE_CONFIG_PATH = 'configs/ner/ner_rus.json' # PIPELINE_CONFIG_PATH = 'configs/ner/slotfill_dstc2.json' diff --git a/deeppavlov/vocabs/wiki_sqlite.py b/deeppavlov/vocabs/wiki_sqlite.py index 86f84f6733..9ea75eedf7 100644 --- a/deeppavlov/vocabs/wiki_sqlite.py +++ b/deeppavlov/vocabs/wiki_sqlite.py @@ 
-12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Any, Optional +from typing import List, Any, Optional, Union from deeppavlov.core.common.registry import register from deeppavlov.core.common.log import get_logger @@ -28,21 +28,26 @@ class WikiSQLiteVocab(SQLiteDataIterator): Args: data_url: an URL where to download a DB from data_dir: a directory where to save downloaded DB to + join_docs: whether to join extracted docs with ' ' or not + shuffle: whether to shuffle data or not - """ + Attributes: + join_docs: whether to join extracted docs with ' ' or not - def __init__(self, data_url: str, data_dir: str = '', **kwargs): + """ - super().__init__(data_dir=data_dir, data_url=data_url) + def __init__(self, data_url: str, data_dir: str = '', join_docs: bool=True, shuffle: bool=False, **kwargs): + super().__init__(data_dir=data_dir, data_url=data_url, shuffle=shuffle) + self.join_docs = join_docs - def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[str]: - """Get the contents of files, stacked by space. + def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[Union[str, List[str]]]: + """Get the contents of files, stacked by space or as they are. Args: doc_ids: a batch of lists of ids to get contents for Returns: - a list of contents + a list of contents / list of lists of contents """ all_contents = [] if not doc_ids: @@ -51,7 +56,8 @@ def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) - for ids in doc_ids: contents = [self.get_doc_content(doc_id) for doc_id in ids] - contents = ' '.join(contents) + if self.join_docs: + contents = ' '.join(contents) all_contents.append(contents) return all_contents diff --git a/docs/_static/diagram.png b/docs/_static/gobot_diagram.png similarity index 100% rename from docs/_static/diagram.png rename to docs/_static/gobot_diagram.png diff --git a/docs/_static/kvret_diagram.png b/docs/_static/kvret_diagram.png new file mode 100644 index 0000000000..d72cb8b04d Binary files /dev/null and b/docs/_static/kvret_diagram.png differ diff --git a/docs/apiref/core.rst b/docs/apiref/core.rst index 6382bef512..f5d8d29322 100644 --- a/docs/apiref/core.rst +++ b/docs/apiref/core.rst @@ -10,5 +10,4 @@ DeepPavlov Core core.commands core.common core.data - core.layers core.models diff --git a/docs/apiref/core/commands.rst b/docs/apiref/core/commands.rst index 5c800d01a6..e1f2388b0e 100644 --- a/docs/apiref/core/commands.rst +++ b/docs/apiref/core/commands.rst @@ -2,5 +2,8 @@ core.commands ============= Basic training and inference functions. -.. automodule:: deeppavlov.core.commands +.. automodule:: deeppavlov.core.commands.infer + :members: + +.. automodule:: deeppavlov.core.commands.train :members: diff --git a/docs/apiref/core/common.rst b/docs/apiref/core/common.rst index 78700e61f5..05b252cb88 100644 --- a/docs/apiref/core/common.rst +++ b/docs/apiref/core/common.rst @@ -2,5 +2,14 @@ core.common =========== Registration and classes initialization functionality, class method decorators. -.. automodule:: deeppavlov.core.common +.. automodule:: deeppavlov.core.common.chainer + :members: + +.. automodule:: deeppavlov.core.common.metrics_registry + :members: + +.. automodule:: deeppavlov.core.common.params + :members: + +.. 
automodule:: deeppavlov.core.common.registry :members: diff --git a/docs/apiref/core/layers.rst b/docs/apiref/core/layers.rst deleted file mode 100644 index a28e3a71ae..0000000000 --- a/docs/apiref/core/layers.rst +++ /dev/null @@ -1,6 +0,0 @@ -core.layers -=========== -Collection of commonly used Layers for TF models. - -.. automodule:: deeppavlov.core.layers - :members: diff --git a/docs/apiref/core/models.rst b/docs/apiref/core/models.rst index 38ab785e15..a3ebc78260 100644 --- a/docs/apiref/core/models.rst +++ b/docs/apiref/core/models.rst @@ -2,5 +2,16 @@ core.models =========== Abstract model classes and interfaces. -.. automodule:: deeppavlov.core.models - :members: +.. autoclass:: deeppavlov.core.models.component.Component + +.. autoclass:: deeppavlov.core.models.serializable.Serializable + +.. autoclass:: deeppavlov.core.models.estimator.Estimator + +.. autoclass:: deeppavlov.core.models.nn_model.NNModel + +.. autoclass:: deeppavlov.core.models.tf_backend.TfModelMeta + +.. autoclass:: deeppavlov.core.models.tf_model.TFModel + +.. autoclass:: deeppavlov.core.models.keras_model.KerasModel diff --git a/docs/apiref/dataset_iterators.rst b/docs/apiref/dataset_iterators.rst index d20c096dbe..45989fa9b5 100644 --- a/docs/apiref/dataset_iterators.rst +++ b/docs/apiref/dataset_iterators.rst @@ -15,6 +15,7 @@ Concrete DatasetIterator classes. .. autoclass:: deeppavlov.dataset_iterators.kvret_dialog_iterator.KvretDialogDatasetIterator +.. autofunction:: deeppavlov.dataset_iterators.morphotagger_iterator.preprocess_data .. autoclass:: deeppavlov.dataset_iterators.morphotagger_iterator.MorphoTaggerDatasetIterator .. autoclass:: deeppavlov.dataset_iterators.sqlite_iterator.SQLiteDataIterator diff --git a/docs/apiref/dataset_readers.rst b/docs/apiref/dataset_readers.rst index 9ec14a7cab..2fafa08dc3 100644 --- a/docs/apiref/dataset_readers.rst +++ b/docs/apiref/dataset_readers.rst @@ -17,7 +17,8 @@ Concrete DatasetReader classes. .. automodule:: deeppavlov.dataset_readers.kvret_reader :members: -.. autoclass:: deeppavlov.dataset_readers.morphotagging_dataset_reader.MorphotaggerDatasetReader +.. automodule:: deeppavlov.dataset_readers.morphotagging_dataset_reader + :members: .. autoclass:: deeppavlov.dataset_readers.ontonotes_reader.OntonotesReader @@ -26,3 +27,9 @@ Concrete DatasetReader classes. .. automodule:: deeppavlov.dataset_readers.typos_reader :members: + +.. autoclass:: deeppavlov.dataset_readers.faq_reader.FaqDatasetReader + :members: + +.. autoclass:: deeppavlov.dataset_readers.line_reader.LineReader + :members: diff --git a/docs/apiref/models/classifiers.rst b/docs/apiref/models/classifiers.rst index c30119d6a0..e9afd3633d 100644 --- a/docs/apiref/models/classifiers.rst +++ b/docs/apiref/models/classifiers.rst @@ -7,7 +7,7 @@ deeppavlov.models.classifiers .. autoclass:: deeppavlov.models.classifiers.keras_classification_model.KerasClassificationModel .. automethod:: __call__ - .. automethod:: texts2vec + .. automethod:: pad_texts .. automethod:: train_on_batch .. automethod:: infer_on_batch .. automethod:: cnn_model @@ -20,3 +20,15 @@ deeppavlov.models.classifiers .. automethod:: bilstm_self_add_attention_model .. automethod:: bilstm_self_mult_attention_model .. automethod:: bigru_model + +.. autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier + :members: + + .. automethod:: __call__ + + +.. autoclass:: deeppavlov.models.classifiers.logreg_classifier.LogregClassifier + :members: + + .. 
automethod:: __call__ + diff --git a/docs/apiref/models/morpho_tagger.rst b/docs/apiref/models/morpho_tagger.rst index 891352a58c..c1a64a1b4d 100644 --- a/docs/apiref/models/morpho_tagger.rst +++ b/docs/apiref/models/morpho_tagger.rst @@ -2,9 +2,15 @@ deeppavlov.models.morpho_tagger =============================== .. autoclass:: deeppavlov.models.morpho_tagger.tagger.MorphoTaggerWrapper + :members: + + .. automethod:: __call__ .. autofunction:: deeppavlov.models.morpho_tagger.common.predict_with_model +.. autoclass:: deeppavlov.models.morpho_tagger.network.CharacterTagger + :members: + .. autofunction:: deeppavlov.models.morpho_tagger.common.prettify .. autoclass:: deeppavlov.models.morpho_tagger.common.TagOutputPrettifier diff --git a/docs/apiref/models/preprocessors.rst b/docs/apiref/models/preprocessors.rst index 193927a21c..bc88e96c64 100644 --- a/docs/apiref/models/preprocessors.rst +++ b/docs/apiref/models/preprocessors.rst @@ -7,6 +7,8 @@ deeppavlov.models.preprocessors .. autoclass:: deeppavlov.models.preprocessors.capitalization.CapitalizationPreprocessor +.. autofunction:: deeppavlov.models.preprocessors.capitalization.process_word + .. autoclass:: deeppavlov.models.preprocessors.capitalization.LowercasePreprocessor .. autoclass:: deeppavlov.models.preprocessors.char_splitter.CharSplitter @@ -26,3 +28,12 @@ deeppavlov.models.preprocessors .. autoclass:: deeppavlov.models.preprocessors.str_lower.StrLower .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.DocumentChunker + + .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.StringMultiplier + + .. automethod:: __call__ + diff --git a/docs/apiref/models/ranking.rst b/docs/apiref/models/ranking.rst index 06351aef1e..8f74a77169 100644 --- a/docs/apiref/models/ranking.rst +++ b/docs/apiref/models/ranking.rst @@ -25,3 +25,8 @@ Ranking classes. :members: .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.ranking.logit_ranker.LogitRanker + :members: + + .. automethod:: __call__ diff --git a/docs/apiref/models/vectorizers.rst b/docs/apiref/models/vectorizers.rst index 6b28949f7b..2985a4beb5 100644 --- a/docs/apiref/models/vectorizers.rst +++ b/docs/apiref/models/vectorizers.rst @@ -1,10 +1,23 @@ deeppavlov.models.vectorizers ============================= -.. automodule:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer .. autoclass:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer :members: .. automethod:: __call__ +.. autoclass:: deeppavlov.models.vectorizers.tfidf_vectorizer.TfIdfVectorizer + :members: + + .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.vectorizers.sentence2vector_w2v_tfidf.SentenceW2vVectorizerTfidfWeights + :members: + + .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.vectorizers.sentence2vector_w2v_avg.SentenceAvgW2vVectorizer + :members: + + .. automethod:: __call__ diff --git a/docs/components/classifiers.rst b/docs/components/classifiers.rst index b95a25e4e2..1ec8fb720e 100644 --- a/docs/components/classifiers.rst +++ b/docs/components/classifiers.rst @@ -62,6 +62,12 @@ embeddings trained on DSTC-2 dataset that is not the best choice for this task. Train set is divided to train and validation sets to illustrate ``basic_classification_iterator`` work. 
+**Detecting Insults in Social Commentary** dataset +(https://www.kaggle.com/c/detecting-insults-in-social-commentary) +contains binary classification task for **detecting insults** for +participants of conversation. Train, valid and test division is the same +as for the Kaggle challenge. + **AG News** dataset (https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html) contains **topic classification** task for 5 classes (range from 0 @@ -69,57 +75,75 @@ to 4 points scale). Test set is initial one from a web-site, valid is a Stratified division 1/5 from the train set from web-site with 42 seed, and the train set is the rest. -**Detecting Insults in Social Commentary** dataset -(https://www.kaggle.com/c/detecting-insults-in-social-commentary) -contains binary classification task for **detecting insults** for -participants of conversation. Train, valid and test division is the same -as for the Kaggle challenge. - **Twitter mokoron** dataset (http://study.mokoron.com/) contains **sentiment classification** of Russian tweets for positive and negative -replies [5]. Train, valid and test division is made by hands (Stratified +replies [5]. It was automatically labeled. +Train, valid and test division is made by hands (Stratified division: 1/5 from all dataset for test set with 42 seed, then 1/5 from -the rest for validation set with 42 seed). Attention! The pre-trained -model was trained on ``sentiment_twitter_data/no_smiles_data`` -- the -same dataset but with removed "(" and ")". - -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ -| Model | Dataset | Valid accuracy | Test accuracy | -+================================================+===================+=============================================+=============================================+ -| ``configs/intents/intents_dstc2.json`` | DSTC 2 | 0.8744 | 0.8801 | -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ -| ``configs/intents/intents_dstc2_big.json`` | DSTC 2 | 0.9682 | 0.9684 | -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ -| ``configs/intents/intents_snips.json`` | SNIPS | 0.8829 | -- | -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ -| ``configs/sentiment/insults_kaggle.json`` | InsultsKaggle | 0.8757 | 0.7503 | -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ -| ``configs/sentiment/sentiment_ag_news.json`` | AG News | 0.8735 | 0.8859 | -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ -| ``configs/sentiment/sentiment_twitter.json`` | Twitter.mokoron | 0.8021 (with smiles), 0.8008 (no\_smiles) | 0.7949 (with smiles), 0.7943 (no\_smiles) | -+------------------------------------------------+-------------------+---------------------------------------------+---------------------------------------------+ +the rest for validation set with 42 seed). Two provided pre-trained +models were trained on the same dataset but with and without preprocessing. 
+The main difference between scores is caused by the fact that some symbols +(deleted during preprocessing) were used for automatic labelling. Therefore, +it can be considered that model trained on preprocessed data is +based on semantics while model trained on unprocessed data +is based on punctuation and syntax. + +**RuSentiment** dataset (http://text-machine.cs.uml.edu/projects/rusentiment/) contains +**sentiment classification** of social media posts for Russian language within 5 classes 'positive', 'negative', +'neutral', 'speech', 'skip'. + + ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| Dataset | Model | Task | Lang | Metric | Valid | Test | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `DSTC 2`_ | :config:`DSTC 2 on DSTC 2 embeddings ` | 28 intents | En | Accuracy | 0.8554 | 0.8658 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `DSTC 2`_ | :config:`DSTC 2 on Wiki embeddings ` | 28 intents | En | Accuracy | 0.9659 | 0.9659 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `SNIPS-2017`_ | :config:`SNIPS on DSTC 2 embeddings ` | 7 intents | En | F1 | 0.8821 | -- | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `SNIPS-2017`_ | :config:`SNIPS on Wiki embeddings ` | 7 intents | En | F1 | 0.9852 | -- | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `Insults`_ | :config:`InsultsKaggle on Reddit embeddings ` | Insult detection | En | ROC-AUC | 0.9287 | 0.8602 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `AG News`_ | :config:`AG News on Wiki embeddings ` | 5 topics | En | Accuracy | 0.8735 | 0.8859 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +|`Twitter mokoron`_ | :config:`Twitter on RuWiki+Lenta embeddings without any preprocessing ` | Sentiment | Ru | Accuracy | 0.9968 | 0.9971 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +|`Twitter mokoron`_ | :config:`Twitter on RuWiki+Lenta embeddings with preprocessing ` | Sentiment | Ru | Accuracy | 0.7944 | 0.7879 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +|`RuSentiment`_ | :config:`RuSentiment on RuWiki+Lenta embeddings ` | Sentiment | Ru | F1 | 0.7843 | 0.6556 | 
++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ + +.. _`DSTC 2`: http://camdial.org/~mh521/dstc/ +.. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines +.. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary +.. _`AG News`: https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html +.. _`Twitter mokoron`: http://study.mokoron.com/ +.. _`RuSentiment`: http://text-machine.cs.uml.edu/projects/rusentiment/ + Download pre-trained model -------------------------- DeepPavlov provides the following **pre-trained models**: -- ``configs/intents/intents_dstc2.json`` -- DSTC 2 - intent model for English language with embeddings trained +- :config:`intents_dstc2.json ` -- DSTC 2 - intent model for English language with embeddings trained via fastText on DSTC 2 (800 Mb). -- ``configs/intents/intents_dstc2_big.json`` -- DSTC 2 - intent model for English language with embeddings trained +- :config:`intents_dstc2_big.json ` -- DSTC 2 - intent model for English language with embeddings trained on Wiki (https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md, 8.5 Gb). This model achieves higher accuracy than the first one. -- ``configs/intents/intents_snips.json`` -- SNIPS - intent model for English language. -- ``configs/sentiment/insults_kaggle.json`` -- Insults analysis for English language. -- ``configs/sentiment/sentiment_ag_news.json`` -- AG News topic analysis for English language. -- ``configs/sentiment/sentiment_twitter.json`` -- Twitter Mokoron sentiment analysis for **Russian** language. +- :config:`intents_snips.json ` -- SNIPS - intent model for English language. +- :config:`insults_kaggle.json ` -- Insults analysis for English language. +- :config:`topic_ag_news.json ` -- AG News topic analysis for English language. +- :config:`sentiment_twitter.json ` -- Twitter Mokoron sentiment analysis for **Russian** language. To download pre-trained models, vocabs, embeddings on the dataset of interest one should run the following command providing corresponding name of the config file (see above): :: - python deep.py download configs/intents/intents_dstc2.json + python deep.py download configs/classifiers/intents_dstc2.json or provide flag ``-d`` for commands like ``interact``, ``interactbot``, etc. The flag ``-d`` provides downloading all the required components. @@ -133,13 +157,13 @@ command providing corresponding name of the config file (see above): :: - python deep.py interact configs/intents/intents_dstc2.json + python deep.py interact configs/classifiers/intents_dstc2.json or :: - python deep.py interactbot configs/intents/intents_dstc2.json -t + python deep.py interactbot configs/classifiers/intents_dstc2.json -t For 'interactbot' mode one should specify a Telegram bot token in ``-t`` parameter or in the ``TELEGRAM_TOKEN`` environment variable. @@ -149,7 +173,7 @@ which the string belongs to, and the second one is a dictionary with probability the considered classes (take into account that for multi-class classification then sum of probabilities is not equal to 1). 
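+
+The same pipeline can also be queried from Python. The snippet below is only a sketch: it assumes the model files
+have already been downloaded (e.g. with the ``-d`` flag), and the exact import paths of the helpers may differ
+between library versions.
+
+.. code:: python
+
+    from deeppavlov.core.commands.infer import build_model_from_config
+    from deeppavlov.core.common.file import read_json
+
+    # Build the whole pipeline described by the config file.
+    config = read_json('deeppavlov/configs/classifiers/intents_dstc2.json')
+    model = build_model_from_config(config)
+
+    # The model accepts a batch of utterances and returns, for each one,
+    # the predicted intent(s) and a dictionary of class probabilities.
+    print(model(['i want some cheap food in the north part of town']))
+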
-An example of interacting the model from ``configs/intents/intents_dstc2.json``
+An example of interacting the model from :config:`intents_dstc2.json `
 ::
@@ -157,7 +181,7 @@ An example of interacting the model from ``configs/intents/intents_dstc2.json``
 >> (array(['inform_pricerange'], dtype='` ::
@@ -189,12 +213,12 @@ classification task. Below the list of available models is presented:
 Configuration parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~
-One can find examples of config files in ``deeppavlov/configs/intents`` and ``deeppavlov/configs/sentiment``.
+One can find examples of config files in ``deeppavlov/configs/classifiers``.
 Detailed description of configuration file and specific parameters for all presented classification models can be found in :doc:`reference `.
-Some clue parameters for ``deeppavlov/configs/intents/intents_dstc2.json`` config file are
+Some clue parameters for :config:`intents_dstc2.json ` config file are
 presented in the table below.
+--------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
@@ -379,7 +403,7 @@ Then training process can be run in the same way:
 python deep.py train "path_to_config"
-The current version of ``intents_snips.json`` contains parameters for
+The current version of :config:`intents_snips.json ` contains parameters for
 intent recognition for SNIPS benchmark dataset [2] that was restored in
 ``.csv`` format and will be downloaded automatically.
@@ -409,7 +433,7 @@ trained on Reddit dataset.
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| wit.ai | 0.9877 | 0.9913 | 0.9921 | 0.9766 | 0.9977 | 0.9458 | 0.9673 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| snips.ai | 0.9873 | 0.9921 | 0.9939 | 0.9729 | 0.9985 | 0.9455 | 0.9613 |
+| snips.ai | 0.9873 | 0.9921 | 0.9939 | 0.9729 | 0.9985 | 0.9455 | 0.9613 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
| recast.ai | 0.9894 | 0.9943 | 0.9910 | 0.9660 | 0.9981 | 0.9424 | 0.9539 |
+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
diff --git a/docs/components/faq.rst b/docs/components/faq.rst
new file mode 100644
index 0000000000..07307652a7
--- /dev/null
+++ b/docs/components/faq.rst
@@ -0,0 +1,161 @@
+================================
+Frequently Asked Questions (FAQ)
+================================
+
+This is an implementation of a FAQ component that helps to classify incoming questions.
+
+::
+
+    :: What is your open hours?
+    >> 8am - 8pm
+
+
+Config
+======
+
+As usual, the config consists of:
+
+- **dataset_reader**
+- **dataset_iterator**
+- **chainer**
+
+You can use your own dataset_reader and dataset_iterator for specific data.
+Let's consider the chainer in more detail.
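+
+Before going through the individual blocks, here is a minimal, self-contained sketch of what a tf-idf + logistic
+regression FAQ pipeline does conceptually. It deliberately uses plain scikit-learn on toy question/answer pairs
+(the data and variable names are illustrative only, not part of the library); in a real config the same roles are
+played by the registered components described below.
+
+.. code:: python
+
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.linear_model import LogisticRegression
+
+    # Toy FAQ data: the "class" of every training question is simply its prepared answer.
+    faq_questions = ['what is your open hours', 'when do you open', 'where are you located']
+    faq_answers = ['8am - 8pm', '8am - 8pm', 'on campus, Dolgoprudny']
+
+    # Vectorizer step: turn questions into tf-idf vectors.
+    vectorizer = TfidfVectorizer()
+    x_train = vectorizer.fit_transform(faq_questions)
+
+    # Classifier step: logistic regression over the prepared answers.
+    classifier = LogisticRegression(C=1000, penalty='l2')
+    classifier.fit(x_train, faq_answers)
+
+    # Inference: vectorize the incoming question and return the most probable answer with its score.
+    x_query = vectorizer.transform(['what are your open hours?'])
+    print(classifier.predict(x_query)[0], classifier.predict_proba(x_query).max())
+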
+
+Config Structure
+----------------
+
+- **chainer** - pipeline manager
+
+  - **in** - pipeline input data: question
+  - **out** - pipeline output data: answer + score [0,1]
+
+- **preprocessing** - preprocessing steps such as tokenization, lemmatization or stemming. For example, tfidf_logreg_autofaq.json uses tokenization and lemmatization.
+
+- **vectorizer** - vectorizer of incoming sentences. It can be a word embeddings vectorizer, a bag-of-words vectorizer, a tf-idf vectorizer, etc. The output is vectorized sentences (numeric vectors).
+
+- **classifier** - the FAQ model that classifies an incoming question. The model receives vectorized train sentences during training and a vectorized question at inference. The output is the corresponding answer from the train dataset.
+
+
+Vectorizers
+-----------
+
+Vectorizers produce numeric vectors of input sentences.
+
+- **tfidf_vectorizer** - TF-IDF vectorizer
+
+  - **in** - input data: question
+  - **fit_on** - train data: token lemmas of question
+  - **save_path** - path where to save model
+  - **load_path** - path where to load model
+  - **out** - output data: vectorized sentence
+
+- **sentence2vector_w2v_tfidf** - sentence vectorizer: tf-idf weighted sum of the word embeddings of a sentence
+
+  - **in** - input data: question
+  - **fit_on** - train data: [token lemmas of question, word embeddings]
+  - **save_path** - path where to save model
+  - **load_path** - path where to load model
+  - **out** - output data: vectorized sentence
+
+- **sentence2vector_w2v_avg** - sentence vectorizer: average of the word embeddings of a sentence
+
+  - **in** - input data: question
+  - **out** - output data: vectorized sentence
+
+
+Classifiers for FAQ
+-------------------
+
+These are models that classify an incoming question and find the corresponding answer.
+
+- **cos_sim_classifier** - classifier based on cosine similarity
+
+  - **in** - input data: question
+  - **fit_on** - train data: [vectorized sentences, answers]
+  - **save_path** - path where to save model
+  - **load_path** - path where to load model
+  - **out** - output data: [answer, score]
+
+- **logreg_classifier** - logistic regression classifier that outputs the most probable answer with a score
+
+  - **in** - input data: question
+  - **fit_on** - train data: [vectorized sentences, answers]
+  - **c** - regularization parameter for logistic regression model
+  - **penalty** - regularization type: 'l1' or 'l2'
+  - **save_path** - path where to save model
+  - **load_path** - path where to load model
+  - **out** - output data: [answer, score]
+
+
+Running FAQ
+===========
+
+
+Training
+--------
+
+To train your own model, run the ``train`` command, for example:
+
+.. code:: bash
+
+    cd deeppavlov/
+    python deep.py train configs/faq/tfidf_autofaq.json
+
+
+Interacting
+-----------
+
+After the model has been trained, you can use it for inference: the model will return answers from the FAQ data that was used for training.
+
+.. code:: bash
+
+    cd deeppavlov/
+    python deep.py interact configs/faq/tfidf_autofaq.json -d
+
+
+Inference example:
+
+::
+
+    :: What is your open hours?
+    >> 8am - 8pm
+
+
+Available Data and Pretrained Models
+====================================
+
+As an example, you can try pre-trained models on an English FAQ dataset: MIPT FAQ for entrants - https://mipt.ru/english/edu/faqs/
+
+::
+
+    tfidf_logreg_classifier_en_mipt_faq - http://files.deeppavlov.ai/faq/mipt/tfidf_logreg_classifier_en_mipt_faq.pkl
+    tfidf_vectorizer_en_mipt_faq - http://files.deeppavlov.ai/faq/mipt/tfidf_vectorizer_en_mipt_faq.pkl
+
+- **tfidf_logreg_classifier_en_mipt_faq.pkl** - pre-trained logistic regression classifier for classifying an input question (vectorized by tf-idf)
+- **tfidf_vectorizer_en_mipt_faq.pkl** - pre-trained TF-IDF vectorizer based on the MIPT FAQ
+
+Example config - :download:`deeppavlov/configs/faq/tfidf_logreg_en_faq.json <../../deeppavlov/configs/faq/tfidf_logreg_en_faq.json>`
+
+
+You can also use pre-trained models on a Russian FAQ dataset from a school web-site: http://www.ftl.name/page/989
+
+::
+
+    tfidf_cos_sim_classifier - http://files.deeppavlov.ai/faq/school/faq_tfidf_cos_model.pkl
+    tfidf_logreg_classifier - http://files.deeppavlov.ai/faq/school/faq_tfidf_logreg_model.pkl
+    fasttext_cos_classifier - http://files.deeppavlov.ai/faq/school/faq_fasttext_cos_model.pkl
+    tfidf_vectorizer_ruwiki - http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl
+
+- **tfidf_cos_sim_classifier.pkl** - pre-trained cosine similarity classifier for classifying an input question (vectorized by tf-idf)
+- **tfidf_logreg_classifier.pkl** - pre-trained logistic regression classifier for classifying an input question (vectorized by tf-idf)
+- **fasttext_cos_classifier.pkl** - pre-trained cosine similarity classifier for classifying an input question (vectorized by word embeddings)
+- **tfidf_vectorizer_ruwiki.pkl** - pre-trained TF-IDF vectorizer based on Russian Wikipedia
+
+
diff --git a/docs/components/neural_ranking.rst b/docs/components/neural_ranking.rst
index f1c9978fac..5b9051d994 100644
--- a/docs/components/neural_ranking.rst
+++ b/docs/components/neural_ranking.rst
@@ -52,22 +52,6 @@ command:
 As an example of configuration file see :config:`ranking_insurance.json `.
-
-Comparison
-----------
-
-The InsuranceQA V1 dataset:
-
-+------------------------------------------------------------------+-------------------------+--------------------+
-| Model | Validation (Recall@1) | Test1 (Recall@1) |
-+==================================================================+=========================+====================+
-| Architecture II: (HLQA(200) CNNQA(4000) 1-MaxPooling Tanh) [1] | 61.8 | 62.8 |
-+------------------------------------------------------------------+-------------------------+--------------------+
-| QA-LSTM basic-model(max pooling) [2] | 64.3 | 63.1 |
-+------------------------------------------------------------------+-------------------------+--------------------+
-| Our model (biLSTM, max pooling) | **67.6** | **67.6** |
-+------------------------------------------------------------------+-------------------------+--------------------+
-
 Literature
 ----------
diff --git a/docs/components/squad.rst b/docs/components/squad.rst
index 71fe270c61..0797982dc8 100644
--- a/docs/components/squad.rst
+++ b/docs/components/squad.rst
@@ -90,10 +90,34 @@ Pretrained model is available and can be downloaded:
 python -m deeppavlov download deeppavlov/configs/squad/squad.json
-It achieves ~80 F-1 score and ~71 EM on dev set. Results of the most
-recent solutions could be found on `SQuAD
+It achieves ~80 F-1 score and ~71 EM on the `SQuAD-v1.1`_ dev set.
+
+In the following table you can find a comparison with published results. Results of the most recent competitive solutions can be found on the `SQuAD
 Leaderboard `__.
+
++----------------------------------------------+----------------+-----------------+
+| Model (single model) | EM (dev) | F-1 (dev) |
++----------------------------------------------+----------------+-----------------+
+| :config:`DeepPavlov ` | 71.41 | 80.26 |
++----------------------------------------------+----------------+-----------------+
+| `BiDAF + Self Attention + ELMo`_ | -- | 85.6 |
++----------------------------------------------+----------------+-----------------+
+| `QANet`_ | 75.1 | 83.8 |
++----------------------------------------------+----------------+-----------------+
+| `FusionNet`_ | 75.3 | 83.6 |
++----------------------------------------------+----------------+-----------------+
+| `R-Net`_ | 71.1 | 79.5 |
++----------------------------------------------+----------------+-----------------+
+| `BiDAF`_ | 67.7 | 77.3 |
++----------------------------------------------+----------------+-----------------+
+
+.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250
+.. _`BiDAF`: https://arxiv.org/abs/1611.01603
+.. _`R-Net`: https://www.microsoft.com/en-us/research/publication/mrc/
+.. _`FusionNet`: https://arxiv.org/abs/1711.07341
+.. _`QANet`: https://arxiv.org/abs/1804.09541
+.. _`BiDAF + Self Attention + ELMo`: https://arxiv.org/abs/1802.05365
+
 SDSJ Task B
 ~~~~~~~~~~~
@@ -103,4 +127,8 @@ Pretrained model is available and can be downloaded:
 python -m deeppavlov download deeppavlov/configs/squad/squad_ru.json
-It achieves ~80 F-1 score and ~60 EM on dev set.
++-------------------------------------------------+----------------+-----------------+
+| Model config | EM (dev) | F-1 (dev) |
++-------------------------------------------------+----------------+-----------------+
+| :config:`DeepPavlov ` | 60.58 | 80.22 |
++-------------------------------------------------+----------------+-----------------+
diff --git a/docs/components/tfidf_ranking.rst b/docs/components/tfidf_ranking.rst
index 2d8fecb8e9..97813daf1c 100644
--- a/docs/components/tfidf_ranking.rst
+++ b/docs/components/tfidf_ranking.rst
@@ -165,13 +165,16 @@ class.
Comparison ========== -Scores for **TF-IDF Ranker** skill: +Scores for **TF-IDF Ranker** model: -+-------------------------------------------------+--------------------------------------------------------------------------+----------------+ -| Skill | Config | Recall (top 5) | -+=================================================+==========================================================================+================+ -| **TF-IDF Ranker English** | :config:`en_ranker_tfidf_wiki.json ` | 0.756 | -+-------------------------------------------------+--------------------------------------------------------------------------+----------------+ + ++-------------------------------------------------------+----------------+----------------------+-----------------+ +| Model | Dataset | Wiki dump | Recall (top 5) | ++-------------------------------------------------------+----------------+----------------------+-----------------+ +| :config:`DeepPavlov ` | SQuAD (dev) | enwiki (2018-02-11) | 75.6 | ++-------------------------------------------------------+----------------+----------------------+-----------------+ +| `DrQA`_ | SQuAD (dev) | enwiki (2016-12-21) | 77.8 | ++-------------------------------------------------------+----------------+----------------------+-----------------+ References diff --git a/docs/devguides/rest_api.rst b/docs/devguides/rest_api.rst index ce36cb5acd..53e97245f7 100644 --- a/docs/devguides/rest_api.rst +++ b/docs/devguides/rest_api.rst @@ -32,26 +32,26 @@ component arguments names from ``model_args_names``. Default argument name for one argument components is *"context"*. Here are POST requests examples for some of the library components: -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| Component | POST request JSON payload example | -+=========================================+===============================================================================================================================================+ -| **One argument components** | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| NER component | {"context":"Elon Musk launched his cherry Tesla roadster to the Mars orbit"} | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| Intent classification component | {"context":"I would like to go to a restaurant with Asian cuisine this evening"} | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| Automatic spelling correction component | {"context":"errror"} | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| Ranking component | {"context":"What is the average cost of life insurance services?"} | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| (Seq2seq) Goal-oriented bot | {"context":"Hello, can you help me to find and book 
a restaurant this evening?"} | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| **Multiple arguments components** | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ -| Question Answering component | | {"context":"After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies.", | -| | |  "question":"What strained the relationship between Great Britain and its colonies?"} | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+ ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| Component | POST request JSON payload example | ++=========================================+=================================================================================================================================================+ +| **One argument components** | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| NER component | {"context":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]} | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| Intent classification component | {"context":["I would like to go to a restaurant with Asian cuisine this evening"]} | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| Automatic spelling correction component | {"context":["errror"]} | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| Ranking component | {"context":["What is the average cost of life insurance services?"]} | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| (Seq2seq) Goal-oriented bot | {"context":["Hello, can you help me to find and book a restaurant this evening?"]} | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Multiple arguments components** | ++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ +| Question Answering component | | {"context":["After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies."], | +| | |  "question":["What strained the relationship between Great Britain and its colonies?"]} | 
++-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ Flasgger UI for API testing is provided on ``:/apidocs`` diff --git a/docs/index.rst b/docs/index.rst index fb4fd44874..4d4c3d9f94 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,6 +29,7 @@ Welcome to DeepPavlov's documentation! Slot filling Spelling Correction TF-IDF Ranking + FAQ .. toctree:: diff --git a/docs/intro/config_description.rst b/docs/intro/config_description.rst index a8baad9b63..1e8b52773e 100644 --- a/docs/intro/config_description.rst +++ b/docs/intro/config_description.rst @@ -101,8 +101,8 @@ parameter which contains a list of ground truth answer names. For example: "in_y": ["y"], "out": ["y_predicted"], "name": "intent_model", - "save_path": "intents/intent_cnn", - "load_path": "intents/intent_cnn", + "save_path": "classifiers/intent_cnn", + "load_path": "classifiers/intent_cnn", "classes_vocab": { "ref": "classes_vocab" } @@ -135,8 +135,8 @@ and ``train``: Simplified version of training pipeline contains two elements: ``dataset`` and ``train``. The ``dataset`` element currently can be used for train from classification data in ``csv`` and ``json`` formats. You can find complete examples of how to use simplified training pipeline in -:config:`intents_sample_csv.json ` and -:config:`intents_sample_json.json ` config files. +:config:`intents_sample_csv.json ` and +:config:`intents_sample_json.json ` config files. Train Parameters diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 3200120660..9e8765bb8d 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -10,12 +10,28 @@ Based on neural Named Entity Recognition network. The NER component reproduces a of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition `__ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf. ++---------------------------------------------------------------------------------------------------------------------------+------------------+ +| Dataset | Test F1 | ++---------------------------------------------------------------------------------------------------------------------------+------------------+ +| :config:`Persons-1000 dataset with additional LOC and ORG markup ` | 95.25 | ++---------------------------------------------------------------------------------------------------------------------------+------------------+ +| :config:`DSTC 2 ` | 98.40 | ++---------------------------------------------------------------------------------------------------------------------------+------------------+ +| :config:`OntoNotes ` | 87.07 | ++---------------------------------------------------------------------------------------------------------------------------+------------------+ + - :doc:`Slot filling components ` Based on fuzzy Levenshtein search to extract normalized slot values from text. The components either rely on NER results or perform needle in haystack search. 
++---------------------------------------------------------------------------------------------------------------------------+------------------+ +| Dataset | Slots Accuracy | ++---------------------------------------------------------------------------------------------------------------------------+------------------+ +| :config:`DSTC 2 ` | 98.85 | ++---------------------------------------------------------------------------------------------------------------------------+------------------+ + - :doc:`Classification component ` @@ -23,42 +39,179 @@ Component for classification tasks (intents, sentiment, etc) on word-level. Shal BiLSTM with self-attention and other models are presented. The model also allows multilabel classification of texts. Several pre-trained models are available and presented in Table below. -========================================================================================== ========================================= ========================================= - Dataset Valid accuracy Test accuracy -========================================================================================== ========================================= ========================================= - :config:`DSTC 2 ` 0.8744 0.8801 - :config:`DSTC 2 ` 0.9682 0.9684 - :config:`SNIPS ` 0.8829 -- - :config:`InsultsKaggle ` 0.8757 0.7503 - :config:`AG News ` 0.8735 0.8859 - :config:`Twitter.mokoron ` 0.8021 (with smiles), 0.8008 (no\_smiles) 0.7949 (with smiles), 0.7943 (no\_smiles) -========================================================================================== ========================================= ========================================= ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| Dataset | Model | Task | Lang | Metric | Valid | Test | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `DSTC 2`_ | :config:`DSTC 2 on DSTC 2 embeddings ` | 28 intents | En | Accuracy | 0.8554 | 0.8658 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `DSTC 2`_ | :config:`DSTC 2 on Wiki embeddings ` | 28 intents | En | Accuracy | 0.9659 | 0.9659 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `SNIPS-2017`_ | :config:`SNIPS on DSTC 2 embeddings ` | 7 intents | En | F1 | 0.8821 | -- | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `SNIPS-2017`_ | :config:`SNIPS on Wiki embeddings ` | 7 intents | En | F1 | 0.9852 | -- | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `Insults`_ | :config:`InsultsKaggle on Reddit embeddings ` | Insult detection | En | ROC-AUC | 0.9287 | 0.8602 | 
++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +| `AG News`_ | :config:`AG News on Wiki embeddings ` | 5 topics | En | Accuracy | 0.8735 | 0.8859 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +|`Twitter mokoron`_ | :config:`Twitter on RuWiki+Lenta embeddings without any preprocessing ` | Sentiment | Ru | Accuracy | 0.9968 | 0.9971 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +|`Twitter mokoron`_ | :config:`Twitter on RuWiki+Lenta embeddings with preprocessing ` | Sentiment | Ru | Accuracy | 0.7944 | 0.7879 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ +|`RuSentiment`_ | :config:`RuSentiment on RuWiki+Lenta embeddings ` | Sentiment | Ru | F1 | 0.7843 | 0.6556 | ++-------------------+--------------------------------------------------------------------------------------------------------------+------------------+------+----------+--------+--------+ + +.. _`DSTC 2`: http://camdial.org/~mh521/dstc/ +.. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines +.. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary +.. _`AG News`: https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html +.. _`Twitter mokoron`: http://study.mokoron.com/ +.. _`RuSentiment`: http://text-machine.cs.uml.edu/projects/rusentiment/ + + +As no one had published intent recognition for DSTC-2 data, the +comparison of the presented model is given on **SNIPS** dataset. The +evaluation of model scores was conducted in the same way as in [3] to +compare with the results from the report of the authors of the dataset. +The results were achieved with tuning of parameters and embeddings +trained on Reddit dataset. 
+ ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| Model | AddToPlaylist | BookRestaurant | GetWheather | PlayMusic | RateBook | SearchCreativeWork | SearchScreeningEvent | ++========================+=================+==================+===============+==============+==============+======================+========================+ +| api.ai | 0.9931 | 0.9949 | 0.9935 | 0.9811 | 0.9992 | 0.9659 | 0.9801 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| ibm.watson | 0.9931 | 0.9950 | 0.9950 | 0.9822 | 0.9996 | 0.9643 | 0.9750 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| microsoft.luis | 0.9943 | 0.9935 | 0.9925 | 0.9815 | 0.9988 | 0.9620 | 0.9749 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| wit.ai | 0.9877 | 0.9913 | 0.9921 | 0.9766 | 0.9977 | 0.9458 | 0.9673 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| snips.ai | 0.9873 | 0.9921 | 0.9939 | 0.9729 | 0.9985 | 0.9455 | 0.9613 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| recast.ai | 0.9894 | 0.9943 | 0.9910 | 0.9660 | 0.9981 | 0.9424 | 0.9539 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| amazon.lex | 0.9930 | 0.9862 | 0.9825 | 0.9709 | 0.9981 | 0.9427 | 0.9581 | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +| Shallow-and-wide CNN | **0.9956** | **0.9973** | **0.9968** | **0.9871** | **0.9998** | **0.9752** | **0.9854** | ++------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ + + - :doc:`Goal-oriented bot ` Based on Hybrid Code Networks (HCNs) architecture from `Jason D. Williams, Kavosh Asadi, Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control with supervised and reinforcement learning – 2017 `__. It allows to predict responses in goal-oriented dialog. The model is -customizable: embeddings, slot filler and intent classifier can switched on and off on demand. +customizable: embeddings, slot filler and intent classifier can be switched on and off on demand. 
+Available pre-trained models: -- :doc:`Seq2seq goal-oriented bot ` ++------------------------------------------------------------------------------------------------+---------------------+--------------------+ +| Dataset & Model | Valid turn accuracy | Test turn accuracy | ++================================================================================================+=====================+====================+ +| :config:`DSTC2, bot with slot filler & intents ` | 0.5179 | 0.5125 | ++------------------------------------------------------------------------------------------------+---------------------+--------------------+ +| :config:`DSTC2, bot with slot filler & embeddings & attention ` | 0.5538 | 0.5551 | ++------------------------------------------------------------------------------------------------+---------------------+--------------------+ + +Other benchmarks on DSTC2 (can't be directly compared due to dataset :doc:`modifications `): -Dialogue agent predicts responses in a goal-oriented dialog and is able to handle multiple domains (pretrained bot -allows calendar scheduling, weather information retrieval, and point-of-interest navigation). The model is end-to-end -differentiable and does not need to explicitly model dialogue state or belief trackers. ++----------------------------------------------------+------------------------------+ +|             Dataset & Model                 | Test turn accuracy | ++====================================================+==============================+ +| DSTC2, Bordes and Weston (2016) |   0.411           | ++----------------------------------------------------+------------------------------+ +| DSTC2, Perez and Liu (2016)     |   0.487           | ++----------------------------------------------------+------------------------------+ +| DSTC2, Eric and Manning (2017)     |   0.480           | ++----------------------------------------------------+------------------------------+ +| DSTC2, Williams et al. (2017)   |   0.556           | ++----------------------------------------------------+------------------------------+ +- :doc:`Seq2seq goal-oriented bot ` + +Dialogue agent predicts responses in a goal-oriented dialog and is able to handle +multiple domains (pretrained bot allows calendar scheduling, weather information retrieval, +and point-of-interest navigation). The model is end-to-end differentiable and +does not need to explicitly model dialogue state or belief trackers. + +Comparison of deeppavlov pretrained model with others: + ++------------------------------------------------------+------------------+-----------------+ +| Dataset & Model | Valid BLEU | Test BLEU | ++======================================================+==================+=================+ +| :config:`Kvret, KvretNet ` | 0.1319 | **0.1328** | ++------------------------------------------------------+------------------+-----------------+ +| Kvret, KvretNet, Mihail Eric et al. (2017) | -- | **0.132** | ++------------------------------------------------------+------------------+-----------------+ +| Kvret, CopyNet, Mihail Eric et al. (2017) | -- | 0.110 | ++------------------------------------------------------+------------------+-----------------+ +| Kvret, Attn Seq2Seq, Mihail Eric et al. (2017) | -- | 0.102 | ++------------------------------------------------------+------------------+-----------------+ +| Kvret, Rule-based, Mihail Eric et al. 
(2017) | -- | 0.066 | ++------------------------------------------------------+------------------+-----------------+ + - :doc:`Automatic spelling correction component ` Pipelines that use candidates search in a static dictionary and an ARPA language model to correct spelling errors. +Compariosn on the `test set `__ for the `SpellRuEval +competition `__ +on Automatic Spelling Correction for Russian: + ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| Correction method | Precision | Recall | F-measure | Speed (sentences/s) | ++=========================================================================================+===========+========+===========+=====================+ +| Yandex.Speller | 83.09 | 59.86 | 69.59 | 5. | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| :config:`Damerau Levenshtein 1 + lm` | 53.26 | 53.74 | 53.50 | 29.3 | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| :config:`Brill Moore top 4 + lm` | 51.92 | 53.94 | 52.91 | 0.6 | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| JamSpell | 44.57 | 35.69 | 39.64 | 136.2 | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| :config:`Brill Moore top 1 ` | 41.29 | 37.26 | 39.17 | 2.4 | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ +| Hunspell | 30.30 | 34.02 | 32.06 | 20.3 | ++-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+ + + - :doc:`Ranking component ` Based on `LSTM-based deep learning models for non-factoid answer selection `__. The model performs ranking of responses or contexts from some database by their relevance for the given context. +Available pre-trained model(s): + ++-------------------+-------------------------------------------------------------+-----------------------+------------------+ +| Dataset | Model config | Validation (Recall@1) | Test1 (Recall@1) | ++-------------------+-------------------------------------------------------------+-----------------------+------------------+ +| `InsuranceQA V1`_ | :config:`ranking_insurance `| 67.6 | 67.6 | ++-------------------+-------------------------------------------------------------+-----------------------+------------------+ + +.. 
_`InsuranceQA V1`: https://github.com/shuzi/insuranceQA + +Comparison with other models on the `InsuranceQA V1 `__: + ++---------------------------------------------------------------+-------------------------+--------------------+ +| Model | Validation (Recall@1) | Test1 (Recall@1) | ++===============================================================+=========================+====================+ +| `Architecture II (HLQA(200) CNNQA(4000) 1-MaxPooling Tanh)`_ | 61.8 | 62.8 | ++---------------------------------------------------------------+-------------------------+--------------------+ +| `QA-LSTM basic-model(max pooling)`_ | 64.3 | 63.1 | ++---------------------------------------------------------------+-------------------------+--------------------+ +| :config:`ranking_insurance ` | **67.6** | **67.6** | ++---------------------------------------------------------------+-------------------------+--------------------+ + +.. _`Architecture II (HLQA(200) CNNQA(4000) 1-MaxPooling Tanh)`: https://arxiv.org/pdf/1508.01585.pdf +.. _`QA-LSTM basic-model(max pooling)`: https://arxiv.org/pdf/1511.04108.pdf + - :doc:`Question Answering component ` @@ -66,12 +219,43 @@ Based on `R-NET: Machine Reading Comprehension with Self-matching Networks `__. The model solves the task of looking for an answer on a question in a given context (`SQuAD `__ task format). ++---------------+-----------------------------------------------------+----------------+-----------------+ +| Dataset | Model config | EM (dev) | F-1 (dev) | ++---------------+-----------------------------------------------------+----------------+-----------------+ +| `SQuAD-v1.1`_ | :config:`squad ` | 71.41 | 80.26 | ++---------------+-----------------------------------------------------+----------------+-----------------+ +| SDSJ Task B | :config:`squad_ru ` | 60.58 | 80.22 | ++---------------+-----------------------------------------------------+----------------+-----------------+ + +.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250 - :doc:`Morphological tagging component ` Based on character-based approach to morphological tagging `Heigold et al., 2017. An extensive empirical evaluation of character-based morphological tagging for 14 languages `__. A state-of-the-art -model for Russian and several other languages. Model assigns morphological tags in UD format to sequences of words. +model for Russian and several other languages. Model takes as input tokenized sentences and outputs the corresponding +sequence of morphological labels in `UD format `__. The table below +contains word and sentence accuracy on UD2.0 datasets. + ++-----------------+------------------------------------+---------------+----------------+ +| Dataset | Model | Word accuracy | Sent. accuracy | ++-----------------+------------------------------------+---------------+----------------+ +| `UD2.0 Russian`_|`UD Pipe 1.2`_ (Straka et al., 2017)| 93.57 | 43.04 | ++ +------------------------------------+---------------+----------------+ +| |`Basic model`_ | 95.17 | 50.58 | ++ +------------------------------------+---------------+----------------+ +| |`Pymorphy-enhanced model`_ | 96.23 | 58.00 | ++-----------------+------------------------------------+---------------+----------------+ + +.. _`UD2.0 Russian`: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1983 +.. _`UD Pipe 1.2`: http://ufal.mff.cuni.cz/udpipe +.. _`Basic model`: :config: +.. 
_`Pymorphy-enhanced model`: :config: + +- :doc:`Frequently Asked Questions (FAQ) component ` + +Set of pipelines for the FAQ task: classify an incoming question into a set of known questions and return a prepared answer. +You can build different pipelines based on tf-idf, weighted fasttext, cosine similarity, and logistic regression. Skills @@ -83,6 +267,13 @@ An open domain question answering skill. The skill accepts free-form questions a based on its Wikipedia knowledge. ++------------------------------------------------------+-----------------------+--------+ +| Dataset | Wiki dump | F1 | ++------------------------------------------------------+-----------------------+--------+ +| :config:`SQuAD (dev) ` | enwiki (2018-02-11) | 28.0 | ++------------------------------------------------------+-----------------------+--------+ + + Parameters evolution -------------------- @@ -123,7 +314,7 @@ Examples of some components ``python -m deeppavlov riseapi deeppavlov/configs/ner/slotfill_dstc2.json -d`` - Predict intents on every line in a file: - ``python -m deeppavlov predict deeppavlov/configs/intents/intents_snips.json -d --batch-size 15 < /data/in.txt > /data/out.txt`` + ``python -m deeppavlov predict deeppavlov/configs/classifiers/intents_snips.json -d --batch-size 15 < /data/in.txt > /data/out.txt`` View `video demo `__ of deployment of a diff --git a/docs/intro/parameters_evolution.rst b/docs/intro/parameters_evolution.rst index ffbba0767a..bc4ffde36b 100644 --- a/docs/intro/parameters_evolution.rst +++ b/docs/intro/parameters_evolution.rst @@ -66,7 +66,7 @@ Evolution process can be described in the following way: - ``{"evolve_range": [min_value, max_value], "discrete": true}`` - discrete values uniformly distributed on the following interval, - ``{"evolve_bool": true}`` - bool values, -- ``{"evolve_choice": true, "values": [value_0, ..., value_n]}`` - +- ``{"evolve_choice": [value_0, ..., value_n]}`` - values uniformly sampled from the given values. - Choose the main model in the pipe being evolved. Find or add extra @@ -97,10 +97,10 @@ Example :: cd deeppavlov - python deep.py download configs/intents/intents_snips.json + python deep.py download configs/classifiers/intents_snips.json - To evolve the model run the following command providing corresponding - name of the config file (see above): + name of the config file (see above) :config:`intents_dstc2.json `: :: diff --git a/docs/intro/tutorials.rst b/docs/intro/tutorials.rst index f1c5c51901..5c50059de3 100644 --- a/docs/intro/tutorials.rst +++ b/docs/intro/tutorials.rst @@ -51,3 +51,10 @@ Chit-chat bot with DeepPavlov Implement in DeepPavlov a sequence-to-sequence encoder-decoder model with attention mechanism and teacher forcing for chit-chat. + +FAQ with DeepPavlov +------------------- + +`Jupyter notebook `__ + +Implement an FAQ model in DeepPavlov based on sentence vectorizers and classifiers. diff --git a/docs/skills/go_bot.rst b/docs/skills/go_bot.rst index 953189b0bd..ab3ca9d03c 100644 --- a/docs/skills/go_bot.rst +++ b/docs/skills/go_bot.rst @@ -60,7 +60,7 @@ Requirements - config :config:`configs/ner/slotfill_dstc2.json ` is recommended 2. (*optional, but recommended*) pretrained intents classifier model - - config :config:`configs/intents/intents_dstc2_big.json ` is recommended + - config :config:`configs/classifiers/intents_dstc2_big.json ` is recommended 3.
(*optional*) any sentence (word) embeddings for english - fasttext embeddings can be downloaded @@ -297,11 +297,11 @@ Scores for different modifications of our bot model: +-----------------------------------------------+----------------------------------------------------------------------+----------------------------+ | bot with slot filler & fasttext embeddings | | 0.5317 | +-----------------------------------------------+----------------------------------------------------------------------+----------------------------+ -| bot with slot filler & intents | :config:`gobot_dstc2.json ` | 0.5113 | +| bot with slot filler & intents | :config:`gobot_dstc2.json ` | 0.5125 | +-----------------------------------------------+----------------------------------------------------------------------+----------------------------+ | bot with slot filler & intents & embeddings | | 0.5145 | +-----------------------------------------------+----------------------------------------------------------------------+----------------------------+ -| bot with slot filler & embeddings & attention | :config:`gobot_dstc2_best.json ` | **0.5525** | +| bot with slot filler & embeddings & attention | :config:`gobot_dstc2_best.json ` | **0.5551** | +-----------------------------------------------+----------------------------------------------------------------------+----------------------------+ There is another modification of DSTC2 dataset called dialog babi Task6 @@ -349,4 +349,4 @@ Sequence-to-Sequence Architecture Gives Good Performance on Task-Oriented Dialogue" - 2017 `_ -.. |alt text| image:: ../_static/diagram.png +.. |alt text| image:: ../_static/gobot_diagram.png diff --git a/docs/skills/odqa.rst b/docs/skills/odqa.rst index b32628fd39..fc7521c422 100644 --- a/docs/skills/odqa.rst +++ b/docs/skills/odqa.rst @@ -80,12 +80,17 @@ Comparison Scores for **ODQA** skill: -+-------------------------------------------------+------------------------------------------------------------------+-----------------------+------------+ -| Skill | Config | Ranker Recall (top 5) | Reader f1 | -+=================================================+==================================================================+=======================+============+ -| **ODQA English** | :config:`en_odqa_infer_wiki.json ` | 0.756 | 0.257 | -+-------------------------------------------------+------------------------------------------------------------------+-----------------------+------------+ ++-----------------------------------------------------+----------------+-----------------------+--------+------+ +| Model | Dataset | Wiki dump | F1 | EM | ++-----------------------------------------------------+----------------+-----------------------+--------+------+ +|:config:`DeepPavlov ` | SQuAD (dev) | enwiki (2018-02-11) | 28.0 | \- | ++-----------------------------------------------------+----------------+-----------------------+--------+------+ +|`DrQA`_ | SQuAD (dev) | enwiki (2016-12-21) | \- | 27.1 | ++-----------------------------------------------------+----------------+-----------------------+--------+------+ + + +EM stands for "exact-match accuracy". References ========== diff --git a/docs/skills/seq2seq_go_bot.rst b/docs/skills/seq2seq_go_bot.rst index 7e3b03e336..43f2a52cf3 100644 --- a/docs/skills/seq2seq_go_bot.rst +++ b/docs/skills/seq2seq_go_bot.rst @@ -5,6 +5,16 @@ A goal-oriented bot based on an RNN that encodes the user utterance and generates a response in a sequence-to-sequence manner.
The network architecture is similar to `[2] <#references>`__. +|alt text| **Diagram 1.** Key-value retrieval network. +For each time-step of decoding, the cell state is used to compute +an attention over the encoder states and a separate attention +over the key of each entry in the KB. The attentions over the encoder +are used to generate a context vector which is combined with the cell state to +get a distribution over the normal vocabulary. +The attentions over the keys of the KB become the logits +for their associated values and are separate entries in a +now augmented vocabulary that we argmax over. + The dialogue agent is able to handle multi-domain data. The model is end-to-end differentiable and does not need to explicitly model dialogue state or belief trackers. @@ -21,27 +31,86 @@ For more info on the dataset see :class:`~deeppavlov.dataset_readers.kvret_reade Here is a simple example of interaction with a trained dialogue bot (available for download): -:: - - x_text:: Where's the nearest parking garage? - >> the nearest is dish parking at dish_parking_address. is that okay? - x_text:: Yes, please set directions via a route that avoids all heavy traffic if possible. - >> it looks like there is a road block being reported on the route but i will still find the quickest route to dish_parking_address. - x_text:: Thanks so much for your help. - >> you're welcome. have a good day. end_of_dialogue - - x_text:: Will it be warm in Camarillo over the next 2 days? - >> no, it's not gon na be warm in camarillo over the next 2 days +.. code:: text + + x_text:: Where is the nearest gas station? + dialog_id:: 2b77c100-0fec-426a-a483-04ac03763776 + >> chevron is the nearest gas station at 783 arcadia pl. + x_text:: How far is chevron? + dialog_id:: 2b77c100-0fec-426a-a483-04ac03763776 + >> chevron is the closest at 783 arcadia pl. + x_text:: Ok, is there traffic? + dialog_id:: 2b77c100-0fec-426a-a483-04ac03763776 + >> there is heavy traffic on the route to chevron. + x_text:: Thanks + dialog_id:: 2b77c100-0fec-426a-a483-04ac03763776 + >> you're welcome. end_of_dialogue + + x_text:: Will it be rainy on tuesday in Camarillo? + dialog_id:: 3a3f05ec-a5db-4ba5-add8-4ffe02758c35 + >> it will not be cloudy in camarillo on tuesday. x_text:: Thank you car! + dialog_id:: 3a3f05ec-a5db-4ba5-add8-4ffe02758c35 >> you're welcome! end_of_dialogue - x_text:: What day is my lab appointment, and who all will be there? - >> your lab appointment is on lab_appointment_date at lab_appointment_time with lab_appointment_party. - x_text:: Thanks! - >> you're welcome. + x_text:: When is my tennis activity? + dialog_id:: 0b144470-faa8-4e71-98c7-a9b645fffab0 + >> your tennis activity is on sunday at 4 pm. + x_text:: With whom is it? + dialog_id:: 0b144470-faa8-4e71-98c7-a9b645fffab0 + >> your tennis activity on sunday at 4 pm is with your father. + x_text:: Ok, good day to you + dialog_id:: 0b144470-faa8-4e71-98c7-a9b645fffab0 + >> you are welcome. end_of_dialogue + x_text:: exit + +For every ``dialog_id`` that is present in the train, dev or test dataset there exists a list of entries in the knowledge base. The entries are used by the model for response prediction. To see available dialog ids and the corresponding knowledge base items, see the `kvret_dataset` files. + +In the dialogs above, dialog with ``dialog_id="2b77c100-0fec-426a-a483-04ac03763776"`` had entry: + +..
code:: json + + { + "distance": "5 miles", + "traffic_info": "heavy traffic", + "poi_type": "gas station", + "address": "783 Arcadia Pl", + "poi": "Chevron" + } + + +Dialog with ``dialog_id="3a3f05ec-a5db-4ba5-add8-4ffe02758c35"`` had entry: + +.. code:: json + + { + "monday": "clear skies, low of 90F, high of 100F", + "tuesday": "cloudy, low of 90F, high of 100F", + "friday": "overcast, low of 90F, high of 100F", + "wednesday": "windy, low of 30F, high of 50F", + "thursday": "snow, low of 90F, high of 100F", + "sunday": "rain, low of 60F, high of 70F", + "location": "camarillo", + "saturday": "overcast, low of 60F, high of 80F", + "today": "monday" + } + +Dialog with ``dialog_id="0b144470-faa8-4e71-98c7-a9b645fffab0"`` had entry: + +.. code:: json + + { + "room": "-", + "agenda": "-", + "time": "4pm", + "date": "sunday", + "party": "father", + "event": "tennis activity" + } + Configs -^^^^^^^ +------- Config :config:`configs/seq2seq_go_bot/bot_kvret_infer.json ` is recommended to be used for inference (interaction) of a pretrained model. @@ -86,10 +155,29 @@ To infer from a pretrained model with config path equal to ````: utterance = input(':: ') Config parameters: -^^^^^^^^^^^^^^^^^^ +------------------ To configure your own pipelines that contain a ``"seq2seq_go_bot"`` component, refer to documentation for :class:`~deeppavlov.models.seq2seq_go_bot.bot.Seq2SeqGoalOrientedBot` and :class:`~deeppavlov.models.seq2seq_go_bot.network.Seq2SeqGoalOrientedBotNetwork` classes. +Comparison +^^^^^^^^^^ + +Comparison of BLEU scores on the test set of the Kvret dataset: + ++------------------------------------------------------+------------------+ +| Model | Test BLEU | ++======================================================+==================+ +| DeepPavlov implementation of KV Retrieval Net | **0.132** | ++------------------------------------------------------+------------------+ +| KV Retrieval Net from `[2] <#references>`__ | **0.132** | ++------------------------------------------------------+------------------+ +| Copy Net from `[2] <#references>`__ | 0.110 | ++------------------------------------------------------+------------------+ +| Attn. Seq2Seq from `[2] <#references>`__ | 0.102 | ++------------------------------------------------------+------------------+ +| Rule-Based from `[2] <#references>`__ | 0.066 | ++------------------------------------------------------+------------------+ + References ---------- @@ -97,3 +185,4 @@ References [2] `Mihail Eric, Lakshmi Krishnan, Francois Charette, and Christopher D. Manning, "Key-Value Retrieval Networks for Task-Oriented Dialogue – 2017 `_ +..
|alt text| image:: ../_static/kvret_diagram.png diff --git a/examples/Keras_classification_config_description.ipynb b/examples/Keras_classification_config_description.ipynb index fdcafa3c52..5b278de6d7 100644 --- a/examples/Keras_classification_config_description.ipynb +++ b/examples/Keras_classification_config_description.ipynb @@ -25,7 +25,7 @@ "## Firstly, please, download dataset, embedding file and pre-trained model for considered task via command in terminal:\n", "\n", "```\n", - "python -m deeppavlov download configs/sentiment/insults_kaggle.json\n", + "python -m deeppavlov download configs/classifiers/insults_kaggle.json\n", "```" ] }, @@ -38,23 +38,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", + "import json\n", "\n", "from deeppavlov.core.common.file import read_json, save_json" ] }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": true - }, + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ "def print_json(data):\n", @@ -70,18 +67,16 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ - "config = read_json(\"../../configs/sentiment/insults_kaggle.json\")" + "config = read_json(\"../deeppavlov/configs/classifiers/insults_kaggle.json\")" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -129,6 +124,15 @@ " \"name\": \"dirty_comments_preprocessor\"\n", " },\n", " {\n", + " \"in\": \"x_prep\",\n", + " \"out\": \"x_tok\",\n", + " \"id\": \"my_tokenizer\",\n", + " \"name\": \"nltk_tokenizer\",\n", + " \"tokenizer\": \"wordpunct_tokenize\"\n", + " },\n", + " {\n", + " \"in\": \"x_tok\",\n", + " \"out\": \"x_emb\",\n", " \"id\": \"my_embedder\",\n", " \"name\": \"fasttext\",\n", " \"save_path\": \"embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", @@ -136,13 +140,8 @@ " \"dim\": 300\n", " },\n", " {\n", - " \"id\": \"my_tokenizer\",\n", - " \"name\": \"nltk_tokenizer\",\n", - " \"tokenizer\": \"wordpunct_tokenize\"\n", - " },\n", - " {\n", " \"in\": [\n", - " \"x_prep\"\n", + " \"x_emb\"\n", " ],\n", " \"in_y\": [\n", " \"y\"\n", @@ -152,9 +151,10 @@ " \"y_probas_dict\"\n", " ],\n", " \"main\": true,\n", - " \"name\": \"intent_model\",\n", - " \"save_path\": \"sentiment/insults_kaggle_v0\",\n", - " \"load_path\": \"sentiment/insults_kaggle_v0\",\n", + " \"name\": \"keras_classification_model\",\n", + " \"save_path\": \"classifiers/insults_kaggle_v0\",\n", + " \"load_path\": \"classifiers/insults_kaggle_v0\",\n", + " \"embedding_size\": \"#my_embedder.dim\",\n", " \"classes\": \"#classes_vocab.keys()\",\n", " \"kernel_sizes_cnn\": [\n", " 1,\n", @@ -173,9 +173,7 @@ " \"coef_reg_den\": 0.01,\n", " \"dropout_rate\": 0.5,\n", " \"dense_size\": 100,\n", - " \"model_name\": \"cnn_model\",\n", - " \"embedder\": \"#my_embedder\",\n", - " \"tokenizer\": \"#my_tokenizer\"\n", + " \"model_name\": \"cnn_model\"\n", " }\n", " ],\n", " \"out\": [\n", @@ -199,13 +197,17 @@ " \"test_best\": true\n", " },\n", " \"metadata\": {\n", + " \"requirements\": [\n", + " \"../dp_requirements/tf.txt\",\n", + " \"../dp_requirements/fasttext.txt\"\n", + " ],\n", " \"labels\": {\n", " \"telegram_utils\": \"IntentModel\",\n", " \"server_utils\": \"KerasIntentModel\"\n", " },\n", " \"download\": [\n", " 
\"http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz\",\n", - " \"http://files.deeppavlov.ai/deeppavlov_data/sentiment.tar.gz\",\n", + " \"http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz\",\n", " \"http://files.deeppavlov.ai/datasets/insults_data.tar.gz\",\n", " {\n", " \"url\": \"http://files.deeppavlov.ai/embeddings/reddit_fastText/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", @@ -232,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -265,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -332,13 +334,13 @@ "4 \"C\\xe1c b\\u1ea1n xu\\u1ed1ng \\u0111\\u01b0\\u1edd... Not Insult" ] }, - "execution_count": 51, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pd.read_csv(\"../../../download/insults_data/train.csv\").head()" + "pd.read_csv(\"../download/insults_data/train.csv\").head()" ] }, { @@ -362,10 +364,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "config[\"dataset_reader\"][\"train\"] = \"train.csv\"\n", @@ -377,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -413,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -452,10 +452,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "config[\"dataset_iterator\"][\"shuffle\"] = True\n", @@ -468,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -508,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -545,6 +543,15 @@ " \"name\": \"dirty_comments_preprocessor\"\n", " },\n", " {\n", + " \"in\": \"x_prep\",\n", + " \"out\": \"x_tok\",\n", + " \"id\": \"my_tokenizer\",\n", + " \"name\": \"nltk_tokenizer\",\n", + " \"tokenizer\": \"wordpunct_tokenize\"\n", + " },\n", + " {\n", + " \"in\": \"x_tok\",\n", + " \"out\": \"x_emb\",\n", " \"id\": \"my_embedder\",\n", " \"name\": \"fasttext\",\n", " \"save_path\": \"embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", @@ -552,13 +559,8 @@ " \"dim\": 300\n", " },\n", " {\n", - " \"id\": \"my_tokenizer\",\n", - " \"name\": \"nltk_tokenizer\",\n", - " \"tokenizer\": \"wordpunct_tokenize\"\n", - " },\n", - " {\n", " \"in\": [\n", - " \"x_prep\"\n", + " \"x_emb\"\n", " ],\n", " \"in_y\": [\n", " \"y\"\n", @@ -568,9 +570,10 @@ " \"y_probas_dict\"\n", " ],\n", " \"main\": true,\n", - " \"name\": \"intent_model\",\n", - " \"save_path\": \"sentiment/insults_kaggle_v0\",\n", - " \"load_path\": \"sentiment/insults_kaggle_v0\",\n", + " \"name\": \"keras_classification_model\",\n", + " \"save_path\": \"classifiers/insults_kaggle_v0\",\n", + " \"load_path\": \"classifiers/insults_kaggle_v0\",\n", + " \"embedding_size\": \"#my_embedder.dim\",\n", " \"classes\": \"#classes_vocab.keys()\",\n", " \"kernel_sizes_cnn\": [\n", " 1,\n", @@ -589,9 +592,7 @@ " \"coef_reg_den\": 0.01,\n", " \"dropout_rate\": 0.5,\n", " \"dense_size\": 100,\n", - " \"model_name\": \"cnn_model\",\n", - " \"embedder\": \"#my_embedder\",\n", - " \"tokenizer\": \"#my_tokenizer\"\n", + " \"model_name\": 
\"cnn_model\"\n", " }\n", " ],\n", " \"out\": [\n", @@ -618,7 +619,7 @@ "\n", "* Every element in pipe should have specified `name` that is a registered name in DeepPavlov.\n", "\n", - "* For further usage parameter `id` can be specified. For example, tokenizer should be given to `KerasModel` during initialization of model. Therefore, one should place a tokenizer element before model, specify `\"id\": \"my_tokenizer\"` and then refer to it `\"tokenizer\": \"#my_tokenizer\"` in model parameters.\n", + "* For further usage parameter `id` can be specified. \n", "\n", "* If element of pipe processes data, `in` and `out` determine the order of data flow.\n", "\n", @@ -636,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -681,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -727,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -735,11 +736,11 @@ "output_type": "stream", "text": [ "{\n", - " \"id\": \"my_embedder\",\n", - " \"name\": \"fasttext\",\n", - " \"save_path\": \"embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", - " \"load_path\": \"embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", - " \"dim\": 300\n", + " \"in\": \"x_prep\",\n", + " \"out\": \"x_tok\",\n", + " \"id\": \"my_tokenizer\",\n", + " \"name\": \"nltk_tokenizer\",\n", + " \"tokenizer\": \"wordpunct_tokenize\"\n", "}\n" ] } @@ -750,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -758,9 +759,13 @@ "output_type": "stream", "text": [ "{\n", - " \"id\": \"my_tokenizer\",\n", - " \"name\": \"nltk_tokenizer\",\n", - " \"tokenizer\": \"wordpunct_tokenize\"\n", + " \"in\": \"x_tok\",\n", + " \"out\": \"x_emb\",\n", + " \"id\": \"my_embedder\",\n", + " \"name\": \"fasttext\",\n", + " \"save_path\": \"embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", + " \"load_path\": \"embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", + " \"dim\": 300\n", "}\n" ] } @@ -773,6 +778,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "* `in` and `out` denote names and structure of data transferred in pipeline. 
\n", "* `id` is a user-denoted name for further references in config.\n", "* `name` is a registered name of embedder/tokenizer in DeepPavlov.\n", "* `save_path` and `load_path` denote where to load pre-trained embedder/tokenizer from or where to save trained embedder/tokenizer.\n", @@ -794,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -803,7 +809,7 @@ "text": [ "{\n", " \"in\": [\n", - " \"x_prep\"\n", + " \"x_emb\"\n", " ],\n", " \"in_y\": [\n", " \"y\"\n", @@ -813,9 +819,10 @@ " \"y_probas_dict\"\n", " ],\n", " \"main\": true,\n", - " \"name\": \"intent_model\",\n", - " \"save_path\": \"sentiment/insults_kaggle_v0\",\n", - " \"load_path\": \"sentiment/insults_kaggle_v0\",\n", + " \"name\": \"keras_classification_model\",\n", + " \"save_path\": \"classifiers/insults_kaggle_v0\",\n", + " \"load_path\": \"classifiers/insults_kaggle_v0\",\n", + " \"embedding_size\": \"#my_embedder.dim\",\n", " \"classes\": \"#classes_vocab.keys()\",\n", " \"kernel_sizes_cnn\": [\n", " 1,\n", @@ -834,9 +841,7 @@ " \"coef_reg_den\": 0.01,\n", " \"dropout_rate\": 0.5,\n", " \"dense_size\": 100,\n", - " \"model_name\": \"cnn_model\",\n", - " \"embedder\": \"#my_embedder\",\n", - " \"tokenizer\": \"#my_tokenizer\"\n", + " \"model_name\": \"cnn_model\"\n", "}\n" ] } @@ -852,6 +857,7 @@ "* `in`, `in_y` and `out` denote names and structure of data transferred in pipeline. DatasetIterator `basic_dataset_iterator` provides data sample as tuple of two elements (`x`, `y`): text and its labels. Then preprocessor processes `x` to `x_prep`, and exactly this `x_prep` is an input for the main model along with `y` labels. For each sample the main model provides tuple of two elements (`y_labels`, `y_probas_dict`) where `y_labels` is an array of predicted classes (which sample belongs with), `y_probas_dict` is a dictionary like {\"class_i\": probability_i}.\n", "* `name` is a registered name of model in DeepPavlov.\n", "* `save_path` and `load_path` denote where to load pre-trained model from or where to save trained model.\n", + "* `embedding_size` refers to embedder's dimension.\n", "* `classes` contains names of all the presented in the train dataset classes. In the considered case it is presented as a reference to method `keys()` applied to the vocabulary of labels (`id` is used to refer).\n", "* `model_name` is a method name of `KerasIntentModel` class. 
**Currently available methods** are `cnn_model`, `dcnn_model`, `cnn_model_max_and_aver_pool`, `bilstm_model`, `bilstm_bilstm_model`, `bilstm_cnn_model`, `cnn_bilstm_model`, `bilstm_self_add_attention_model`, `bilstm_self_mult_attention_model`, `bigru_model`.\n", "* `kernel_sizes_cnn`, `filters_cnn`, `dense_size`, `last_layer_activation`, `coef_reg_cnn`, `coef_reg_den`, `dropout_rate` are specific parameters for `cnn_model` method of `KerasIntentModel`.\n", @@ -874,7 +880,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -927,7 +933,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -935,13 +941,17 @@ "output_type": "stream", "text": [ "{\n", + " \"requirements\": [\n", + " \"../dp_requirements/tf.txt\",\n", + " \"../dp_requirements/fasttext.txt\"\n", + " ],\n", " \"labels\": {\n", " \"telegram_utils\": \"IntentModel\",\n", " \"server_utils\": \"KerasIntentModel\"\n", " },\n", " \"download\": [\n", " \"http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz\",\n", - " \"http://files.deeppavlov.ai/deeppavlov_data/sentiment.tar.gz\",\n", + " \"http://files.deeppavlov.ai/deeppavlov_data/classifiers.tar.gz\",\n", " \"http://files.deeppavlov.ai/datasets/insults_data.tar.gz\",\n", " {\n", " \"url\": \"http://files.deeppavlov.ai/embeddings/reddit_fastText/wordpunct_tok_reddit_comments_2017_11_300.bin\",\n", diff --git a/examples/tutorials/faq_tutorial_tfidf_logreg.ipynb b/examples/tutorials/faq_tutorial_tfidf_logreg.ipynb new file mode 100644 index 0000000000..7fe7dc4233 --- /dev/null +++ b/examples/tutorials/faq_tutorial_tfidf_logreg.ipynb @@ -0,0 +1,237 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FAQ task" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "FAQ consists of questions:\n", + "1. What is preparatory course?\n", + " - Preparatory course is a special educational program lasting 1 academic year (7-10 months), where students learn Russian and special disciplines (mathematics and physics).\n", + "2. What is invitation letter?\n", + " - The invitation is official document which is prepared by Ministry of Internal Affairs of Russian Federation. It confirms that the student is admitted to this university.\n", + "3. ...\n", + "\n", + "\n", + "Now you have questions from users and you need to answer, for example:\n", + "\n", + ":: Could I work while studying?\n", + "> It allows the student to find well paid work and to start climbing up on a career ladder right after completing university course. Students of the Russian universities are obliged to attend all lectures as only the knowledge gained during classroom occupations allows students to become the effective and knowing professionals. \n", + "\n", + "\n", + "First of all we need train dataset of FAQ.\n", + "
\n", + "As example, let's consider MIPT FAQ for entrants - https://mipt.ru/english/edu/faqs/\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import deeppavlov\n", + "from deeppavlov.models.classifiers.logreg_classifier import LogregClassifier\n", + "from deeppavlov.models.vectorizers.tfidf_vectorizer import TfIdfVectorizer\n", + "from deeppavlov.models.tokenizers.ru_tokenizer import RussianTokenizer\n", + "from deeppavlov.dataset_readers.faq_reader import FaqDatasetReader\n", + "from deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n", + "from deeppavlov.core.data.utils import download_decompress" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Read FAQ data\n", + "reader = FaqDatasetReader()\n", + "faq_data = reader.read(data_url='http://files.deeppavlov.ai/faq/mipt/faq.csv', x_col_name='Question', y_col_name='Answer')\n", + "iterator = DataLearningIterator(data=faq_data)\n", + "\n", + "x,y = iterator.get_instances()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train FAQ" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's consider simple case for FAQ model (in the end you can find more complex pipeline models):\n", + "1. TF_IDF vectorizer on lemmatized questions\n", + "2. Logistic regression classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-08-31 13:48:55.488 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 16: Loading dictionaries from /home/andrey/v_envs/deep_pavlov_env/lib/python3.6/site-packages/pymorphy2_dicts/data\n", + "2018-08-31 13:48:55.526 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 20: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168\n" + ] + } + ], + "source": [ + "# create tokenizer\n", + "tokenizer = RussianTokenizer(lemmas=True)\n", + "# fit TF-IDF vectorizer on train FAQ dataset \n", + "vectorizer = TfIdfVectorizer(mode='train')\n", + "vectorizer.fit(x)\n", + "\n", + "# Now collect (x,y) pairs: x_train - vectorized question, y_train - answer from FAQ\n", + "x_train = vectorizer(tokenizer(x))\n", + "y_train = y \n", + "\n", + "# Let's use top 2 answers for each incoming questions (top_n param)\n", + "clf= LogregClassifier(mode='train', top_n=2, c=1000, penalty='l2', save_path='faq/tfidf_logreg_classifier_en_mipt_faq.pkl', load_path='faq/tfidf_logreg_classifier_en_mipt_faq.pkl')\n", + "clf.fit(x_train, y_train)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test FAQ" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "test_questions = ['Could you help me??', 'Could I work while studying?']\n", + "tokenized_test_questions = tokenizer(test_questions)\n", + "test_q_vectorized = vectorizer(tokenized_test_questions)\n", + "answers = clf(test_q_vectorized)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have all output of FAQ model: answers and scores.\n", + "
\n", + "Answers:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Answers 0:\n", + "['If you have any problems you can address to Department of Foreign Students: +7 (495) 408-70-43 (Auditorium building, room 315).', 'Life insurance and health is obligatory for any foreign citizen who arrived to Russian Federation for study.']\n", + "\n", + "Answers 1:\n", + "['Russian education is one of the most qualitative and fundamental in the world. It allows the student to find well paid work and to start climbing up on a career ladder right after completing university course. Students of the Russian universities are obliged to attend all lectures as only the knowledge gained during classroom occupations allows students to become the effective and knowing professionals. Thus, there is an opportunity to work only after classes or during vacation on the weekend.', 'Life insurance and health is obligatory for any foreign citizen who arrived to Russian Federation for study.']\n", + "\n" + ] + } + ], + "source": [ + "for i, answer in enumerate(answers[0]):\n", + " print('Answers {}:\\n{}\\n'.format(i, answer))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see score for each answer (score: [0,1])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scores 0:[0.92, 0.01]\n", + "Scores 1:[0.8, 0.03]\n" + ] + } + ], + "source": [ + "for i, score in enumerate(answers[1]):\n", + " print('Scores {}:{}'.format(i, score))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Described model built in config - deeppavlov/configs/faq/tfidf_logreg_en_faq.json\n", + "\n", + "You can also combine different components to construct pipelines for FAQ task:\n", + "\n", + "Vectorizers:\n", + " - deeppavlov.core.models.vectorizers.TfIdfVectorizer\n", + " - deeppavlov.core.models.vectorizers.SentenceAvgW2vVectorizer\n", + " - deeppavlov.core.models.vectorizers.SentenceW2vVectorizerTfidfWeights\n", + "\n", + "Classifiers:\n", + " - deeppavlov.models.classifiers.logreg_classifier.LogregClassifier\n", + " - deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 4bb2f47230..c06bf34e77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -Cython==0.27.1 +Cython==0.28.5 overrides==1.9 numpy==1.14.5 pandas==0.23.1 diff --git a/tests/test_configs/intents/intents_snips_bigru.json b/tests/test_configs/classifiers/intents_snips_bigru.json similarity index 90% rename from tests/test_configs/intents/intents_snips_bigru.json rename to tests/test_configs/classifiers/intents_snips_bigru.json index 5e6011cdfb..eaf2cad29f 100644 --- a/tests/test_configs/intents/intents_snips_bigru.json +++ b/tests/test_configs/classifiers/intents_snips_bigru.json @@ -38,20 +38,24 
@@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -62,8 +66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_bigru", - "load_path": "intents/intent_cnn_snips_bigru", + "save_path": "classifiers/intent_cnn_snips_bigru", + "load_path": "classifiers/intent_cnn_snips_bigru", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm": 64, "confident_threshold": 0.5, @@ -77,9 +82,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "bigru_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "bigru_model" } ], "out": [ diff --git a/tests/test_configs/intents/intents_snips_bilstm.json b/tests/test_configs/classifiers/intents_snips_bilstm.json similarity index 89% rename from tests/test_configs/intents/intents_snips_bilstm.json rename to tests/test_configs/classifiers/intents_snips_bilstm.json index 4e0d11fdff..c750f83602 100644 --- a/tests/test_configs/intents/intents_snips_bilstm.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm.json @@ -38,20 +38,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -62,8 +66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_bistlm", - "load_path": "intents/intent_cnn_snips_bilstm", + "save_path": "classifiers/intent_cnn_snips_bistlm", + "load_path": "classifiers/intent_cnn_snips_bilstm", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm": 64, "confident_threshold": 0.5, @@ -77,9 +82,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "bilstm_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "bilstm_model" } ], "out": [ diff --git a/tests/test_configs/intents/intents_snips_bilstm_bilstm.json b/tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json similarity index 89% rename from tests/test_configs/intents/intents_snips_bilstm_bilstm.json rename to tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json index 5c2de7145f..7c57553211 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_bilstm.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json @@ -38,20 +38,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": 
"embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -62,8 +66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_bistlm_bilstm", - "load_path": "intents/intent_cnn_snips_bilstm_bilstm", + "save_path": "classifiers/intent_cnn_snips_bistlm_bilstm", + "load_path": "classifiers/intent_cnn_snips_bilstm_bilstm", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm_1": 64, "units_lstm_2": 64, @@ -78,9 +83,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "bilstm_bilstm_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "bilstm_bilstm_model" } ], "out": [ diff --git a/tests/test_configs/intents/intents_snips_bilstm_cnn.json b/tests/test_configs/classifiers/intents_snips_bilstm_cnn.json similarity index 90% rename from tests/test_configs/intents/intents_snips_bilstm_cnn.json rename to tests/test_configs/classifiers/intents_snips_bilstm_cnn.json index f1b1d6ea65..dc26a005fd 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_cnn.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_cnn.json @@ -38,20 +38,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -62,8 +66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_bistlm_cnn", - "load_path": "intents/intent_cnn_snips_bilstm_cnn", + "save_path": "classifiers/intent_cnn_snips_bistlm_cnn", + "load_path": "classifiers/intent_cnn_snips_bilstm_cnn", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm": 64, "kernel_sizes_cnn": [ @@ -84,9 +89,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "bilstm_cnn_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "bilstm_cnn_model" } ], "out": [ diff --git a/tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json b/tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json similarity index 88% rename from tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json rename to tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json index 6b79e5a94a..7a5bfdcbb7 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json @@ -38,20 +38,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": 
[ "y" @@ -62,8 +66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_bilstm_self_add_attention", - "load_path": "intents/intent_cnn_snips_bilstm_self_add_attention", + "save_path": "classifiers/intent_cnn_snips_bilstm_self_add_attention", + "load_path": "classifiers/intent_cnn_snips_bilstm_self_add_attention", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm": 64, "self_att_hid": 64, @@ -79,9 +84,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "bilstm_self_add_attention_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "bilstm_self_add_attention_model" } ], "out": [ diff --git a/tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json b/tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json similarity index 88% rename from tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json rename to tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json index 2af8a059a5..c944eee463 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json @@ -38,20 +38,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -62,8 +66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_bilstm_self_mult_attention", - "load_path": "intents/intent_cnn_snips_bilstm_self_mult_attention", + "save_path": "classifiers/intent_cnn_snips_bilstm_self_mult_attention", + "load_path": "classifiers/intent_cnn_snips_bilstm_self_mult_attention", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm": 64, "self_att_hid": 64, @@ -79,9 +84,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "bilstm_self_mult_attention_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "bilstm_self_mult_attention_model" } ], "out": [ diff --git a/tests/test_configs/intents/intents_snips_cnn_bilstm.json b/tests/test_configs/classifiers/intents_snips_cnn_bilstm.json similarity index 90% rename from tests/test_configs/intents/intents_snips_cnn_bilstm.json rename to tests/test_configs/classifiers/intents_snips_cnn_bilstm.json index 97a0983216..e796aef822 100644 --- a/tests/test_configs/intents/intents_snips_cnn_bilstm.json +++ b/tests/test_configs/classifiers/intents_snips_cnn_bilstm.json @@ -38,20 +38,24 @@ "load_path": "vocabs/snips_classes.dict" }, { + "in": "x", + "out": "x_tok", + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": "x_tok", + "out": "x_emb", "id": "my_embedder", "name": "fasttext", "save_path": "embeddings/dstc2_fastText_model.bin", "load_path": "embeddings/dstc2_fastText_model.bin", "dim": 100 }, - { - "id": "my_tokenizer", - "name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, { "in": [ - "x" + "x_emb" ], "in_y": [ "y" @@ -62,8 
+66,9 @@ ], "main": true, "name": "keras_classification_model", - "save_path": "intents/intent_cnn_snips_cnn_bistlm", - "load_path": "intents/intent_cnn_snips_cnn_bilstm", + "save_path": "classifiers/intent_cnn_snips_cnn_bistlm", + "load_path": "classifiers/intent_cnn_snips_cnn_bilstm", + "embedding_size": "#my_embedder.dim", "classes": "#classes_vocab.keys()", "units_lstm": 64, "kernel_sizes_cnn": [ @@ -84,9 +89,7 @@ "dropout_rate": 0.5, "rec_dropout_rate": 0.5, "dense_size": 100, - "model_name": "cnn_bilstm_model", - "embedder": "#my_embedder", - "tokenizer": "#my_tokenizer" + "model_name": "cnn_bilstm_model" } ], "out": [ diff --git a/tests/test_configs/odqa/en_odqa_infer_wiki_test.json b/tests/test_configs/odqa/en_odqa_infer_wiki_test.json index e0070edc6e..6ebd9fed98 100644 --- a/tests/test_configs/odqa/en_odqa_infer_wiki_test.json +++ b/tests/test_configs/odqa/en_odqa_infer_wiki_test.json @@ -3,12 +3,8 @@ "in": [ "question_raw" ], - "in_y": [ - "ans_raw", - "ans_raw_start" - ], "out": [ - "ans_predicted" + "best_answer" ], "pipe": [ { @@ -49,18 +45,41 @@ "context_raw" ], "data_dir": "odqa", + "join_docs": false, "shuffle": false, "data_url": "http://files.deeppavlov.ai/datasets/wikipedia/wiki_test.db" }, { - "config_path": "../deeppavlov/configs/squad/squad.json", + "name": "document_chunker", "in": [ - "context_raw", - "question_raw" + "context_raw" + ], + "out": [ + "chunks" + ], + "flatten_result": true + }, + { + "name": "string_multiplier", + "in": [ + "question_raw", + "chunks" + ], + "out": [ + "questions" + ] + }, + { + "name": "logit_ranker", + "squad_model": { + "config_path": "../deeppavlov/configs/squad/squad.json" + }, + "in": [ + "chunks", + "questions" ], "out": [ - "ans_predicted", - "ans_start_predicted" + "best_answer" ] } ] diff --git a/tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json b/tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json index 91b09601c3..cb22244faf 100644 --- a/tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json +++ b/tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json @@ -13,6 +13,23 @@ "y" ], "pipe": [ + { + "name": "hashing_tfidf_vectorizer", + "id": "vectorizer", + "fit_on_batch": [ + "x" + ], + "save_path": "odqa/en_wiki_test_tfidf.npz", + "load_path": "odqa/en_wiki_test_tfidf.npz", + "tokenizer": { + "name": "stream_spacy_tokenizer", + "lemmas": true, + "ngram_range": [ + 1, + 2 + ] + } + }, { "name": "tfidf_ranker", "top_n": 5, @@ -23,25 +40,7 @@ "y", "score" ], - "fit_on_batch": [ - "x" - ], - "vectorizer": { - "name": "hashing_tfidf_vectorizer", - "fit_on_batch": [ - "x" - ], - "save_path": "odqa/en_wiki_test_tfidf.npz", - "load_path": "odqa/en_wiki_test_tfidf.npz", - "tokenizer": { - "name": "stream_spacy_tokenizer", - "lemmas": true, - "ngram_range": [ - 1, - 2 - ] - } - } + "vectorizer": "#vectorizer" } ] }, @@ -63,4 +62,4 @@ "http://files.deeppavlov.ai/deeppavlov_data/odqa_test.tar.gz" ] } -} \ No newline at end of file +} diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index c31d3e4f4f..b74f8ea69e 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -38,6 +38,13 @@ # Mapping from model name to config-model_dir-ispretrained and corresponding queries-response list. 
PARAMS = { + "faq": { + ("faq/tfidf_logreg_en_faq.json", "faq_tfidf_logreg_en", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("faq/tfidf_autofaq.json", "faq_tfidf_cos", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("faq/tfidf_logreg_autofaq.json", "faq_tfidf_logreg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("faq/fasttext_avg_autofaq.json", "faq_fasttext_avg", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("faq/fasttext_tfidf_autofaq.json", "faq_fasttext_tfidf", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK] + }, "spelling_correction": { ("spelling_correction/brillmoore_wikitypos_en.json", "error_model", ALL_MODES): [ @@ -46,7 +53,6 @@ ], ("spelling_correction/brillmoore_kartaslov_ru.json", "error_model", ALL_MODES): [ - ("преведствую", "приветствую"), ("я джва года дду эту игру", "я два года жду эту игру") ], ("spelling_correction/levenshtein_corrector_ru.json", "error_model", ('IP',)): @@ -60,31 +66,32 @@ ("go_bot/gobot_dstc2_best.json", "gobot_dstc2_best", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("go_bot/gobot_dstc2_minimal.json", "gobot_dstc2_minimal", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, - "intents": { - ("intents/intents_dstc2.json", "intents", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_dstc2_big.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] + "classifiers": { + ("classifiers/intents_dstc2.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_dstc2_big.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/insults_kaggle.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/sentiment_twitter.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/sentiment_twitter_preproc.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/topic_ag_news.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/rusentiment_cnn.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK] }, "snips": { - ("intents/intents_snips.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_bigru.json", "intents", ('TI')): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_bilstm.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_bilstm_bilstm.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_bilstm_cnn.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_bilstm_self_add_attention.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_bilstm_self_mult_attention.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_snips_cnn_bilstm.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] - }, - "sentiment": { - ("sentiment/insults_kaggle.json", "sentiment", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("sentiment/sentiment_twitter.json", "sentiment", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], - ("sentiment/sentiment_ag_news.json", "sentiment", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK] + ("classifiers/intents_snips.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_big.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_bigru.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_bilstm.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_bilstm_bilstm.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + 
("classifiers/intents_snips_bilstm_cnn.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_bilstm_self_add_attention.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_bilstm_self_mult_attention.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_snips_cnn_bilstm.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, "evolution": { ("evolution/evolve_intents_snips.json", "evolution", ('E',)): None }, "sample": { - ("intents/intents_sample_csv.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("intents/intents_sample_json.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] + ("classifiers/intents_sample_csv.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_sample_json.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, "ner": { ("ner/ner_conll2003.json", "ner_conll2003", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], @@ -108,7 +115,23 @@ ("squad/squad.json", "squad_model", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK], ("squad/squad_ru.json", "squad_model_ru", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK] }, - "seq2seq_go_bot": {("seq2seq_go_bot/bot_kvret.json", "seq2seq_go_bot", ALL_MODES): [FOUR_ARGUMENTS_INFER_CHECK]}, + "seq2seq_go_bot": { + ("seq2seq_go_bot/bot_kvret.json", "seq2seq_go_bot", ('TI',)): + [ + ("will it snow on tuesday?", + "f78cf0f9-7d1e-47e9-aa45-33f9942c94be", + "", + "", + "", + None) + ], + ("seq2seq_go_bot/bot_kvret_infer.json", "seq2seq_go_bot", ('IP',)): + [ + ("will it snow on tuesday?", + "f78cf0f9-7d1e-47e9-aa45-33f9942c94be", + None) + ] + }, "odqa": { ("odqa/en_odqa_infer_wiki_test.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] }, diff --git a/tests/test_tf_layers.py b/tests/test_tf_layers.py new file mode 100644 index 0000000000..26b7054006 --- /dev/null +++ b/tests/test_tf_layers.py @@ -0,0 +1,233 @@ +from pathlib import Path +import shutil +import pytest + +from functools import reduce + +tests_dir = Path(__file__).parent +tf_layers_data_path = tests_dir / "tf_layers_data" + + +def setup_module(): + shutil.rmtree(str(tf_layers_data_path), ignore_errors=True) + tf_layers_data_path.mkdir(parents=True) + + +def teardown_module(): + shutil.rmtree(str(tf_layers_data_path), ignore_errors=True) + + +import tensorflow as tf +import numpy as np + +from deeppavlov.core.layers.tf_layers import cudnn_lstm, cudnn_compatible_lstm, cudnn_gru, cudnn_compatible_gru + + +class DPCudnnLSTMModel: + def __init__(self, num_layers, num_units): + sess_config = tf.ConfigProto(allow_soft_placement=True) + sess_config.gpu_options.allow_growth = True + self.sess = tf.Session(config=sess_config) + + self.x = tf.placeholder(shape=(None, None, 50), dtype=tf.float32) + with tf.variable_scope('cudnn_model'): + h, (h_last, c_last) = cudnn_lstm(self.x, num_units, num_layers, trainable_initial_states=True) + + self.h = h + self.h_last = h_last + + self.sess.run(tf.global_variables_initializer()) + + def __call__(self, x): + feed_dict = { + self.x: x, + } + return self.sess.run([self.h, self.h_last], feed_dict=feed_dict) + + def save(self, path='model'): + print('[saving model to {}]'.format(path)) + saver = tf.train.Saver() + saver.save(self.sess, path) + + def load(self, path): + saver = tf.train.Saver() + saver.restore(self.sess, path) + + +class DPLSTMModel: + def __init__(self, num_layers, num_units): + sess_config = tf.ConfigProto(allow_soft_placement=True) + sess_config.gpu_options.allow_growth = True + self.sess = 
+class DPLSTMModel:
+    def __init__(self, num_layers, num_units):
+        sess_config = tf.ConfigProto(allow_soft_placement=True)
+        sess_config.gpu_options.allow_growth = True
+        self.sess = tf.Session(config=sess_config)
+
+        self.x = tf.placeholder(shape=(None, None, 50), dtype=tf.float32)
+        with tf.variable_scope('cudnn_model'):
+            h, (h_last, c_last) = cudnn_compatible_lstm(self.x, num_units, num_layers, trainable_initial_states=True)
+
+        self.h = h
+        self.h_last = h_last
+
+        self.sess.run(tf.global_variables_initializer())
+
+    def __call__(self, x):
+        feed_dict = {
+            self.x: x,
+        }
+        return self.sess.run([self.h, self.h_last], feed_dict=feed_dict)
+
+    def save(self, path='model'):
+        print('[saving model to {}]'.format(path))
+        saver = tf.train.Saver()
+        saver.save(self.sess, path)
+
+    def load(self, path):
+        saver = tf.train.Saver()
+        saver.restore(self.sess, path)
+
+
+class DPCudnnGRUModel:
+    def __init__(self, num_layers, num_units):
+        sess_config = tf.ConfigProto(allow_soft_placement=True)
+        sess_config.gpu_options.allow_growth = True
+        self.sess = tf.Session(config=sess_config)
+
+        self.x = tf.placeholder(shape=(None, None, 50), dtype=tf.float32)
+        with tf.variable_scope('cudnn_model'):
+            h, h_last = cudnn_gru(self.x, num_units, num_layers, trainable_initial_states=True)
+
+        self.h = h
+        self.h_last = h_last
+
+        self.sess.run(tf.global_variables_initializer())
+
+    def __call__(self, x):
+        feed_dict = {
+            self.x: x,
+        }
+        return self.sess.run([self.h, self.h_last], feed_dict=feed_dict)
+
+    def save(self, path='model'):
+        print('[saving model to {}]'.format(path))
+        saver = tf.train.Saver()
+        saver.save(self.sess, path)
+
+    def load(self, path):
+        saver = tf.train.Saver()
+        saver.restore(self.sess, path)
+
+
+class DPGRUModel:
+    def __init__(self, num_layers, num_units):
+        sess_config = tf.ConfigProto(allow_soft_placement=True)
+        sess_config.gpu_options.allow_growth = True
+        self.sess = tf.Session(config=sess_config)
+
+        self.x = tf.placeholder(shape=(None, None, 50), dtype=tf.float32)
+        with tf.variable_scope('cudnn_model'):
+            h, h_last = cudnn_compatible_gru(self.x, num_units, num_layers, trainable_initial_states=True)
+
+        self.h = h
+        self.h_last = h_last
+
+        self.sess.run(tf.global_variables_initializer())
+
+    def __call__(self, x):
+        feed_dict = {
+            self.x: x,
+        }
+        return self.sess.run([self.h, self.h_last], feed_dict=feed_dict)
+
+    def save(self, path='model'):
+        print('[saving model to {}]'.format(path))
+        saver = tf.train.Saver()
+        saver.save(self.sess, path)
+
+    def load(self, path):
+        saver = tf.train.Saver()
+        saver.restore(self.sess, path)
+
+
+class TestTFLayers:
+
+    allowed_error_lvl = 0.01
+
+    @staticmethod
+    def equal_values(a, b, round=5):
+        a, b = np.round(a, round), np.round(b, round)
+        return np.sum(a == b) / reduce(lambda x, y: x * y, a.shape)
+
+    @pytest.mark.parametrize("num_layers", [1, 3])
+    def test_cudnn_lstm_save_load(self, num_layers):
+        x = np.random.normal(size=(10, 10, 50))
+        tf.reset_default_graph()
+        cdnnlstmmodel = DPCudnnLSTMModel(num_layers=num_layers, num_units=100)
+        before_load_hidden, before_load_state = cdnnlstmmodel(x)[0], cdnnlstmmodel(x)[1]
+        cdnnlstmmodel.save(str(tf_layers_data_path / 'dpcudnnlstmmodel' / 'model'))
+
+        tf.reset_default_graph()
+        cdnnlstmmodel = DPCudnnLSTMModel(num_layers=num_layers, num_units=100)
+        cdnnlstmmodel.load(str(tf_layers_data_path / 'dpcudnnlstmmodel' / 'model'))
+        after_load_hidden, after_load_state = cdnnlstmmodel(x)[0], cdnnlstmmodel(x)[1]
+
+        equal_hidden = self.equal_values(after_load_hidden, before_load_hidden)
+        equal_state = self.equal_values(after_load_state, before_load_state)
+
+        assert equal_hidden > 1 - self.allowed_error_lvl
+        assert equal_state > 1 - self.allowed_error_lvl
+
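+    # Editorial note (added comment, not part of the recorded file): the next test is
+    # the cross-compatibility check — weights saved from the cuDNN LSTM graph are
+    # restored into the cudnn_compatible_lstm graph, and the outputs of both graphs on
+    # the same input must agree within allowed_error_lvl.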
+    @pytest.mark.parametrize("num_layers", [1, 3])
+    def test_cudnn_lstm_save_and_cudnn_compatible_load(self, num_layers):
+        x = np.random.normal(size=(10, 10, 50))
+        tf.reset_default_graph()
+        cdnnlstmmodel = DPCudnnLSTMModel(num_layers=num_layers, num_units=100)
+        before_load_hidden, before_load_state = cdnnlstmmodel(x)[0], cdnnlstmmodel(x)[1]
+        cdnnlstmmodel.save(str(tf_layers_data_path / 'dpcudnnlstmmodel' / 'model'))
+
+        tf.reset_default_graph()
+        cdnnlstmmodel = DPLSTMModel(num_layers=num_layers, num_units=100)
+        cdnnlstmmodel.load(str(tf_layers_data_path / 'dpcudnnlstmmodel' / 'model'))
+        after_load_hidden, after_load_state = cdnnlstmmodel(x)[0], cdnnlstmmodel(x)[1]
+
+        equal_hidden = self.equal_values(after_load_hidden, before_load_hidden)
+        equal_state = self.equal_values(after_load_state, before_load_state)
+
+        assert equal_hidden > 1 - self.allowed_error_lvl
+        assert equal_state > 1 - self.allowed_error_lvl
+
+    @pytest.mark.parametrize("num_layers", [1, 3])
+    def test_cudnn_gru_save_load(self, num_layers):
+        x = np.random.normal(size=(10, 10, 50))
+        tf.reset_default_graph()
+        cdnngrumodel = DPCudnnGRUModel(num_layers=num_layers, num_units=100)
+        before_load_hidden, before_load_state = cdnngrumodel(x)[0], cdnngrumodel(x)[1]
+        cdnngrumodel.save(str(tf_layers_data_path / 'cdnngrumodel' / 'model'))
+
+        tf.reset_default_graph()
+        cdnngrumodel = DPCudnnGRUModel(num_layers=num_layers, num_units=100)
+        cdnngrumodel.load(str(tf_layers_data_path / 'cdnngrumodel' / 'model'))
+        after_load_hidden, after_load_state = cdnngrumodel(x)[0], cdnngrumodel(x)[1]
+
+        equal_hidden = self.equal_values(after_load_hidden, before_load_hidden)
+        equal_state = self.equal_values(after_load_state, before_load_state)
+
+        assert equal_hidden > 1 - self.allowed_error_lvl
+        assert equal_state > 1 - self.allowed_error_lvl
+
+    @pytest.mark.parametrize("num_layers", [1, 3])
+    def test_cudnn_gru_save_and_cudnn_compatible_load(self, num_layers):
+        x = np.random.normal(size=(10, 10, 50))
+        tf.reset_default_graph()
+        cdnngrumodel = DPCudnnGRUModel(num_layers=num_layers, num_units=100)
+        before_load_hidden, before_load_state = cdnngrumodel(x)[0], cdnngrumodel(x)[1]
+        cdnngrumodel.save(str(tf_layers_data_path / 'cdnngrumodel' / 'model'))
+
+        tf.reset_default_graph()
+        cdnngrumodel = DPGRUModel(num_layers=num_layers, num_units=100)
+        cdnngrumodel.load(str(tf_layers_data_path / 'cdnngrumodel' / 'model'))
+        after_load_hidden, after_load_state = cdnngrumodel(x)[0], cdnngrumodel(x)[1]
+
+        equal_hidden = self.equal_values(after_load_hidden, before_load_hidden)
+        equal_state = self.equal_values(after_load_state, before_load_state)
+
+        assert equal_hidden > 1 - self.allowed_error_lvl
+        assert equal_state > 1 - self.allowed_error_lvl
\ No newline at end of file