From 82f374ef2601a21827af0fb9d2a7ca76acd090be Mon Sep 17 00:00:00 2001 From: rfe Date: Tue, 7 Sep 2021 20:30:46 +0200 Subject: [PATCH] suffix tree --- environment.yml | 3 +- fst_sp/FST_SentencePiece.ipynb | 20 +- fst_sp/kaldi_FST_SentencePiece.ipynb | 1197 +++++++++++++++++--------- fst_sp/seed_vocab.py | 101 +++ 4 files changed, 930 insertions(+), 391 deletions(-) create mode 100644 fst_sp/seed_vocab.py diff --git a/environment.yml b/environment.yml index 5c9233a..b9a529e 100644 --- a/environment.yml +++ b/environment.yml @@ -165,6 +165,7 @@ dependencies: - typing-extensions=3.7.4.3=hd3eb1b0_0 - typing_extensions=3.7.4.3=pyh06a4308_0 - urllib3=1.26.4=pyhd3eb1b0_0 + - unidecode=1.2.0=pyhd3eb1b0_0 - wcwidth=0.2.5=py_0 - wheel=0.36.2=pyhd3eb1b0_0 - xz=5.2.5=h7b6447c_0 @@ -177,7 +178,7 @@ dependencies: - antlr4-python3-runtime==4.8 - "git+https://github.com/tuanh208/CPC_audio.git@zerospeech" - "git+https://github.com/pytorch/fairseq.git" - - hydra-core==1.0.6 + - hydra-core==1.0.7 - importlib-resources==5.1.4 - jiwer==2.2.0 - omegaconf==2.0.6 diff --git a/fst_sp/FST_SentencePiece.ipynb b/fst_sp/FST_SentencePiece.ipynb index 0cae335..22c8ea9 100644 --- a/fst_sp/FST_SentencePiece.ipynb +++ b/fst_sp/FST_SentencePiece.ipynb @@ -2,10 +2,22 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "5f4eb9c9-9ee9-42e5-8945-df227482cc99", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'openfst_python'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# import kaldi.fstext as fst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mopenfst_python\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msentencepiece\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mspm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openfst_python'" + ] + } + ], "source": [ "import io\n", "from collections import defaultdict, namedtuple, Counter\n", @@ -1168,7 +1180,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1182,7 +1194,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/fst_sp/kaldi_FST_SentencePiece.ipynb b/fst_sp/kaldi_FST_SentencePiece.ipynb index 460a42e..818cbaf 100644 --- a/fst_sp/kaldi_FST_SentencePiece.ipynb +++ b/fst_sp/kaldi_FST_SentencePiece.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "5f4eb9c9-9ee9-42e5-8945-df227482cc99", "metadata": {}, "outputs": [], @@ -40,10 +40,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "e1daeebb-b743-48a0-9e3b-a7067567ea1c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Sentence(text='abababaabb cacacacacacacaca acacacacacac ca caca cacaca ac acac acacac abcabcabcabcabcab', count=1)]\n" + ] + } + ], "source": [ "# TEXT = open('botchan.txt').read()\n", "\n", @@ -51,12 +59,13 @@ "TEXT = unidecode.unidecode(TEXT.lower()) #.replace(' ', '')\n", "SENTENCES_BY_SPACE = [Sentence(text, count) for text, count in Counter(TEXT.split()).items()]\n", "SENTENCES = [Sentence(line.strip(), 1) for line in TEXT.split('\\n') if line.strip()]\n", - "TEXT = '\\n'.join([s[0] for s in SENTENCES])" + "TEXT = '\\n'.join([s[0] for s in SENTENCES])\n", + "print(SENTENCES)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "4b651c92-1c69-4309-afb4-ce8be0841fec", "metadata": {}, "outputs": [ @@ -133,7 +142,17 @@ "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=6 size=10 obj=35.1543 num_tokens=39 num_tokens/piece=3.9\n", "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=7 size=10 obj=35.1543 num_tokens=39 num_tokens/piece=3.9\n", "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=8 size=10 obj=35.1543 num_tokens=39 num_tokens/piece=3.9\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=9 size=10 " + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=9 size=10 obj=35.1543 num_tokens=39 num_tokens/piece=3.9\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=8 obj=42.8455 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=2 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=3 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=4 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=5 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=6 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=7 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=8 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=9 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n" ] } ], @@ -151,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "id": "2c26ed68-def2-4d52-9c2d-dd49e89173a1", "metadata": {}, "outputs": [ @@ -162,23 +181,6 @@ "Tot unigram prob: 0.8163016306690613\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "obj=35.1543 num_tokens=39 num_tokens/piece=3.9\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=8 obj=42.8455 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=2 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=3 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=4 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=5 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=6 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=7 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=8 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=9 size=8 obj=43.2459 num_tokens=43 num_tokens/piece=5.375\n" - ] - }, { "data": { "text/plain": [ @@ -192,7 +194,7 @@ " SentencePiece(index=7, symbol='a', log_freq=-4.326745510101318)]" ] }, - "execution_count": 4, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -206,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 62, "id": "d16b3fb0-f78d-4e60-b45f-334b3663546d", "metadata": {}, "outputs": [ @@ -219,7 +221,7 @@ " input_format: \n", " model_prefix: \n", " model_type: UNIGRAM\n", - " vocab_size: 10\n", + " vocab_size: 17\n", " self_test_sample_size: 0\n", " character_coverage: 0.9995\n", " input_sentence_size: 0\n", @@ -228,7 +230,7 @@ " shrinking_factor: 0.75\n", " max_sentence_length: 4192\n", " num_threads: 16\n", - " num_sub_iterations: 2\n", + " num_sub_iterations: 1\n", " max_sentencepiece_length: 16\n", " split_by_unicode_script: 1\n", " split_by_number: 1\n", @@ -274,8 +276,7 @@ "trainer_interface.cc(516) LOG(INFO) Tokenizing input sentences with whitespace: 1\n", "trainer_interface.cc(526) LOG(INFO) Done! 10\n", "unigram_model_trainer.cc(488) LOG(INFO) Using 10 sentences for EM training\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=12 obj=7.39336 num_tokens=17 num_tokens/piece=1.41667\n", - "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=10 obj=34.606 num_tokens=39 num_tokens/piece=3.9\n" + "unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=12 obj=7.39336 num_tokens=17 num_tokens/piece=1.41667\n" ] } ], @@ -286,13 +287,15 @@ "spm.SentencePieceTrainer.train(\n", " sentence_iterator=io.BytesIO(TEXT.encode()),\n", " model_writer=model,\n", - " vocab_size=10,\n", + " vocab_size=17,\n", + " num_sub_iterations=1\n", + " \n", ")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 63, "id": "41aba295-82d8-445e-8a8d-74bfb207f476", "metadata": {}, "outputs": [ @@ -300,7 +303,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tot unigram prob: 0.8261814959746875\n" + "Tot unigram prob: 0.6545648024932031\n" ] }, { @@ -309,16 +312,23 @@ "[SentencePiece(index=0, symbol='', log_freq=0.0),\n", " SentencePiece(index=1, symbol='', log_freq=0.0),\n", " SentencePiece(index=2, symbol='', log_freq=0.0),\n", - " SentencePiece(index=3, symbol='b', log_freq=-0.5813193321228027),\n", - " SentencePiece(index=4, symbol='ab', log_freq=-1.9380321502685547),\n", - " SentencePiece(index=5, symbol='▁acacac', log_freq=-3.227902412414551),\n", - " SentencePiece(index=6, symbol='▁cacaca', log_freq=-3.227902412414551),\n", - " SentencePiece(index=7, symbol='▁', log_freq=-4.227703094482422),\n", - " SentencePiece(index=8, symbol='c', log_freq=-4.227802753448486),\n", - " SentencePiece(index=9, symbol='a', log_freq=-4.227902889251709)]" + " SentencePiece(index=3, symbol='▁caca', log_freq=-2.6853573322296143),\n", + " SentencePiece(index=4, symbol='▁cacaca', log_freq=-2.7242918014526367),\n", + " SentencePiece(index=5, symbol='▁ab', log_freq=-2.7418582439422607),\n", + " SentencePiece(index=6, symbol='▁ca', log_freq=-2.7452681064605713),\n", + " SentencePiece(index=7, symbol='▁acac', log_freq=-2.8035264015197754),\n", + " SentencePiece(index=8, symbol='▁ac', log_freq=-2.810230255126953),\n", + " SentencePiece(index=9, symbol='▁acacac', log_freq=-2.9112017154693604),\n", + " SentencePiece(index=10, symbol='b', log_freq=-3.0529541969299316),\n", + " SentencePiece(index=11, symbol='ab', log_freq=-3.0605990886688232),\n", + " SentencePiece(index=12, symbol='ababa', log_freq=-3.1826658248901367),\n", + " SentencePiece(index=13, symbol='▁', log_freq=-3.6624233722686768),\n", + " SentencePiece(index=14, symbol='c', log_freq=-3.9868173599243164),\n", + " SentencePiece(index=15, symbol='a', log_freq=-3.98691725730896),\n", + " SentencePiece(index=16, symbol='▁a', log_freq=-3.98691725730896)]" ] }, - "execution_count": 6, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -332,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "id": "56943071-f1ed-4ece-a9ad-74c818c72859", "metadata": {}, "outputs": [], @@ -345,18 +355,44 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "61e51478-2eee-4e7f-bfbe-ad2c14b56ed0", + "execution_count": 65, + "id": "aa9784a2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '')\n", + "(1, '')\n", + "(2, '')\n", + "(3, '▁caca')\n", + "(4, '▁cacaca')\n", + "(5, '▁ab')\n", + "(6, '▁ca')\n", + "(7, '▁acac')\n", + "(8, '▁ac')\n", + "(9, '▁acacac')\n", + "(10, 'b')\n", + "(11, 'ab')\n", + "(12, 'ababa')\n", + "(13, '▁')\n", + "(14, 'c')\n", + "(15, 'a')\n", + "(16, '▁a')\n" + ] + } + ], "source": [ "T = SentencePieceTrainer(INITIAL_PIECES)\n", - "# T = SentencePieceTrainer(FINAL_PIECES)" + "# T = SentencePieceTrainer(FINAL_PIECES)\n", + "for a in fst.SymbolTableIterator(T.PIECE_SYMB):\n", + " print(a)\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 66, "id": "54e51757-e557-4a56-aad0-ac174c3a9843", "metadata": {}, "outputs": [ @@ -369,11 +405,11 @@ "\n", "\n", - "\n", + "\n", "\n", "FST\n", - "\n", + "\n", "\n", "\n", "0\n", @@ -422,36 +458,62 @@ "\n", "\n", "4\n", - "\n", - "\n", - "4\n", + "\n", + "4\n", "\n", "\n", "\n", "3->4\n", - "\n", - "\n", + "\n", + "\n", "c\n", "\n", + "\n", + "\n", + "5\n", + "\n", + "5\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "\n", + "6\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "a\n", + "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "T.text_to_fst('abc')" + "T.text_to_fst('abc a')" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 67, "id": "05cdbfaf-c0e2-450a-b5e1-e7817e8fb021", "metadata": {}, "outputs": [ @@ -464,231 +526,319 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "FST\n", - "\n", + "\n", "\n", "\n", "0\n", - "\n", - "\n", - "0\n", + "\n", + "\n", + "0\n", "\n", "\n", "\n", "0->0\n", - "\n", - "\n", - "a:a/4.2279\n", + "\n", + "\n", + "a:a/3.9869\n", "\n", "\n", "\n", "0->0\n", - "\n", - "\n", - "b:b/0.58132\n", + "\n", + "\n", + "b:b/3.053\n", "\n", "\n", "\n", "0->0\n", - "\n", - "\n", - "c:c/4.2278\n", + "\n", + "\n", + "c:c/3.9868\n", "\n", "\n", "\n", "0->0\n", - "\n", - "\n", - "▁:▁/4.2277\n", + "\n", + "\n", + "▁:▁/3.6624\n", "\n", "\n", "\n", "1\n", - "\n", - "1\n", + "\n", + "1\n", "\n", "\n", "\n", "0->1\n", - "\n", - "\n", - "a:<eps>\n", + "\n", + "\n", + "a:<eps>\n", "\n", - "\n", + "\n", "\n", - "2\n", - "\n", - "2\n", + "5\n", + "\n", + "5\n", "\n", - "\n", + "\n", "\n", - "0->2\n", - "\n", - "\n", - "▁:<eps>\n", + "0->5\n", + "\n", + "\n", + "▁:<eps>\n", "\n", "\n", "\n", "1->0\n", - "\n", - "\n", - "b:ab/1.938\n", + "\n", + "\n", + "b:ab/3.0606\n", "\n", - "\n", + "\n", "\n", - "3\n", - "\n", - "3\n", + "2\n", + "\n", + "2\n", "\n", - "\n", + "\n", "\n", - "2->3\n", - "\n", - "\n", - "a:<eps>\n", + "1->2\n", + "\n", + "\n", + "b:<eps>\n", "\n", - "\n", + "\n", + "\n", + "5->0\n", + "\n", + "\n", + "a:▁a/3.9869\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "6\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "a:<eps>\n", + "\n", + "\n", + "\n", + "11\n", + "\n", + "11\n", + "\n", + "\n", + "\n", + "5->11\n", + "\n", + "\n", + "c:<eps>\n", + "\n", + "\n", "\n", - "8\n", - "\n", - "8\n", + "3\n", + "\n", + "3\n", "\n", - "\n", + "\n", "\n", - "2->8\n", - "\n", - "\n", - "c:<eps>\n", + "2->3\n", + "\n", + "\n", + "a:<eps>\n", "\n", "\n", "\n", "4\n", - "\n", - "4\n", + "\n", + "4\n", "\n", "\n", "\n", "3->4\n", - "\n", - "\n", - "c:<eps>\n", - "\n", - "\n", - "\n", - "9\n", - "\n", - "9\n", - "\n", - "\n", - "\n", - "8->9\n", - "\n", - "\n", - "a:<eps>\n", + "\n", + "\n", + "b:<eps>\n", "\n", - "\n", - "\n", - "5\n", - "\n", - "5\n", - "\n", - "\n", + "\n", "\n", - "4->5\n", - "\n", - "\n", - "a:<eps>\n", + "4->0\n", + "\n", + "\n", + "a:ababa/3.1827\n", "\n", - "\n", - "\n", - "6\n", - "\n", - "6\n", + "\n", + "\n", + "6->0\n", + "\n", + "\n", + "b:▁ab/2.7419\n", "\n", - "\n", - "\n", - "5->6\n", - "\n", - "\n", - "c:<eps>\n", + "\n", + "\n", + "6->0\n", + "\n", + "\n", + "c:▁ac/2.8102\n", "\n", "\n", "\n", "7\n", - "\n", - "7\n", + "\n", + "7\n", "\n", "\n", - "\n", + "\n", "6->7\n", - "\n", - "\n", - "a:<eps>\n", - "\n", - "\n", - "\n", - "7->0\n", - "\n", - "\n", - "c:▁acacac/3.2279\n", + "\n", + "\n", + "c:<eps>\n", "\n", - "\n", - "\n", - "10\n", - "\n", - "10\n", - "\n", - "\n", - "\n", - "9->10\n", - "\n", - "\n", - "c:<eps>\n", - "\n", - "\n", - "\n", - "11\n", - "\n", - "11\n", - "\n", - "\n", - "\n", - "10->11\n", - "\n", - "\n", - "a:<eps>\n", + "\n", + "\n", + "11->0\n", + "\n", + "\n", + "a:▁ca/2.7453\n", "\n", "\n", "\n", "12\n", - "\n", - "12\n", + "\n", + "12\n", "\n", "\n", - "\n", + "\n", "11->12\n", - "\n", - "\n", - "c:<eps>\n", + "\n", + "\n", + "a:<eps>\n", "\n", - "\n", + "\n", + "\n", + "8\n", + "\n", + "8\n", + "\n", + "\n", + "\n", + "7->8\n", + "\n", + "\n", + "a:<eps>\n", + "\n", + "\n", "\n", - "12->0\n", - "\n", - "\n", - "a:▁cacaca/3.2279\n", + "8->0\n", + "\n", + "\n", + "c:▁acac/2.8035\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "9\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "c:<eps>\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "10\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "a:<eps>\n", + "\n", + "\n", + "\n", + "10->0\n", + "\n", + "\n", + "c:▁acacac/2.9112\n", + "\n", + "\n", + "\n", + "13\n", + "\n", + "13\n", + "\n", + "\n", + "\n", + "12->13\n", + "\n", + "\n", + "c:<eps>\n", + "\n", + "\n", + "\n", + "13->0\n", + "\n", + "\n", + "a:▁caca/2.6854\n", + "\n", + "\n", + "\n", + "14\n", + "\n", + "14\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "a:<eps>\n", + "\n", + "\n", + "\n", + "15\n", + "\n", + "15\n", + "\n", + "\n", + "\n", + "14->15\n", + "\n", + "\n", + "c:<eps>\n", + "\n", + "\n", + "\n", + "15->0\n", + "\n", + "\n", + "a:▁cacaca/2.7243\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 10, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -700,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 68, "id": "64e72dc6-d189-4513-9e6f-31ec5a018f51", "metadata": {}, "outputs": [ @@ -713,124 +863,184 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "FST\n", - "\n", + "\n", "\n", "\n", "0\n", - "\n", - "0\n", + "\n", + "0\n", "\n", "\n", "\n", "1\n", - "\n", - "1\n", + "\n", + "1\n", "\n", "\n", "\n", "0->1\n", - "\n", - "\n", - "▁:▁/4.2277\n", + "\n", + "\n", + "▁:▁/3.6624\n", "\n", "\n", "\n", "2\n", - "\n", - "2\n", + "\n", + "2\n", "\n", - "\n", + "\n", "\n", - "1->2\n", - "\n", - "\n", - "a:a/4.2279\n", + "0->2\n", + "\n", + "\n", + "▁:<eps>\n", "\n", "\n", "\n", "3\n", - "\n", - "3\n", + "\n", + "3\n", "\n", "\n", "\n", "1->3\n", - "\n", - "\n", - "a:<eps>\n", + "\n", + "\n", + "a:a/3.9869\n", "\n", "\n", "\n", "4\n", - "\n", - "4\n", + "\n", + "4\n", "\n", - "\n", + "\n", "\n", - "2->4\n", - "\n", - "\n", - "b:b/0.58132\n", + "1->4\n", + "\n", + "\n", + "a:<eps>\n", "\n", - "\n", + "\n", "\n", - "3->4\n", - "\n", - "\n", - "b:ab/1.938\n", + "2->3\n", + "\n", + "\n", + "a:▁a/3.9869\n", "\n", "\n", "\n", "5\n", - "\n", - "5\n", + "\n", + "5\n", "\n", - "\n", + "\n", "\n", - "4->5\n", - "\n", - "\n", - "c:c/4.2278\n", + "2->5\n", + "\n", + "\n", + "a:<eps>\n", "\n", "\n", "\n", "6\n", - "\n", - "6\n", + "\n", + "6\n", "\n", - "\n", + "\n", "\n", + "3->6\n", + "\n", + "\n", + "b:b/3.053\n", + "\n", + "\n", + "\n", + "4->6\n", + "\n", + "\n", + "b:ab/3.0606\n", + "\n", + "\n", + "\n", "5->6\n", - "\n", - "\n", - "▁:▁/4.2277\n", + "\n", + "\n", + "b:▁ab/2.7419\n", "\n", "\n", "\n", "7\n", - "\n", - "\n", - "7\n", + "\n", + "7\n", "\n", "\n", - "\n", + "\n", "6->7\n", - "\n", - "\n", - "a:a/4.2279\n", + "\n", + "\n", + "c:c/3.9868\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "8\n", + "\n", + "\n", + "\n", + "7->8\n", + "\n", + "\n", + "▁:▁/3.6624\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "9\n", + "\n", + "\n", + "\n", + "7->9\n", + "\n", + "\n", + "▁:<eps>\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "\n", + "8->10\n", + "\n", + "\n", + "a:a/3.9869\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "a:▁a/3.9869\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 11, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } @@ -844,17 +1054,25 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, + "id": "ef8dfdab", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 69, "id": "f511e71f-30c5-4411-92eb-53eb304db5f3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "PieceCounts(Z=18.794057846069336, counts=defaultdict(, {7: 2.000000476837272, 9: 1.0535966996228223, 3: 0.05359622278555041, 4: 0.9464036251483268, 8: 0.9999995231629555}))" + "PieceCounts(Z=10.658158302307129, counts=defaultdict(, {13: 0.043440237520893474, 15: 0.025365740041493418, 16: 0.988138216606856, 10: 0.013504194035922474, 11: 0.018074516547870796, 5: 0.9684213172487539, 14: 0.9999997615814493}))" ] }, - "execution_count": 12, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } @@ -865,18 +1083,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 71, "id": "e289e75d-9a9c-40b1-8cf2-bbfd8d174d50", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[ViterbiPath(path=[7, 4, 8, 7, 9], prob=0.9464036251483268),\n", - " ViterbiPath(path=[7, 9, 3, 8, 7, 9], prob=0.053596248342227065)]" + "[ViterbiPath(path=[5, 14, 16], prob=0.9441836768129545),\n", + " ViterbiPath(path=[5, 14, 13, 15], prob=0.024237410982522353)]" ] }, - "execution_count": 13, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } @@ -887,7 +1105,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 72, "id": "3c47b2b2-a029-4454-bbc1-416c59138b75", "metadata": {}, "outputs": [ @@ -900,124 +1118,184 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "FST\n", - "\n", + "\n", "\n", "\n", "0\n", - "\n", - "0\n", + "\n", + "0\n", "\n", - "\n", + "\n", "\n", + "2\n", + "\n", + "2\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "▁:<eps>\n", + "\n", + "\n", + "\n", "1\n", - "\n", - "1\n", + "\n", + "1\n", "\n", "\n", - "\n", + "\n", "0->1\n", - "\n", - "\n", - "▁:▁/4.2277,0\n", + "\n", + "\n", + "▁:▁/3.6624,0\n", "\n", "\n", - "\n", + "\n", "3\n", - "\n", - "3\n", + "\n", + "3\n", "\n", - "\n", - "\n", - "1->3\n", - "\n", - "\n", - "a:<eps>\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "a:▁a/3.9869,0\n", "\n", - "\n", - "\n", - "2\n", - "\n", - "2\n", + "\n", + "\n", + "5\n", + "\n", + "5\n", "\n", - "\n", - "\n", - "1->2\n", - "\n", - "\n", - "a:a/4.2279,0\n", + "\n", + "\n", + "2->5\n", + "\n", + "\n", + "a:<eps>\n", "\n", "\n", - "\n", + "\n", "4\n", - "\n", - "4\n", + "\n", + "4\n", "\n", - "\n", - "\n", - "3->4\n", - "\n", - "\n", - "b:ab/1.938,0\n", + "\n", + "\n", + "1->4\n", + "\n", + "\n", + "a:<eps>\n", "\n", - "\n", + "\n", "\n", - "2->4\n", - "\n", - "\n", - "b:b/0.58132,0\n", - "\n", - "\n", - "\n", - "5\n", - "\n", - "5\n", - "\n", - "\n", - "\n", - "4->5\n", - "\n", - "\n", - "c:c/4.2278,0\n", + "1->3\n", + "\n", + "\n", + "a:a/3.9869,0\n", "\n", "\n", "\n", "6\n", - "\n", - "6\n", + "\n", + "6\n", "\n", - "\n", + "\n", + "\n", + "4->6\n", + "\n", + "\n", + "b:ab/3.0606,0\n", + "\n", + "\n", "\n", + "3->6\n", + "\n", + "\n", + "b:b/3.053,0\n", + "\n", + "\n", + "\n", "5->6\n", - "\n", - "\n", - "▁:▁/4.2277,0\n", + "\n", + "\n", + "b:▁ab/2.7419,0\n", "\n", "\n", "\n", "7\n", - "\n", - "\n", - "7\n", + "\n", + "7\n", "\n", "\n", - "\n", + "\n", "6->7\n", - "\n", - "\n", - "a:a/4.2279,0\n", + "\n", + "\n", + "c:c/3.9868,0\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "9\n", + "\n", + "\n", + "\n", + "7->9\n", + "\n", + "\n", + "▁:<eps>\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "8\n", + "\n", + "\n", + "\n", + "7->8\n", + "\n", + "\n", + "▁:▁/3.6624,0\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "a:▁a/3.9869,0\n", + "\n", + "\n", + "\n", + "8->10\n", + "\n", + "\n", + "a:a/3.9869,0\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -1029,17 +1307,17 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 73, "id": "9753f378-8e12-44db-83ce-3c21d7bc901d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "PieceCounts(Z=18.794057846069336, counts=defaultdict(, {7: 2.000000476837272, 9: 1.0535966996228223, 3: 0.05359622278555041, 4: 0.9464036251483268, 8: 0.9999995231629555}))" + "PieceCounts(Z=10.658158302307129, counts=defaultdict(, {13: 0.043440237520893474, 15: 0.025365740041493418, 16: 0.988138216606856, 10: 0.013504194035922474, 11: 0.018074516547870796, 5: 0.9684213172487539, 14: 0.9999997615814493}))" ] }, - "execution_count": 15, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } @@ -1050,18 +1328,18 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 74, "id": "039ac3fb-fd58-4ba8-aba7-b94100f7a079", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[ViterbiPath(path=[7, 4, 8, 7, 9], prob=0.9464036251483268),\n", - " ViterbiPath(path=[7, 9, 3, 8, 7, 9], prob=0.053596248342227065)]" + "[ViterbiPath(path=[5, 14, 16], prob=0.9441836768129545),\n", + " ViterbiPath(path=[5, 14, 13, 15], prob=0.024237410982522353)]" ] }, - "execution_count": 16, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } @@ -1072,7 +1350,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 75, "id": "7a1d8cc6-5ba9-40e8-87ac-160cb43a5b05", "metadata": {}, "outputs": [ @@ -1080,7 +1358,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[(7, 7), (4, 4), (4, 4), (4, 4), (9, 9), (4, 4), (3, 3), (6, 6), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (5, 5), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (7, 7), (8, 8), (9, 9), (7, 7), (8, 8), (9, 9), (8, 8), (9, 9), (6, 6), (7, 7), (9, 9), (8, 8), (7, 7), (9, 9), (8, 8), (9, 9), (8, 8), (5, 5), (7, 7), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4)]\n" + "[(5, 5), (12, 12), (11, 11), (10, 10), (4, 4), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (9, 9), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (6, 6), (3, 3), (4, 4), (8, 8), (7, 7), (9, 9), (5, 5), (14, 14), (11, 11), (14, 14), (11, 11), (14, 14), (11, 11), (14, 14), (11, 11), (14, 14), (11, 11)]\n" ] } ], @@ -1092,7 +1370,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 76, "id": "55a41b3d-947b-447a-b1c0-485013e11771", "metadata": {}, "outputs": [ @@ -1100,16 +1378,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[(7, 7), (4, 4), (4, 4), (4, 4), (9, 9), (4, 4), (3, 3)]\n", - "[(6, 6), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9)]\n", - "[(5, 5), (9, 9), (8, 8), (9, 9), (8, 8), (9, 9), (8, 8)]\n", - "[(7, 7), (8, 8), (9, 9)]\n", - "[(7, 7), (8, 8), (9, 9), (8, 8), (9, 9)]\n", + "[(5, 5), (12, 12), (11, 11), (10, 10)]\n", + "[(4, 4), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15)]\n", + "[(9, 9), (15, 15), (14, 14), (15, 15), (14, 14), (15, 15), (14, 14)]\n", "[(6, 6)]\n", - "[(7, 7), (9, 9), (8, 8)]\n", - "[(7, 7), (9, 9), (8, 8), (9, 9), (8, 8)]\n", - "[(5, 5)]\n", - "[(7, 7), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4), (8, 8), (4, 4)]\n" + "[(3, 3)]\n", + "[(4, 4)]\n", + "[(8, 8)]\n", + "[(7, 7)]\n", + "[(9, 9)]\n", + "[(5, 5), (14, 14), (11, 11), (14, 14), (11, 11), (14, 14), (11, 11), (14, 14), (11, 11), (14, 14), (11, 11)]\n" ] } ], @@ -1121,7 +1399,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 78, "id": "d78720e1-f55d-4528-97cf-57e80694de77", "metadata": {}, "outputs": [ @@ -1129,16 +1407,36 @@ "name": "stdout", "output_type": "stream", "text": [ - "EM sub_iter=0 size=7 tot_piece_prob=0.9473978471051301 obj=20.143540382385254 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=1 size=7 tot_piece_prob=0.9473908219413193 obj=9.007325291633606 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=2 size=7 tot_piece_prob=0.947378538824443 obj=9.012519359588623 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=3 size=7 tot_piece_prob=0.9473760054764779 obj=9.015252017974854 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=4 size=7 tot_piece_prob=0.947375373403035 obj=9.016208744049072 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=5 size=7 tot_piece_prob=0.9473752376715753 obj=9.016508197784423 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=6 size=7 tot_piece_prob=0.9473751587501271 obj=9.016597795486451 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=7 size=7 tot_piece_prob=0.9473751541401348 obj=9.016624665260315 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=8 size=7 tot_piece_prob=0.9473751399005655 obj=9.01663272380829 num_tokens=55 num_tokens/piece=7.857142857142857\n", - "EM sub_iter=9 size=7 tot_piece_prob=0.9473751265246055 obj=9.016635251045226 num_tokens=55 num_tokens/piece=7.857142857142857\n" + "EM sub_iter=0 size=12 tot_piece_prob=0.8693918826259943 obj=13.593990206718445 num_tokens=39 num_tokens/piece=3.25\n", + "EM sub_iter=1 size=12 tot_piece_prob=0.8727699122224476 obj=8.34394974708557 num_tokens=39 num_tokens/piece=3.25\n", + "EM sub_iter=2 size=12 tot_piece_prob=0.8741026377790027 obj=8.337706542015075 num_tokens=39 num_tokens/piece=3.25\n", + "EM sub_iter=3 size=12 tot_piece_prob=0.8751727373520849 obj=8.341840457916259 num_tokens=39 num_tokens/piece=3.25\n", + "EM sub_iter=4 size=11 tot_piece_prob=0.8838105971649134 obj=8.34699306488037 num_tokens=41 num_tokens/piece=3.727272727272727\n", + "EM sub_iter=5 size=11 tot_piece_prob=0.887924411971826 obj=8.348154759407043 num_tokens=41 num_tokens/piece=3.727272727272727\n", + "EM sub_iter=6 size=11 tot_piece_prob=0.8878875207681647 obj=8.3336656332016 num_tokens=41 num_tokens/piece=3.727272727272727\n", + "EM sub_iter=7 size=11 tot_piece_prob=0.8878791782075109 obj=8.33449227809906 num_tokens=41 num_tokens/piece=3.727272727272727\n", + "EM sub_iter=8 size=11 tot_piece_prob=0.8878771822114357 obj=8.334701108932496 num_tokens=41 num_tokens/piece=3.727272727272727\n", + "EM sub_iter=9 size=11 tot_piece_prob=0.8878765485120356 obj=8.334752082824707 num_tokens=41 num_tokens/piece=3.727272727272727\n", + "EM sub_iter=0 size=9 tot_piece_prob=0.9294637592599753 obj=12.167658138275147 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=1 size=9 tot_piece_prob=0.9293912401137597 obj=9.202643752098083 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=2 size=9 tot_piece_prob=0.929383117390366 obj=9.204878211021423 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=3 size=9 tot_piece_prob=0.929384562185589 obj=9.205665493011475 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=4 size=9 tot_piece_prob=0.9293859683515276 obj=9.20582139492035 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=5 size=9 tot_piece_prob=0.9293866867956463 obj=9.205857014656067 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=6 size=9 tot_piece_prob=0.9293870260762062 obj=9.205867171287537 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=7 size=9 tot_piece_prob=0.9293871504655343 obj=9.205870866775513 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=8 size=9 tot_piece_prob=0.929387178792479 obj=9.205871629714967 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=9 size=9 tot_piece_prob=0.92938719625176 obj=9.205872344970704 num_tokens=53 num_tokens/piece=5.888888888888889\n", + "EM sub_iter=0 size=7 tot_piece_prob=0.954219136391963 obj=10.967120122909545 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=1 size=7 tot_piece_prob=0.9540526893823834 obj=9.783933806419373 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=2 size=7 tot_piece_prob=0.95402870063829 obj=9.786571264266968 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=3 size=7 tot_piece_prob=0.9540249009032046 obj=9.787964510917664 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=4 size=7 tot_piece_prob=0.9540242779373515 obj=9.78823516368866 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=5 size=7 tot_piece_prob=0.9540241588200192 obj=9.788281345367432 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=6 size=7 tot_piece_prob=0.9540241257614047 obj=9.788289332389832 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=7 size=7 tot_piece_prob=0.9540241785045526 obj=9.788291454315186 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=8 size=7 tot_piece_prob=0.9540241508500703 obj=9.788290858268738 num_tokens=63 num_tokens/piece=9.0\n", + "EM sub_iter=9 size=7 tot_piece_prob=0.9540241272339448 obj=9.788290786743165 num_tokens=63 num_tokens/piece=9.0\n" ] } ], @@ -1147,7 +1445,7 @@ "pieces = INITIAL_PIECES\n", "sentences = SENTENCES_BY_SPACE\n", "\n", - "DESIRED_PIECES = 5\n", + "DESIRED_PIECES = 6\n", "PRUNE_FRAC = 0.85\n", "NUM_SUBITER = 10\n", "\n", @@ -1171,7 +1469,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 79, "id": "84b393d3-92be-4842-8bb1-f94effd3fceb", "metadata": {}, "outputs": [ @@ -1188,7 +1486,7 @@ " SentencePiece(index=7, symbol='a', log_freq=-4.326745510101318)]" ] }, - "execution_count": 20, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1199,21 +1497,22 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 80, "id": "32af96ba-e9ca-4e69-a127-0d5fbcd6cc17", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[SentencePiece(index=3, symbol='b', log_freq=-4.2710259982202405),\n", - " SentencePiece(index=7, symbol='▁', log_freq=-2.2952878152022542),\n", - " SentencePiece(index=8, symbol='c', log_freq=-1.0837232665628944),\n", - " SentencePiece(index=9, symbol='a', log_freq=-1.3119614487878422),\n", - " SentencePiece(index=5, symbol='▁acacac', log_freq=-3.5841409609343247)]" + "[SentencePiece(index=5, symbol='▁ab', log_freq=-3.7153066688841117),\n", + " SentencePiece(index=6, symbol='▁ca', log_freq=-2.881972788162477),\n", + " SentencePiece(index=8, symbol='▁ac', log_freq=-4.574554712535495),\n", + " SentencePiece(index=10, symbol='b', log_freq=-1.9974501157027285),\n", + " SentencePiece(index=14, symbol='c', log_freq=-1.1629025446167525),\n", + " SentencePiece(index=15, symbol='a', log_freq=-0.9771427975634528)]" ] }, - "execution_count": 21, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -1224,9 +1523,135 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "f92a050d-53cf-4079-8b45-222ee2781fc1", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a \n", + "ab ab\n", + "aba aba\n", + "ababa ababa\n", + "abcab abcab\n", + "abcabcab abcabcab\n", + "abcabcabcab abcabcabcab\n", + "abcabcabcabcab abcabcabcabcab\n", + "ac ac\n", + "aca aca\n", + "acac acac\n", + "acaca acaca\n", + "acacac acacac\n", + "acacaca acacaca\n", + "acacacac acacacac\n", + "acacacaca acacacaca\n", + "acacacacac acacacacac\n", + "acacacacaca acacacacaca\n", + "acacacacacac acacacacacac\n", + "acacacacacaca acacacacacaca\n", + " acacac▁\n", + " acaca▁ac\n", + " acac▁\n", + " acac▁a\n", + " aca▁\n", + " aca▁ac\n", + " ac▁\n", + " ac▁a\n", + " ac▁acac\n", + " a▁\n", + " a▁ac\n", + " a▁caca\n", + "b \n", + "ba ba\n", + "baba baba\n", + "bcab bcab\n", + "bcabcab bcabcab\n", + "bcabcabcab bcabcabcab\n", + "bcabcabcabcab bcabcabcabcab\n", + "c \n", + "ca ca\n", + "cab cab\n", + "cabcab cabcab\n", + "cabcabcab cabcabcab\n", + "cabcabcabcab cabcabcabcab\n", + "cac cac\n", + "caca caca\n", + "cacac cacac\n", + "cacaca cacaca\n", + "cacacac cacacac\n", + "cacacaca cacacaca\n", + "cacacacac cacacacac\n", + "cacacacaca cacacacaca\n", + "cacacacacac cacacacacac\n", + "cacacacacaca cacacacacaca\n", + "cacacacacacaca cacacacacacaca\n", + " cacaca▁ac\n", + " cacac▁\n", + " caca▁\n", + " caca▁ac\n", + " cac▁\n", + " cac▁a\n", + " ca▁\n", + " ca▁ac\n", + " ca▁caca\n", + " c▁\n", + " c▁a\n", + " c▁acac\n", + "▁ \n", + "▁a ▁a\n", + "▁ab \n", + "▁ac ▁ac\n", + "▁acac ▁acac\n", + "▁acacac ▁acacac\n", + "▁ca ▁ca\n", + "▁caca ▁caca\n", + "▁cacaca ▁cacaca\n" + ] + } + ], + "source": [ + "import importlib,seed_vocab\n", + "importlib.reload(seed_vocab)\n", + "from seed_vocab import *\n", + "\n", + "t=SuffixTree(\"abababaabb_cacacacacacacaca_acacacacacac_ca_caca_cacaca_ac_acac_acacac_abcabcabcabcabcab$\")\n", + "res=get_vocab(0,t)\n", + "\n", + "sentencepiece_result = \"acac acaca acacac caca cacac cacaca acacaca acacacac cacacac cacacaca aca acacacaca ca cac cacacacac acacacacac cacacacaca ac acacacacaca cacacacacac a acacacacacac cacacacacaca abcabcabcab abcabcab c bcabcabcab abcabcabcabcab bcabcab cacacacacacaca cabcabcab acacacacacaca bcabcabcabcab abcab cabcab cabcabcabcab ab bcab cab ▁acac ▁caca ▁acacac ▁cacaca ▁a ▁ac ▁ca b ababa ▁ aba baba ba ▁ab\"\n", + "my_result = \"▁acacac ▁acac ▁ac ▁a ▁cacaca ▁caca ▁ca a▁ac a▁caca a▁ ababa aba abcabcabcabcab abcabcabcab abcabcab abcab ab ac▁acac ac▁a ac▁ aca▁ac aca▁ acac▁a acac▁ acaca▁ac acacac▁ acacacacacaca acacacacacac acacacacaca acacacacac acacacaca acacacac acacaca acacac acaca acac aca ac baba ba bcabcabcabcab bcabcabcab bcabcab bcab c▁acac c▁a c▁ ca▁ac ca▁caca ca▁ cabcabcabcab cabcabcab cabcab cab cac▁a cac▁ caca▁ac caca▁ cacac▁ cacaca▁ac cacacacacacaca cacacacacaca cacacacacac cacacacaca cacacacac cacacaca cacacac cacaca cacac caca cac ca\"\n", + "\n", + "res = [x.replace(\"_\",\"▁\") for x in res]\n", + "\n", + "ssr = sorted(sentencepiece_result.split(\" \"))\n", + "smr = sorted(res)\n", + "\n", + "i,j=0,0\n", + "while i1: # finds the last one that do not fullfill + mid = int((lower_bound+strict_upper_bound)/2) + if cond(l[mid]): + strict_upper_bound = mid + else: + lower_bound = mid + return lower_bound+1 + + #look for first letter + letter = suf[0] + + + child_of_n = binary_search(self.nodes[n].ch, lambda k: self.nodes[k].sub[0] >= letter) + + if child_of_n==len(self.nodes[n].ch) or getChildNode(n,child_of_n).sub[0]>letter: #we need new child of current node + self.nodes.append(Node(suf,[])) + self.nodes[n].ch.insert(child_of_n,len(self.nodes)-1) # the last one node is the next child + return + + + n2=self.nodes[n].ch[child_of_n] #node that beggins with the same prefix + sub2 = self.nodes[n2].sub + assert(sub2[0]==suf[0]) + + try: + # find prefix of remaining suffix in common with child + j = 0 + while j < len(sub2): + if suf[j] != sub2[j]: + # split n2 + n3 = n2 + # new node for the part in common + n2 = len(self.nodes) + self.nodes.append(Node(sub2[:j], [n3])) + self.nodes[n3].sub = sub2[j:] # old node loses the part in common + self.nodes[n].ch[child_of_n] = n2 + break # continue down the tree + j = j + 1 + + self.addSuffix(suf[j:],n2) + + def visualize(self): + if len(self.nodes) == 0: + print( "") + return + + def f(n, pre): + children = self.nodes[n].ch + if len(children) == 0: + print( self.nodes[n].sub) + return + print( self.nodes[n].sub) + for c in children[:-1]: + print( pre, "+-",end=" ") + f(c, pre + " | ") + print( pre, "+-",end=" ") + f(children[-1], pre + " ") + + f(0, "") + + + +#t.visualize() + +def get_vocab(nodenr, tree, prefix=""): #dfs + res=[] + node = tree.nodes[nodenr] + for child in node.ch: + res += get_vocab(child,tree,prefix+node.sub) + if( node.ch and len(prefix+node.sub)>1 ): + res.append(prefix+node.sub) + return res + +if __name__=="__main__": + t=SuffixTree("abababaabb_cacacacacacacaca_acacacacacac_ca_caca_cacaca_ac_acac_acacac_abcabcabcabcabcab$") + res=get_vocab(0,t) + print() + for x in res: + print(x,end=" ") + print() + print(len(res)) \ No newline at end of file