From 88e846103c1512d3d1065394ddb21b0dcaaea604 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sun, 14 Jul 2019 00:17:52 +0200 Subject: [PATCH] Replace all punctuation in title in split_wiki Fix #7 --- nlputils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nlputils.py b/nlputils.py index e72dfa4..80f4277 100644 --- a/nlputils.py +++ b/nlputils.py @@ -1,5 +1,6 @@ from fastai.basics import * import re +import string def get_wiki(path,lang): @@ -35,13 +36,14 @@ def split_wiki(path,lang): dest.mkdir(exist_ok=True, parents=True) title_re = re.compile(rf'') + punctuation_re = re.compile(f'[{re.escape(string.punctuation)}]') lines = (path/name).open() f=None for i,l in enumerate(lines): if i%100000 == 0: print(i) if l.startswith('