diff --git a/nlputils.py b/nlputils.py index e72dfa4..80f4277 100644 --- a/nlputils.py +++ b/nlputils.py @@ -1,5 +1,6 @@ from fastai.basics import * import re +import string def get_wiki(path,lang): @@ -35,13 +36,14 @@ def split_wiki(path,lang): dest.mkdir(exist_ok=True, parents=True) title_re = re.compile(rf'') + punctuation_re = re.compile(f'[{re.escape(string.punctuation)}]') lines = (path/name).open() f=None for i,l in enumerate(lines): if i%100000 == 0: print(i) if l.startswith('