From 3d115644657bfeb44d7be5a5a083cedabf577c67 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Sat, 13 Jul 2019 22:44:28 +0200
Subject: [PATCH] Force 'utf-8' encoding without relying on platform-dependent default

On Windows, the default encoding is 'cp1252' and this raises a UnicodeDecodeError.

Fix #5
---
 nlputils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nlputils.py b/nlputils.py
index e72dfa4..5a875f3 100644
--- a/nlputils.py
+++ b/nlputils.py
@@ -26,7 +26,7 @@ def get_wiki(path,lang):
     shutil.rmtree(path/'text')
 
 
-def split_wiki(path,lang):
+def split_wiki(path,lang,encoding='utf-8'):
     dest = path/'docs'
     name = f'{lang}wiki'
     if dest.exists():
@@ -35,7 +35,7 @@ def split_wiki(path,lang):
 
     dest.mkdir(exist_ok=True, parents=True)
     title_re = re.compile(rf'')
-    lines = (path/name).open()
+    lines = (path/name).open(encoding=encoding)
     f=None
 
     for i,l in enumerate(lines):
@@ -44,7 +44,7 @@ def split_wiki(path,lang):
             title = title_re.findall(l)[0].replace('/','_')
             if len(title)>150: continue
             if f: f.close()
-            f = (dest/f'{title}.txt').open('w')
+            f = (dest/f'{title}.txt').open('w', encoding=encoding)
         else: f.write(l)
     f.close()
     return dest