From eaca4337d972edfe1d44a93e2d93701dbab98766 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 19 Jul 2016 15:45:21 +0100 Subject: [PATCH 1/4] Handle unicode text --- theanets/recurrent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theanets/recurrent.py b/theanets/recurrent.py index e1ceee5..c238524 100644 --- a/theanets/recurrent.py +++ b/theanets/recurrent.py @@ -88,7 +88,7 @@ def __init__(self, text, alpha=None, min_count=2, unknown='\0'): char for char, count in collections.Counter(text).items() if char != unknown and count >= min_count))) - self.text = re.sub(r'[^{}]'.format(re.escape(self.alpha)), unknown, text) + self.text = re.sub(ur'[^{}]'.format(re.escape(self.alpha)), unknown, text) assert unknown not in self.alpha self._rev_index = unknown + self.alpha self._fwd_index = dict(zip(self._rev_index, range(1 + len(self.alpha)))) From d107a0b1dec5e655dcabcbfc7ce71a25cf740bba Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 19 Jul 2016 16:43:41 +0100 Subject: [PATCH 2/4] Fix for 3.4 --- theanets/recurrent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theanets/recurrent.py b/theanets/recurrent.py index c238524..3e9b5fe 100644 --- a/theanets/recurrent.py +++ b/theanets/recurrent.py @@ -88,7 +88,7 @@ def __init__(self, text, alpha=None, min_count=2, unknown='\0'): char for char, count in collections.Counter(text).items() if char != unknown and count >= min_count))) - self.text = re.sub(ur'[^{}]'.format(re.escape(self.alpha)), unknown, text) + self.text = re.sub(r'[^{}]'.format(re.escape(self.alpha)).encode('utf8'), unknown, text) assert unknown not in self.alpha self._rev_index = unknown + self.alpha self._fwd_index = dict(zip(self._rev_index, range(1 + len(self.alpha)))) From 27c016d73b2ed531e8f3e6415d7efe7e8a32c486 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Tue, 19 Jul 2016 21:17:59 +0100 Subject: [PATCH 3/4] Fixes utf8 --- theanets/recurrent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theanets/recurrent.py b/theanets/recurrent.py index 3e9b5fe..d3cde06 100644 --- a/theanets/recurrent.py +++ b/theanets/recurrent.py @@ -88,7 +88,7 @@ def __init__(self, text, alpha=None, min_count=2, unknown='\0'): char for char, count in collections.Counter(text).items() if char != unknown and count >= min_count))) - self.text = re.sub(r'[^{}]'.format(re.escape(self.alpha)).encode('utf8'), unknown, text) + self.text = re.sub(r'[^{}]'.format(re.escape(self.alpha).encode('utf8')), unknown, text) assert unknown not in self.alpha self._rev_index = unknown + self.alpha self._fwd_index = dict(zip(self._rev_index, range(1 + len(self.alpha)))) From 22360a238d357ca752537d08229e05596419d10f Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Wed, 20 Jul 2016 14:45:59 +0100 Subject: [PATCH 4/4] Use unicode object --- theanets/recurrent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theanets/recurrent.py b/theanets/recurrent.py index d3cde06..13f328b 100644 --- a/theanets/recurrent.py +++ b/theanets/recurrent.py @@ -81,14 +81,14 @@ class Text(object): A string containing each character in the alphabet. ''' - def __init__(self, text, alpha=None, min_count=2, unknown='\0'): + def __init__(self, text, alpha=None, min_count=2, unknown=u'\0'): self.alpha = alpha if self.alpha is None: self.alpha = ''.join(sorted(set( char for char, count in collections.Counter(text).items() if char != unknown and count >= min_count))) - self.text = re.sub(r'[^{}]'.format(re.escape(self.alpha).encode('utf8')), unknown, text) + self.text = re.sub(unicode(r'[^{}]', 'utf-8').format(re.escape(self.alpha)), unknown, text) assert unknown not in self.alpha self._rev_index = unknown + self.alpha self._fwd_index = dict(zip(self._rev_index, range(1 + len(self.alpha))))