You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When I use `p = ttp.Parser(); p.parse(tweettext, html=False)`, I get exceptions for some tweets because of invalid HTML character formatting:
Traceback (most recent call last):
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 34, in <module>
extract_tweet_tags("twitter12051154249.txt")
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 28, in extract_tweet_tags
result = ttp.Parser().parse(tweet.strip(), html=True)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 131, in parse
parsed_html = self._html(text) if html else self._text(text)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 148, in _html
return HASHTAG_REGEX.sub(self._parse_tags, html)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 249, in _parse_tags
return '%s%s' % (pre, self.format_tag(tag, text))
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 270, in format_tag
% (urllib.quote('#' + text.encode('utf-8')), tag, text)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa5 in position 0: invalid start byte
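I fixed it by adding one line to parse() that stores the html flag in self._is_html, and by changing the check in _parse_tags() to use that flag instead of self._html (self._html is a method, so the old check was always true):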
def parse(self, text, html=True):
    '''Parse text and return a ParseResult instance.

    Every call starts with empty URL, user, list and tag accumulators.
    When html is true the text is run through self._html, otherwise
    through self._text; the outcome is handed to ParseResult together
    with the accumulators and the reply value (first group of the
    REPLY_REGEX match, or None when the text does not match).
    '''
    # Reset the per-call accumulators.
    self._urls, self._users, self._lists, self._tags = [], [], [], []
    # Added to fix a bug: keep the requested output mode on the instance
    # so other methods can check it.
    self._is_html = html

    reply_match = REPLY_REGEX.match(text)
    if reply_match is None:
        reply = None
    else:
        reply = reply_match.groups(0)[0]

    if html:
        parsed_html = self._html(text)
    else:
        parsed_html = self._text(text)

    return ParseResult(self._urls, self._users, reply, self._lists,
                       self._tags, parsed_html)
mat = match.group(0)
# Fix problems with the regex capturing stuff infront of the #
tag = None
for i in u'#\uff03':
pos = mat.rfind(i)
if pos != -1:
tag = i
break
pre, text = mat[:pos], mat[pos + 1:]
if self._include_spans:
span = match.span(0)
# add an offset if pre is e.g. ' '
span = (span[0] + len(pre), span[1])
self._tags.append((text, span))
else:
self._tags.append(text)
if self._is_html: #self._html: changed to fix a bug
return '%s%s' % (pre, self.format_tag(tag, text))
The text was updated successfully, but these errors were encountered:
Hi
When I use `p = ttp.Parser(); p.parse(tweettext, html=False)`, I get exceptions for some tweets because of invalid HTML character formatting:
Traceback (most recent call last):
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 34, in <module>
extract_tweet_tags("twitter12051154249.txt")
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 28, in extract_tweet_tags
result = ttp.Parser().parse(tweet.strip(), html=True)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 131, in parse
parsed_html = self._html(text) if html else self._text(text)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 148, in _html
return HASHTAG_REGEX.sub(self._parse_tags, html)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 249, in _parse_tags
return '%s%s' % (pre, self.format_tag(tag, text))
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 270, in format_tag
% (urllib.quote('#' + text.encode('utf-8')), tag, text)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa5 in position 0: invalid start byte
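I fixed it by adding one line to parse() that stores the html flag in self._is_html, and by changing the check in _parse_tags() to use that flag instead of self._html (self._html is a method, so the old check was always true):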
def parse(self, text, html=True):
'''Parse the text and return a ParseResult instance.'''
self._urls = []
self._users = []
self._lists = []
self._tags = []
self._is_html = html #added to fix a bug
def _parse_tags(self, match):
'''Parse hashtags.'''
The text was updated successfully, but these errors were encountered: