Skip to content

Commit

Permalink
Instantiate parsers only once
Browse files: browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Oct 2, 2019
1 parent 0691799 commit 8bbee78
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 5 deletions.
36 changes: 36 additions & 0 deletions parsel/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from lxml import etree
from lxml.etree import XMLParser as _UnsafeXMLParser
from lxml.html import HTMLParser as _HTMLParser


class _LXMLBaseParser(object):

def __init__(self, parser_cls):
self._parser = parser_cls(recover=True, encoding='utf8')

def parse(self, text, base_url):
body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
root = etree.fromstring(body, parser=self._parser, base_url=base_url)
if root is None:
root = etree.fromstring(b'<html/>', parser=self._parser,
base_url=base_url)
return root


class HTMLParser(_LXMLBaseParser):
    """Parser wrapper preconfigured with ``lxml.html.HTMLParser``."""

    def __init__(self):
        base = super(HTMLParser, self)
        base.__init__(parser_cls=_HTMLParser)


class _XMLParser(_UnsafeXMLParser):
    """lxml ``XMLParser`` whose ``resolve_entities`` option defaults to False.

    A caller that passes ``resolve_entities`` explicitly keeps its own value.
    """

    def __init__(self, *args, **kwargs):
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(_XMLParser, self).__init__(*args, **kwargs)


class XMLParser(_LXMLBaseParser):
    """Parser wrapper preconfigured with the entity-safe ``_XMLParser``."""

    def __init__(self):
        base = super(XMLParser, self)
        base.__init__(parser_cls=_XMLParser)
3 changes: 3 additions & 0 deletions parsel/parser/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from parsel.parser import HTMLParser

# Module-level HTMLParser instance, created once when this module is imported.
HTML_PARSER = HTMLParser()
3 changes: 3 additions & 0 deletions parsel/parser/xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from parsel.parser import XMLParser

# Module-level XMLParser instance, created once when this module is imported.
XML_PARSER = XMLParser()
39 changes: 34 additions & 5 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,39 @@
"""

import sys
from importlib import import_module
from warnings import warn

import six
from lxml import etree, html
from lxml import etree

from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator


def _load_object(path):
"""Load an object given its absolute object path, and return it.
`path` can point to a class, function, variable or a class instance. For
example: ``'parsel.parser.html.HTML_PARSER'``.
"""

try:
dot = path.rindex('.')
except ValueError:
raise ValueError("Error loading object '%s': not a full path" % path)

module, name = path[:dot], path[dot+1:]
mod = import_module(module)

try:
obj = getattr(mod, name)
except AttributeError:
raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))

return obj


class CannotRemoveElementWithoutRoot(Exception):
    """Exception type for removing an element that has no root.

    NOTE(review): the raising code is outside this view; the description is
    taken from the class name only.
    """

Expand All @@ -21,14 +46,16 @@ class CannotRemoveElementWithoutParent(Exception):

class SafeXMLParser(etree.XMLParser):
    """Deprecated ``etree.XMLParser`` that defaults ``resolve_entities`` to False.

    Instantiating it emits a ``DeprecationWarning`` attributed to the caller.
    """

    def __init__(self, *args, **kwargs):
        warn(
            'parsel.selector.SafeXMLParser is deprecated',
            DeprecationWarning,
            stacklevel=2,
        )
        # Only fill in the default; an explicit caller choice is kept.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)

# Per-selector-type settings, keyed by the type name ('html' or 'xml').
# '_parser' is the dotted import path of a module-level parser instance; it is
# resolved with _load_object when a Selector is constructed. The remaining
# keys hold the CSS translator instance and the serialization method name
# used for that type.
_ctgroup = {
    'html': {
        '_parser': 'parsel.parser.html.HTML_PARSER',
        '_csstranslator': HTMLTranslator(),
        '_tostring_method': 'html',
    },
    'xml': {
        '_parser': 'parsel.parser.xml.XML_PARSER',
        '_csstranslator': GenericTranslator(),
        '_tostring_method': 'xml',
    },
}
Expand All @@ -46,6 +73,8 @@ def _st(st):
def create_root_node(text, parser_cls, base_url=None):
"""Create root node for text using given parser class.
"""
warn('parsel.selector.create_root_node is deprecated',
DeprecationWarning, stacklevel=2)
body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
parser = parser_cls(recover=True, encoding='utf8')
root = etree.fromstring(body, parser=parser, base_url=base_url)
Expand Down Expand Up @@ -198,7 +227,7 @@ class Selector(object):
def __init__(self, text=None, type=None, namespaces=None, root=None,
base_url=None, _expr=None):
self.type = st = _st(type or self._default_type)
self._parser = _ctgroup[st]['_parser']
self._parser = _load_object(_ctgroup[st]['_parser'])
self._csstranslator = _ctgroup[st]['_csstranslator']
self._tostring_method = _ctgroup[st]['_tostring_method']

Expand All @@ -219,7 +248,7 @@ def __getstate__(self):
raise TypeError("can't pickle Selector objects")

def _get_root(self, text, base_url=None):
return create_root_node(text, self._parser, base_url=base_url)
return self._parser.parse(text=text, base_url=base_url)

def xpath(self, query, namespaces=None, **kwargs):
"""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_deprecations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding:utf-8 -*-


from unittest import TestCase
from warnings import catch_warnings, simplefilter

from lxml.html import HTMLParser

from parsel.selector import create_root_node, SafeXMLParser


class TestDeprecations(TestCase):
    """Check that the deprecated ``parsel.selector`` helpers emit a warning."""

    def _assert_single_deprecation(self, caught):
        """Assert that exactly one DeprecationWarning was recorded."""
        self.assertEqual(len(caught), 1)
        self.assertTrue(issubclass(caught[0].category, DeprecationWarning))

    def test_create_root_node(self):
        with catch_warnings(record=True) as caught:
            # Force every warning to be recorded: the default filters (or a
            # previous hit at the same location) could otherwise hide it.
            simplefilter('always')
            create_root_node(u'…', HTMLParser)
        self._assert_single_deprecation(caught)

    def test_SafeXMLParser(self):
        with catch_warnings(record=True) as caught:
            simplefilter('always')
            SafeXMLParser()
        self._assert_single_deprecation(caught)
22 changes: 22 additions & 0 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from parsel import Selector
from parsel.selector import (
_load_object,
CannotRemoveElementWithoutRoot,
CannotRemoveElementWithoutParent,
)
Expand Down Expand Up @@ -913,3 +914,24 @@ def test_set(self):
//div[@itemtype="http://schema.org/Event"]
//*[@itemscope]/*/@itemprop)''').extract(),
[u'url', u'name', u'startDate', u'location', u'offers'])


# Compatibility shim: on interpreters where the ModuleNotFoundError builtin is
# not defined (the bare name lookup raises NameError), alias it to ImportError
# so the tests below can reference a single exception name.
try:
ModuleNotFoundError
except NameError:
ModuleNotFoundError = ImportError


class LoadObjectTestCase(unittest.TestCase):

def test_incomplete_path(self):
with self.assertRaises(ValueError):
object = _load_object('parsel')

def test_inexistent_module(self):
with self.assertRaises(ModuleNotFoundError):
object = _load_object('parsel.inexistent.inexistent')

def test_inexistent_object(self):
with self.assertRaises(NameError):
object = _load_object('parsel.parser.inexistent')

0 comments on commit 8bbee78

Please sign in to comment.