diff --git a/parsers/__init__.py b/parsers/__init__.py
index c19aafa9..c2132746 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -12,6 +12,7 @@
 cnn.CNNParser
 politico.PoliticoParser
 bbc.BBCParser
+nunl.NuNLParser
 """.split()
 
 parser_dict = {}
diff --git a/parsers/nunl.py b/parsers/nunl.py
new file mode 100644
index 00000000..bb7261e3
--- /dev/null
+++ b/parsers/nunl.py
@@ -0,0 +1,44 @@
+from baseparser import BaseParser
+from BeautifulSoup import BeautifulSoup, Tag
+
+
+class NuNLParser(BaseParser):
+    """Parser for news articles published on www.nu.nl."""
+
+    SUFFIX = ''
+    domains = ['www.nu.nl']
+
+    feeder_base = 'http://www.nu.nl/'
+    # Raw string with escaped dots so that '.' only matches a literal dot.
+    feeder_pat = r'^http://www\.nu\.nl/\w+/\d+/'
+
+    def _parse(self, html):
+        """Populate the article fields from the page HTML.
+
+        Sets meta, title, byline, date and body. The page is expected to
+        contain a 'header' div (with an <h1> and a 'dateplace-data' div)
+        and a 'content' div.
+        """
+        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
+                             fromEncoding='utf-8')
+
+        header = soup.find('div', 'header')
+
+        self.meta = soup.findAll('meta')
+        self.title = header.find('h1').getText()
+
+        # No byline is extracted for this source.
+        self.byline = ''
+
+        # Date of the last revision
+        self.date = header.find('div', 'dateplace-data').contents[2].lstrip()
+
+        content = soup.find('div', 'content')
+
+        # Keep only direct <h2>/<p> children of the content div; each one,
+        # including the last, is followed by a blank line.
+        self.body = ''.join(
+            child.getText() + '\n\n'
+            for child in content.childGenerator()
+            if isinstance(child, Tag) and child.name in ('h2', 'p')
+        )
diff --git a/website/frontend/models.py b/website/frontend/models.py
index 64170af0..929fdbc0 100644
--- a/website/frontend/models.py
+++ b/website/frontend/models.py
@@ -21,6 +21,7 @@ def strip_prefix(string, prefix):
                    'edition.cnn.com': 'CNN',
                    'www.bbc.co.uk': 'BBC',
                    'www.politico.com': 'Politico',
+                   'www.nu.nl': 'Nu.nl',
                    }
 
 ancient = datetime(1901, 1, 1)
diff --git a/website/frontend/views.py b/website/frontend/views.py
index 4b0a7487..3af169ff 100644
--- a/website/frontend/views.py
+++ b/website/frontend/views.py
@@ -95,7 +95,7 @@ def get_articles(source=None, distance=0):
     return articles
 
 
-SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk'.split() + ['']
+SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk nu.nl'.split() + ['']
 
 @cache_page(60 * 30) #30 minute cache
 def browse(request, source=''):