diff --git a/parsers/__init__.py b/parsers/__init__.py
index c19aafa9..5a2ac23d 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -12,6 +12,7 @@
 cnn.CNNParser
 politico.PoliticoParser
 bbc.BBCParser
+nosnl.NOSNLParser
 """.split()
 
 parser_dict = {}
diff --git a/parsers/nosnl.py b/parsers/nosnl.py
new file mode 100644
index 00000000..ea06e9f2
--- /dev/null
+++ b/parsers/nosnl.py
@@ -0,0 +1,37 @@
+from baseparser import BaseParser
+from BeautifulSoup import BeautifulSoup, Tag
+
+
+class NOSNLParser(BaseParser):
+    domains = ['www.nos.nl']
+
+    feeder_base = 'http://www.nos.nl'
+    feeder_pat = '^http://www.nos.nl/artikel/'
+
+    def _parse(self, html):
+        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
+                             fromEncoding='utf-8')
+
+        self.meta = soup.findAll('meta')
+
+        article = soup.find('div', id='article')
+        self.title = article.find('h1').getText()
+
+        article_content = soup.find('div', id='article-content')
+
+        # NOS articles carry no byline; use the page's last-modified
+        # timestamp as the article date instead.
+        self.byline = ''
+        self.date = article_content.find('abbr', 'page-last-modified').getText()
+
+        # The body is the text of the <p> tags that are direct children
+        # of the article-content div; everything else is skipped.
+        self.body = ''
+        for i in article_content.childGenerator():
+            if not isinstance(i, Tag):
+                continue
+            if i.name != 'p':
+                continue
+
+            self.body += i.getText() + '\n\n'
+
diff --git a/website/frontend/models.py b/website/frontend/models.py
index 64170af0..0c9b5e36 100644
--- a/website/frontend/models.py
+++ b/website/frontend/models.py
@@ -21,6 +21,7 @@ def strip_prefix(string, prefix):
     'edition.cnn.com': 'CNN',
     'www.bbc.co.uk': 'BBC',
     'www.politico.com': 'Politico',
+    'www.nos.nl': 'NOS.nl',
 }
 
 ancient = datetime(1901, 1, 1)
diff --git a/website/frontend/views.py b/website/frontend/views.py
index 4b0a7487..daead0bf 100644
--- a/website/frontend/views.py
+++ b/website/frontend/views.py
@@ -95,7 +95,7 @@ def get_articles(source=None, distance=0):
     return articles
 
 
-SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk'.split() + ['']
+SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk nos.nl'.split() + ['']
 
 @cache_page(60 * 30) #30 minute cache
 def browse(request, source=''):
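
A quick way to sanity-check the new parser against real pages is a minimal offline smoke test. This is a sketch, not part of the patch: sample_artikel.html is a hypothetical filename for a NOS article page saved by hand, the script is assumed to run from the repository root so the parsers package imports, and it deliberately bypasses BaseParser.__init__ (not shown in this diff, and assumed to fetch the live page over the network) by subclassing.

    # Offline smoke test for NOSNLParser -- a sketch, not part of the patch.
    from parsers.nosnl import NOSNLParser


    class OfflineNOSNLParser(NOSNLParser):
        # Skip BaseParser.__init__ (assumed to do network fetching) and
        # feed the parser saved HTML directly.
        def __init__(self, html):
            self._parse(html)


    # 'sample_artikel.html' is a hypothetical saved copy of a NOS article.
    with open('sample_artikel.html') as f:
        parser = OfflineNOSNLParser(f.read())

    print parser.title.encode('utf-8')
    print parser.date.encode('utf-8')
    print parser.body.encode('utf-8')

If the title, date, and body come out sensibly for a few different articles, the CSS hooks the parser relies on (div#article, div#article-content, abbr.page-last-modified) are holding up.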