ecprice · flupzor · Apr 6, 2013
diff --git a/parsers/__init__.py b/parsers/__init__.py
@@ -12,6 +12,7 @@
 cnn.CNNParser
 politico.PoliticoParser
 bbc.BBCParser
+nunl.NuNLParser
 """.split()
 
 parser_dict = {}

diff --git a/parsers/nunl.py b/parsers/nunl.py
@@ -0,0 +1,55 @@
+from baseparser import BaseParser
+from BeautifulSoup import BeautifulSoup, Tag
+
+
+class NuNLParser(BaseParser):
+    """Parser for article pages on www.nu.nl.
+
+    ``_parse`` fills in ``meta``, ``title``, ``byline``, ``date`` and
+    ``body`` from the HTML of a single article page.
+    """
+
+    SUFFIX = ''
+    domains = ['www.nu.nl']
+
+    feeder_base = 'http://www.nu.nl/'
+    # Raw string so the regex escapes reach the regex engine untouched;
+    # the dots are escaped so they match only a literal '.'.
+    feeder_pat = r'^http://www\.nu\.nl/\w+/\d+/'
+
+    def _parse(self, html):
+        """Extract the article fields from ``html`` and store them on self.
+
+        Sets ``self.meta`` (all <meta> tags), ``self.title`` (text of the
+        <h1> inside the div with class 'header'), ``self.byline`` (always
+        empty), ``self.date`` and ``self.body`` (text of the <h2>/<p> tags
+        that are direct children of the div with class 'content', each
+        followed by a blank line).
+
+        NOTE(review): a page missing any of these elements raises
+        (AttributeError / IndexError) rather than being skipped -- confirm
+        that this is what callers expect for non-article pages.
+        """
+        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
+                             fromEncoding='utf-8')
+
+        header = soup.find('div', 'header')
+
+        self.meta = soup.findAll('meta')
+        self.title = header.find('h1').getText()
+        self.byline = ''
+
+        # Date of the last revision: the third child node of the
+        # 'dateplace-data' div (presumably a text node -- verify against
+        # the page markup).
+        self.date = header.find('div', 'dateplace-data').contents[2].lstrip()
+
+        content = soup.find('div', 'content')
+
+        # Only top-level headings and paragraphs make up the body; text
+        # nodes and all other tags are skipped.
+        self.body = ''.join(
+            node.getText() + '\n\n'
+            for node in content.childGenerator()
+            if isinstance(node, Tag) and node.name in ('h2', 'p')
+        )
diff --git a/website/frontend/models.py b/website/frontend/models.py
@@ -21,6 +21,7 @@ def strip_prefix(string, prefix):
                    'edition.cnn.com': 'CNN',
                    'www.bbc.co.uk': 'BBC',
                    'www.politico.com': 'Politico',
+                   'www.nu.nl': 'Nu.nl',
                    }
 
 ancient = datetime(1901, 1, 1)

diff --git a/website/frontend/views.py b/website/frontend/views.py
@@ -95,7 +95,7 @@ def get_articles(source=None, distance=0):
     return articles
 
 
-SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk'.split() + ['']
+SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk nu.nl'.split() + ['']
 
 @cache_page(60 * 30)  #30 minute cache
 def browse(request, source=''):