Backend/parsers #29

Open

wants to merge 75 commits into base: master
Commits (75)
573fc22
added gitignore and modified manage.py
anjakammer Apr 15, 2015
c6e220e
Merge pull request #2 from NewsdiffsDE/Setup/Project
thomaspuppe Apr 15, 2015
a02e0c1
First Version of the Zeit Online Parser
S3TH22 Apr 26, 2015
7e19cba
enhanced Baseparser
S3TH22 Apr 26, 2015
d0e77a2
modified frontend for new scrapers
anjakammer Apr 26, 2015
aeec6f6
parsing fails if change occurs
anjakammer Apr 26, 2015
49ee7b6
displays just German platforms
anjakammer Apr 27, 2015
674ef0a
translated article_history_missing.html
Relana Apr 28, 2015
b1d1b7c
Update 404.html
Relana Apr 28, 2015
5f49eb0
translated 500.html
Relana Apr 28, 2015
3919048
added Welt.de-Parser(seems to be working)
S3TH22 Apr 28, 2015
6082ecd
translated about.html
Relana Apr 28, 2015
1ca3ea8
translated article_history.html
Relana Apr 28, 2015
7abf9ca
translated article_history.xml
Relana Apr 28, 2015
89e4aad
translated browse.html
Relana Apr 28, 2015
416b2e7
translated browse_base.html
Relana Apr 28, 2015
e600556
translated contact.html
Relana Apr 28, 2015
9ba5037
translated diffview.html
Relana Apr 28, 2015
336fed6
translated part of examples.html
Relana Apr 28, 2015
518efb0
translated feed.xml
Relana Apr 28, 2015
7d79ccd
translated find_by_uri.html
Relana Apr 28, 2015
6e0bd13
edited front.html
Relana Apr 28, 2015
10afc18
translated navigation
Relana Apr 28, 2015
14f67a4
translated upvote.html
Relana Apr 28, 2015
d76b89d
optimized grabbing
anjakammer Apr 29, 2015
d457383
modified welt.py
anjakammer Apr 29, 2015
f1d9eda
working on bild.de
anjakammer Apr 29, 2015
9a1c85c
added more 'content' pages
anjakammer Apr 29, 2015
410b7cb
modified bild.py
anjakammer Apr 29, 2015
28d69ca
minor changes like href fixes
Relana Apr 29, 2015
1fd2e30
corrected Twitter handle and URL
Relana Apr 29, 2015
5b142d1
deleted images in about and front
Relana May 1, 2015
50bb43f
Create webhook.txt
mgummich May 4, 2015
e7560e6
Update webhook.txt
mgummich May 4, 2015
f4dbacb
Delete webhook.txt
mgummich May 4, 2015
0ddf9e0
corrected some links, deleted unused pages/routing
anjakammer May 4, 2015
d1004ed
deleted unused links
anjakammer May 4, 2015
0655d21
deleted contactForm script
anjakammer May 4, 2015
9caa86b
Merge pull request #3 from NewsdiffsDE/Frontend/ContentTranslation
anjakammer May 4, 2015
ba9cc94
Merge branch 'master' of https://github.com/NewsdiffsDE/newsdiffs int…
anjakammer May 4, 2015
8d90efa
Merge pull request #4 from NewsdiffsDE/master
anjakammer May 4, 2015
477147b
modified Bild/Focus Parser
anjakammer May 4, 2015
e5407a2
focus parser works without errors
anjakammer May 4, 2015
0df0199
added temp db file to gitignore
anjakammer May 4, 2015
e9f784b
kicked panorama from focus and modified bild.py
anjakammer May 4, 2015
a050100
bildParser works
anjakammer May 4, 2015
c298c41
deleted upvote function use
anjakammer May 4, 2015
d51e8d3
Merge pull request #5 from NewsdiffsDE/bugfix/upvote
anjakammer May 4, 2015
6854217
Merge branch 'dev' of https://github.com/NewsdiffsDE/newsdiffs into B…
anjakammer May 4, 2015
b10eb8c
5 Parsers complete, tested with test parser and Test-URLs. Might stil…
S3TH22 May 5, 2015
8c0a659
modified baseparser, new method-> remove_non_content
anjakammer May 5, 2015
fc43207
stern parser does not get article text
anjakammer May 5, 2015
8093037
stern parser works
anjakammer May 6, 2015
cce1e22
Added byline-attribute for parsers
S3TH22 May 6, 2015
b6e9d24
completed stern.de parser
anjakammer May 8, 2015
6b521e7
completed spiegel.de parser
anjakammer May 8, 2015
c4f3061
Merge branch 'Backend/Parsers_6-10' of https://github.com/NewsdiffsDE…
S3TH22 May 11, 2015
7532011
Fixed parsers crashing with false articles
S3TH22 May 12, 2015
245baa0
merged
anjakammer May 12, 2015
3ab60c6
Merge branch 'Backend/Parser/Spiegel' of https://github.com/Newsdiffs…
anjakammer May 12, 2015
9ad0dcd
finished CodeReview, zeit.de was tested
anjakammer May 12, 2015
f9b37a8
FAZ completed, baseparser refactoring
anjakammer May 12, 2015
504a063
CodeReview completed, welt and ntv do not run
anjakammer May 12, 2015
8830c2c
modified view for all Parsers
anjakammer May 12, 2015
69a66c2
removed 'ü' in Süddeutsche
anjakammer May 12, 2015
8054c76
refactored all parsers
anjakammer May 12, 2015
d04361d
reduced faz paths
anjakammer May 12, 2015
b145298
refactored faz
anjakammer May 12, 2015
a9f1fc3
modified welt.py, author infos cannot be stripped
anjakammer May 19, 2015
53c0c9d
optimized n-tv parser
anjakammer May 19, 2015
11283be
scraper has all parsers
anjakammer May 19, 2015
5014f71
N-TV-Parsers URL-RegEx-Pattern only recognized articles with the requ…
S3TH22 Jun 8, 2015
8904776
Mediathek recognition improved, date parsing corrected
S3TH22 Jun 8, 2015
7e7e576
URL list is now a set, so redundant links are dropped.
S3TH22 Jun 8, 2015
5b64531
N-TV fixed
S3TH22 Jun 10, 2015
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,4 @@
/.idea
/articles

*~
@@ -30,4 +31,5 @@ pip-log.txt
.mr.developer.cfg

newsdiffs.db
newsdiffs.db-journal
database_settings.py
40 changes: 40 additions & 0 deletions parsers/RPOnline.py
@@ -0,0 +1,40 @@
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class RPOParser(BaseParser):
    domains = ['www.rp-online.de']

    feeder_pat = '1\.\d*$'
    feeder_pages = ['http://www.rp-online.de/']

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')
        #article headline
        elt = soup.find('meta', {'property': 'og:title'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt['content']
        # byline / author
        author = soup.find('meta', {'itemprop': 'author'})
        self.byline = author['content'] if author else ''
        # article date
        created_at = soup.find('meta', {'property': 'vr:published_time'})
        self.date = created_at['content'] if created_at else ''
        #article content
        div = soup.find('div', {'class': 'main-text '})
        intro = soup.find('div', {'class': 'first intro'})
        if intro is None:
            intro = ''
        else:
            intro = intro.find('strong').getText()
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        self.body = intro
        self.body += '\n' + '\n\n'.join([x.getText() for x in div.childGenerator()
                                         if isinstance(x, Tag) and x.name == 'p'])
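As a side note for reviewers: the snippet below is not part of the diff. It is a minimal sketch of the shared meta-tag lookup these parsers rely on, run against a made-up inline document with the same BeautifulSoup 3 calls, and it guards against a missing tag before indexing it. All names and values in it are illustrative only.

from BeautifulSoup import BeautifulSoup

# Illustrative only -- the HTML snippet and its values are made up.
html = '''<html><head>
<meta property="og:title" content="Beispielartikel" />
<meta itemprop="author" content="Erika Mustermann" />
</head><body><div class="main-text "><p>Erster Absatz.</p></div></body></html>'''

soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
title_tag = soup.find('meta', {'property': 'og:title'})
title = title_tag['content'] if title_tag else None  # check before indexing
author_tag = soup.find('meta', {'itemprop': 'author'})
byline = author_tag['content'] if author_tag else ''
print title, byline  # Beispielartikel Erika Mustermann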
29 changes: 23 additions & 6 deletions parsers/__init__.py
@@ -4,15 +4,32 @@
# - create a parser class in another file, based off (say) bbc.BBCParser
# - add it to parsers (below)
# Test with test_parser.py

# List of parsers to import and use based on parser.domains

"""
sueddeutsche.SDParser
stern.SternParser
bild.BildParser
focus.FocusParser
spiegel.SpiegelParser
zeit.ZeitParser
RPOnline.RPOParser
faz.FAZParser
n-tv.NTVParser
welt.WeltParser
"""

parsers = """
nyt.NYTParser
cnn.CNNParser
politico.PoliticoParser
bbc.BBCParser
washpo.WashPoParser
sueddeutsche.SDParser
stern.SternParser
bild.BildParser
focus.FocusParser
spiegel.SpiegelParser
zeit.ZeitParser
RPOnline.RPOParser
faz.FAZParser
n-tv.NTVParser
welt.WeltParser
""".split()

parser_dict = {}
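For reference, the recipe in the header comment above ("create a parser class in another file ... add it to parsers ... Test with test_parser.py") translates into roughly the skeleton below. The module name, domain, URL pattern and CSS class are hypothetical placeholders, not code from this pull request.

from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup


class ExampleParser(BaseParser):
    # Placeholder values for illustration only.
    domains = ['www.example.de']

    feeder_pat = '^http://www.example.de/nachrichten/'
    feeder_pages = ['http://www.example.de/']

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        elt = soup.find('meta', {'property': 'og:title'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt['content']
        self.byline = ''
        self.date = ''
        div = soup.find('div', {'class': 'article-text'})  # hypothetical content container
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        self.body = '\n'.join(p.getText() for p in div.findAll('p'))

The new module would then be listed in the parsers string above (for example as example.ExampleParser) and exercised with test_parser.py, as the header comment suggests.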
12 changes: 11 additions & 1 deletion parsers/baseparser.py
@@ -5,6 +5,7 @@
import sys
import time
import urllib2
from BeautifulSoup import BeautifulSoup, Comment

# Define a logger

@@ -153,4 +154,13 @@ def feed_urls(cls):

        all_urls = all_urls + [url for url in urls if
                               re.search(cls.feeder_pat, url)]
        return all_urls
        return set(all_urls)

    #removes all non-content
    def remove_non_content(self, html):
        map(lambda x: x.extract(), html.findAll('script'))
        map(lambda x: x.extract(), html.findAll('style'))
        map(lambda x: x.extract(), html.findAll('embed'))
        comments = html.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        return html
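The change to feed_urls above returns set(all_urls) instead of a list, so a link collected from several feeder pages (or matched twice on one page) is only kept once. A toy illustration, with made-up URLs:

# Made-up URLs, purely to show the deduplication set() provides.
urls = ['http://www.n-tv.de/politik/beispiel-article101.html',
        'http://www.n-tv.de/politik/beispiel-article101.html',
        'http://www.n-tv.de/wirtschaft/beispiel-article202.html']
print len(urls), len(set(urls))  # 3 2 -- the duplicate collapses

Callers that only iterate over the result are unaffected; anything that relied on list order or indexing would need a second look.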
33 changes: 0 additions & 33 deletions parsers/bbc.py

This file was deleted.

46 changes: 46 additions & 0 deletions parsers/bild.py
@@ -0,0 +1,46 @@
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup


class BildParser(BaseParser):
    SUFFIX = ''
    domains = ['www.bild.de']

    feeder_pat = '^http://www.bild.de/(politik|regional|geld|digital/[a-z])'
    feeder_pages = ['http://www.bild.de/politik/startseite',
                    'http://www.bild.de/geld/startseite/',
                    'http://www.bild.de/regional/startseite/',
                    'http://www.bild.de/digital/startseite/']

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')

        self.meta = soup.findAll('meta')
        #article headline
        try:
            elt = soup.find('meta', {'property': 'og:title'})['content']
            self.title = elt
        except:
            self.real_article = False
            return

        # byline / author
        author = soup.find('div', {'itemprop':'author'})
        self.byline = author.getText() if author else ''
        # article date
        created_at = soup.find('div', {'class': 'date'})
        self.date = created_at.getText() if created_at else ''
        #article content
        div = soup.find('div', {'itemprop':'articleBody isFamilyFriendly'})
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        map(lambda x: x.extract(), div.findAll('div', {'class':'infoEl center edge'}))  # commercials
        text = ''
        p = div.findAll('p')
        for txt in p:
            text += txt.getText() + '\n'
        self.body = text

38 changes: 0 additions & 38 deletions parsers/cnn.py

This file was deleted.

48 changes: 48 additions & 0 deletions parsers/faz.py
@@ -0,0 +1,48 @@
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class FAZParser(BaseParser):
    domains = ['www.faz.net']

    feeder_pat = 'aktuell/.*\.html$'
    feeder_pages = ['http://www.faz.net/aktuell/finanzen',
                    'http://www.faz.net/aktuell/gesellschaft',
                    'http://www.faz.net/aktuell/politik',
                    'http://www.faz.net/aktuell/wirtschaft',
                    'http://www.faz.net/aktuell/wissen',
                    'http://www.faz.net/aktuell/feuilleton',
                    ]

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')
        #article headline
        elt = soup.find('meta', {'property': 'og:title'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt['content']
        # byline / author
        author = soup.find('meta', {'name': 'author'})
        self.byline = author['content'] if author else ''
        # article date
        created_at = soup.find('meta', {'name': 'DC.date.issued'})
        self.date = created_at['content'] if created_at else ''
        #article content
        div = soup.find('div', 'FAZArtikelContent')
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        map(lambda x: x.extract(), div.findAll('span', {'class':'autorBox clearfix'}))  # Author description
        map(lambda x: x.extract(), div.findAll('p', {'class':'WeitereBeitraege'}))  # more articles like that one
        map(lambda x: x.extract(), div.findAll('ul', {'class':'WBListe'}))  # other articles from this author

        div = div.find('div', {'class': ''})
        if hasattr(div, "childGenerator"):
            self.body = '\n' + '\n\n'.join([x.getText() for x in div.childGenerator()
                                            if isinstance(x, Tag) and x.name == 'p'])
        else:
            self.real_article = False
44 changes: 44 additions & 0 deletions parsers/focus.py
@@ -0,0 +1,44 @@
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup


class FocusParser(BaseParser):
    SUFFIX = '?drucken=1'
    domains = ['www.focus.de']

    feeder_pat = '^http://www.focus.de/(politik|finanzen|gesundheit|wissen)'
    feeder_pages = ['http://www.focus.de/']

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')

        self.meta = soup.findAll('meta')
        #article headline
        elt = soup.find('h1')
        if elt is None:
            self.real_article = False
            return
        self.title = elt.getText()
        # byline / author
        try:
            author = soup.find('a', {'rel':'author'}).text
        except:
            author = ''
        self.byline = author
        # article date
        created_at = soup.find('meta', {'name':'date'})
        self.date = created_at['content'] if created_at else ''
        #article content
        self.body = ''
        div = soup.find('div', 'articleContent')
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        map(lambda x: x.extract(), div.findAll('div', {'class':'adition'}))  # focus
        text = ''
        p = div.findAll('p')
        for txt in p:
            text += txt.getText() + '\n'
        self.body = text
42 changes: 42 additions & 0 deletions parsers/n-tv.py
@@ -0,0 +1,42 @@
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class NTVParser(BaseParser):
    domains = ['www.n-tv.de']

    feeder_pat = '^http://www.n-tv.de/(politik|wirtschaft|panorama|technik|wissen)/.*article\d*'
    feeder_pages = ['http://www.n-tv.de']

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')
        # Remove any potential "rogue" video articles that bypass the URL check
        try:
            if 'Mediathek' in soup.find('title').getText():
                self.real_article = False
                return
        except:
            pass
        #article headline
        elt = soup.find('h1', {'class': 'h1'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt.getText()
        # byline / author
        author = soup.find('p', {'class': 'author'})
        self.byline = author.getText() if author else ''
        # article date
        created_at = soup.find('div', {'itemprop': 'datePublished'})
        self.date = created_at['content'] if created_at else ''
        #article content
        div = soup.find('div', {'class': 'content'})
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        map(lambda x: x.extract(), div.findAll('p', {'class': 'author'}))
        self.body = '\n' + '\n\n'.join([x.getText() for x in div.childGenerator()
                                        if isinstance(x, Tag) and x.name == 'p'])