from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class RPOParser(BaseParser):
    """Parser for articles on www.rp-online.de."""

    domains = ['www.rp-online.de']

    # Matched with re.search: keeps links whose URL ends in "1.<digits>".
    feeder_pat = r'1\.\d*$'
    feeder_pages = ['http://www.rp-online.de/']

    @staticmethod
    def _meta_content(soup, attrs):
        """Return the ``content`` of the first matching <meta> tag, or ''.

        A missing tag or a missing ``content`` attribute both yield ''
        instead of raising, so callers can test the result for truthiness.
        """
        tag = soup.find('meta', attrs)
        if tag is None:
            return ''
        return tag.get('content') or ''

    def _parse(self, html):
        """Fill title, byline, date and body from the article HTML.

        Sets ``self.real_article = False`` and returns early when the page
        has no ``og:title`` meta tag or no main text container.
        """
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')

        # Article headline: look the tag up first, then read its content,
        # so a page without og:title is rejected instead of crashing.
        title = self._meta_content(soup, {'property': 'og:title'})
        if not title:
            self.real_article = False
            return
        self.title = title

        # Byline / author and article date are optional.
        self.byline = self._meta_content(soup, {'itemprop': 'author'})
        self.date = self._meta_content(soup,
                                       {'property': 'vr:published_time'})

        # Article content.
        # NOTE(review): the trailing space in 'main-text ' is kept from the
        # original; presumably it mirrors the site's markup - verify.
        div = soup.find('div', {'class': 'main-text '})
        if div is None:
            self.real_article = False
            return

        # Optional intro: the <strong> text inside the "first intro" div.
        intro_text = ''
        intro_div = soup.find('div', {'class': 'first intro'})
        if intro_div is not None:
            strong = intro_div.find('strong')
            if strong is not None:
                intro_text = strong.getText()

        div = self.remove_non_content(div)
        paragraphs = [child.getText() for child in div.childGenerator()
                      if isinstance(child, Tag) and child.name == 'p']
        self.body = intro_text + '\n' + '\n\n'.join(paragraphs)
def remove_non_content(self, html):
    """Strip script, style and embed tags plus HTML comments from *html*.

    *html* is modified in place (every matching node is detached with
    ``extract()``) and the same object is returned for convenience.

    Explicit loops are used instead of ``map``/list comprehensions: the
    calls are made purely for their side effect, and ``map`` is lazy on
    Python 3, where it would silently remove nothing.
    """
    for tag_name in ('script', 'style', 'embed'):
        for node in html.findAll(tag_name):
            node.extract()
    # Comment nodes are text nodes, so they are selected via a predicate.
    for comment in html.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return html
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup


class BildParser(BaseParser):
    """Parser for articles on www.bild.de."""

    SUFFIX = ''
    domains = ['www.bild.de']

    feeder_pat = '^http://www.bild.de/(politik|regional|geld|digital/[a-z])'
    feeder_pages = ['http://www.bild.de/politik/startseite',
                    'http://www.bild.de/geld/startseite/',
                    'http://www.bild.de/regional/startseite/',
                    'http://www.bild.de/digital/startseite/']

    def _parse(self, html):
        """Fill title, byline, date and body from the article HTML.

        Sets ``self.real_article = False`` and returns early when the
        headline meta tag or the article body container is missing.
        """
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')

        self.meta = soup.findAll('meta')

        # Article headline. Only the two expected failures are caught:
        # TypeError (no such tag, find() returned None) and KeyError (tag
        # without a content attribute). Anything else should surface.
        try:
            self.title = soup.find('meta', {'property': 'og:title'})['content']
        except (TypeError, KeyError):
            self.real_article = False
            return

        # Byline / author
        author = soup.find('div', {'itemprop': 'author'})
        self.byline = author.getText() if author is not None else ''

        # Article date
        created_at = soup.find('div', {'class': 'date'})
        self.date = created_at.getText() if created_at is not None else ''

        # Article content
        div = soup.find('div', {'itemprop': 'articleBody isFamilyFriendly'})
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        # Drop commercials embedded in the article body.
        for advert in div.findAll('div', {'class': 'infoEl center edge'}):
            advert.extract()

        # One paragraph per line, each terminated by a newline.
        self.body = ''.join(p.getText() + '\n' for p in div.findAll('p'))
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class FAZParser(BaseParser):
    """Parser for articles on www.faz.net."""

    domains = ['www.faz.net']

    # Raw string so the escaped dot is passed to ``re`` unchanged.
    feeder_pat = r'aktuell/.*\.html$'
    feeder_pages = ['http://www.faz.net/aktuell/finanzen',
                    'http://www.faz.net/aktuell/gesellschaft',
                    'http://www.faz.net/aktuell/politik',
                    'http://www.faz.net/aktuell/wirtschaft',
                    'http://www.faz.net/aktuell/wissen',
                    'http://www.faz.net/aktuell/feuilleton',
                    ]

    # (tag name, class attribute) pairs removed from the article container.
    _NON_CONTENT = (
        ('span', 'autorBox clearfix'),   # author description
        ('p', 'WeitereBeitraege'),       # more articles like this one
        ('ul', 'WBListe'),               # other articles from this author
    )

    def _parse(self, html):
        """Fill title, byline, date and body from the article HTML.

        Sets ``self.real_article = False`` when the headline meta tag, the
        article container or the inner text container is missing.
        """
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')

        # Article headline
        elt = soup.find('meta', {'property': 'og:title'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt['content']

        # Byline / author
        author = soup.find('meta', {'name': 'author'})
        self.byline = author['content'] if author else ''

        # Article date
        created_at = soup.find('meta', {'name': 'DC.date.issued'})
        self.date = created_at['content'] if created_at else ''

        # Article content
        div = soup.find('div', 'FAZArtikelContent')
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        # Plain loops: the calls are made only for their side effect.
        for tag_name, css_class in self._NON_CONTENT:
            for node in div.findAll(tag_name, {'class': css_class}):
                node.extract()

        # NOTE(review): presumably the text sits in a nested div with an
        # empty class attribute - confirm against current FAZ markup.
        text_div = div.find('div', {'class': ''})
        if text_div is None:
            self.real_article = False
            return
        self.body = '\n' + '\n\n'.join(
            [child.getText() for child in text_div.childGenerator()
             if isinstance(child, Tag) and child.name == 'p'])
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class NTVParser(BaseParser):
    """Parser for articles on www.n-tv.de."""

    domains = ['www.n-tv.de']

    # Raw string so ``\d`` reaches ``re`` unchanged.
    feeder_pat = r'^http://www.n-tv.de/(politik|wirtschaft|panorama|technik|wissen)/.*article\d*'
    feeder_pages = ['http://www.n-tv.de']

    def _parse(self, html):
        """Fill title, byline, date and body from the article HTML.

        Sets ``self.real_article = False`` and returns early for pages
        whose <title> contains 'Mediathek', and for pages without a
        headline or content container.
        """
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')

        # Remove any potential "rogue" video articles that bypass the URL
        # check. A missing <title> is handled explicitly instead of
        # swallowing every exception.
        title_tag = soup.find('title')
        if title_tag is not None and 'Mediathek' in title_tag.getText():
            self.real_article = False
            return

        # Article headline
        elt = soup.find('h1', {'class': 'h1'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt.getText()

        # Byline / author
        author = soup.find('p', {'class': 'author'})
        self.byline = author.getText() if author is not None else ''

        # Article date; tolerate a tag without a content attribute.
        created_at = soup.find('div', {'itemprop': 'datePublished'})
        if created_at is not None:
            self.date = created_at.get('content', '')
        else:
            self.date = ''

        # Article content
        div = soup.find('div', {'class': 'content'})
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)
        # The author paragraph is already captured in the byline.
        for node in div.findAll('p', {'class': 'author'}):
            node.extract()
        self.body = '\n' + '\n\n'.join(
            [child.getText() for child in div.childGenerator()
             if isinstance(child, Tag) and child.name == 'p'])
_parse(self, html): - soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES) - self.meta = soup.findAll('meta') - try: - seo_title = soup.find('meta', attrs={'name':'hdl'}).get('content') - except AttributeError: - self.real_article = False - return - tmp = soup.find('meta', attrs={'name':'hdl_p'}) - if tmp and tmp.get('content'): - self.title = tmp.get('content') - else: - self.title = seo_title - try: - self.date = soup.find('meta', attrs={'name':'dat'}).get('content') - self.byline = soup.find('meta', attrs={'name':'byl'}).get('content') - except AttributeError: - self.real_article = False - return - p_tags = sum([list(soup.findAll('p', attrs={'itemprop':x})) - for x in ['articleBody', 'reviewBody']], []) - div = soup.find('div', attrs={'class': 'story-addendum story-content theme-correction'}) - if div: - p_tags += [div] - footer = soup.find('footer', attrs={'class':'story-footer story-content'}) - if footer: - p_tags += list(footer.findAll(lambda x: x.get('class') != 'story-print-citation' and x.name == 'p')) - - main_body = '\n\n'.join([p.getText() for p in p_tags]) - authorids = soup.find('div', attrs={'class':'authorIdentification'}) - authorid = authorids.getText() if authorids else '' - - top_correction = '\n'.join(x.getText() for x in - soup.findAll('nyt_correction_top')) or '\n' - bottom_correction = '\n'.join(x.getText() for x in - soup.findAll('nyt_correction_bottom')) or '\n' - self.body = '\n'.join([top_correction, - main_body, - authorid, - bottom_correction,]) diff --git a/parsers/politico.py b/parsers/politico.py deleted file mode 100644 index 11d70445..00000000 --- a/parsers/politico.py +++ /dev/null @@ -1,43 +0,0 @@ -from baseparser import BaseParser, grab_url, logger - -# Different versions of BeautifulSoup have different properties. -# Some work with one site, some with another. -# This is BeautifulSoup 3.2. 
-from BeautifulSoup import BeautifulSoup -# This is BeautifulSoup 4 -import bs4 -import re - -class PoliticoParser(BaseParser): - domains = ['www.politico.com'] - - feeder_pat = '^http://www.politico.com/(news/stories|story)/' - feeder_pages = ['http://www.politico.com/'] - - feeder_bs = bs4.BeautifulSoup - - def _parse(self, html): - soup = bs4.BeautifulSoup(html) - print_link = soup.findAll('a', href=re.compile('http://dyn.politico.com/printstory.cfm.*'))[0].get('href') - html2 = grab_url(print_link) - logger.debug('got html 2') - # Now we have to switch back to bs3. Hilarious. - # and the labeled encoding is wrong, so force utf-8. - soup = BeautifulSoup(html2, convertEntities=BeautifulSoup.HTML_ENTITIES, - fromEncoding='utf-8') - - self.meta = soup.findAll('meta') - p_tags = soup.findAll('p')[1:] - real_p_tags = [p for p in p_tags if - not p.findAll(attrs={'class':"twitter-follow-button"})] - - self.title = soup.find('strong').getText() - entity = soup.find('span', attrs={'class':'author'}) - children = list(entity.childGenerator()) - try: - self.byline = 'By ' + children[1].getText() - except IndexError: - self.byline = '' - self.date = children[-1].strip() - - self.body = '\n'+'\n\n'.join([p.getText() for p in real_p_tags]) diff --git a/parsers/spiegel.py b/parsers/spiegel.py new file mode 100644 index 00000000..92e47b99 --- /dev/null +++ b/parsers/spiegel.py @@ -0,0 +1,41 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup + +class SpiegelParser(BaseParser): + SUFFIX = '' + domains = ['www.spiegel.de'] + + feeder_pat = '^http://www.spiegel.de/(politik|wirtschaft|panorama|netzwelt|gesundheit)/[a-z]' + feeder_pages = ['http://www.spiegel.de/schlagzeilen/index.html'] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + #article headline + elt = soup.find('h2', {'class': 'article-title'}) + if elt is None: + self.real_article 
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup


class SternParser(BaseParser):
    """Parser for articles on www.stern.de."""

    SUFFIX = ''
    domains = ['www.stern.de']

    feeder_pat = '^http://www.stern.de/(politik|wirtschaft|panorama|lifestyle|wissen|digital)/'
    feeder_pages = ['http://www.stern.de/news',
                    'http://www.stern.de/news/2',
                    'http://www.stern.de/news/3',
                    'http://www.stern.de/news/4'
                    ]

    def _parse(self, html):
        """Fill title, byline, date and body from the article HTML.

        Sets ``self.real_article = False`` and returns early when the
        headline or the main content container is missing.
        """
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')

        self.meta = soup.findAll('meta')

        # Article headline
        elt = soup.find('h2', {'id': 'div_article_headline'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt.getText()

        # Byline / author: the <span> inside the intro paragraph. The
        # paragraph itself may be absent, so guard before descending.
        intro = soup.find('p', {'id': 'div_article_intro'})
        author = intro.find('span') if intro is not None else None
        self.byline = author.getText() if author is not None else ''

        # Article date
        created_at = soup.find('meta', {'name': 'date'})
        self.date = created_at['content'] if created_at else ''

        # Article content
        div = soup.find('div', {'itemprop': 'mainContentOfPage'})
        if div is None:
            self.real_article = False
            return
        div = self.remove_non_content(div)

        # One paragraph per line, each terminated by a newline.
        self.body = ''.join(p.getText() + '\n' for p in div.findAll('p'))
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup, Tag


class SDParser(BaseParser):
    """Parser for articles on www.sueddeutsche.de."""

    domains = ['www.sueddeutsche.de']

    # Matched with re.search: keeps links whose URL ends in "1.<digits>".
    feeder_pat = r'1\.\d*$'
    feeder_pages = ['http://www.sueddeutsche.de/']

    def _parse(self, html):
        """Fill title, byline, date and body from the article HTML.

        Sets ``self.real_article = False`` and returns early when the
        headline meta tag, the timestamp or the wrapper div is missing.
        """
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')

        # Article headline
        elt = soup.find('meta', {'property': 'og:title'})
        if elt is None:
            self.real_article = False
            return
        self.title = elt['content']

        # Byline / author
        author = soup.find('div', {'class': 'authorProfileContainer'})
        self.byline = author.getText() if author is not None else ''

        # Article date
        created_at = soup.find('time', {'class': 'timeformat'})
        if created_at is None:
            self.real_article = False
            return
        self.date = created_at['datetime']

        # Article content
        div = soup.find('div', {'id': 'wrapper'})
        if div is None:
            self.real_article = False
            return
        intro = soup.find('section', {'class': 'body'})
        div = self.remove_non_content(div)

        # Build the text in a local so the paragraph part can always be
        # appended, even when the page has no intro section.
        body = ''
        if intro is not None:
            body = '\n' + '\n\n'.join(
                [child.getText() for child in intro.childGenerator()
                 if isinstance(child, Tag) and child.name == 'ul'])
        body += '\n' + '\n\n'.join(
            [child.getText() for child in div.childGenerator()
             if isinstance(child, Tag) and child.name == 'p'])
        self.body = body
soup.select('div.article')[0] - # removing comments - for x in self.descendants(article): - if isinstance(x, bs4.Comment): - x.extract() - # removing elements which don't provide content - for selector in ('.inv .teaserImg #seitenanfang .spacer .clearMe '+ - '.boxMoreLinks .metaBlock .weltatlas .fPlayer .zitatBox .flashaudio').split(' '): - for x in article.select(selector): - x.extract() - # put hrefs into text form cause hrefs are important content - for x in article.select('a'): - x.append(" ["+x.get('href','')+"]") - # ensure proper formating for later use of get_text() - for x in article.select('li'): - x.append("\n") - for tag in 'p h1 h2 h3 h4 h5 ul div'.split(' '): - for x in article.select(tag): - x.append("\n\n") - # strip multiple newlines away - import re - article = re.subn('\n\n+', '\n\n', article.get_text())[0] - # important text is now extracted into self.document - self.document = article - - self.title = soup.find('h1').get_text() - - # a by-line is not always there, but when it is, it is em-tag and - # begins with the word 'Von' - byline = soup.find('em') - if byline: - byline = byline.get_text() - if 'Von ' not in byline: byline = None - if not byline: byline = "nicht genannt" - self.byline = byline - - # TODO self.date is unused, isn't it? but i still fill it here - date = soup.select("div.standDatum") - self.date = date and date[0].get_text() or '' - - # XXX a bug in bs4 that tag.descendants isnt working when .extract is called?? 
- # TODO investigate and report - @staticmethod - def descendants(tag): - x = tag.next_element - while x: - next = x.next_element or x.parent and x.parent != tag and x.parent.next_sibling - yield x - x = next - - def __unicode__(self): - return self.document diff --git a/parsers/washpo.py b/parsers/washpo.py deleted file mode 100644 index 06c7bfa8..00000000 --- a/parsers/washpo.py +++ /dev/null @@ -1,44 +0,0 @@ -from baseparser import BaseParser -from bs4 import BeautifulSoup -import re -import datetime - -DATE_FORMAT = '%A, %B %e %Y, %l:%M %p' - -class WashPoParser(BaseParser): - SUFFIX = '?print=true' - domains = ['www.washingtonpost.com'] - - feeder_pat = '^http://www.washingtonpost.com/.*_story.html' - feeder_pages = ['http://www.washingtonpost.com/'] - - def _printableurl(self): - return re.sub('_story.html.*', '_print.html', self.url) - - def _parse(self, html): - soup = BeautifulSoup(html) - - self.meta = soup.findAll('meta') - elt = soup.find('h1', property="dc.title") - if elt is None: - self.real_article = False - return - self.title = elt.getText().strip() - elt = soup.find('h3', property="dc.creator") - if elt is None: - self.byline = '' - else: - self.byline = elt.getText().strip() - - elt = soup.find('span', datetitle="published") - if elt is None: - self.date = '' - else: - date = datetime.datetime.fromtimestamp(float(elt['epochtime'])/1000) - self.date = date.strftime(DATE_FORMAT) - - div = soup.find('div', id='content') - if div is None: - self.real_article = False - return - self.body = '\n'+'\n\n'.join([x.getText().strip() for x in div.findAll('p')]) diff --git a/parsers/welt.py b/parsers/welt.py new file mode 100644 index 00000000..a0145c82 --- /dev/null +++ b/parsers/welt.py @@ -0,0 +1,49 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class WeltParser(BaseParser): + + domains = ['www.welt.de'] + + feeder_pat = 
'^http://www.welt.de/(politik|wirtschaft|regionales|vermischtes|kultur|debatte|finanzen|gesundheit|satire|wissenschaft)/article\d*.*\.html$' + feeder_pages = ['http://www.welt.de/politik/', + 'http://www.welt.de/wirtschaft/', + 'http://www.welt.de/politik/', + 'http://www.welt.de/regionales/', + 'http://www.welt.de/vermischtes/', + 'http://www.welt.de/kultur/', + 'http://www.welt.de/debatte/', + 'http://www.welt.de/finanzen/', + 'http://www.welt.de/gesundheit/', + 'http://www.welt.de/satire/', + 'http://www.welt.de/wissenschaft/' + ] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + self.meta = soup.findAll('meta') + #article headline + elt = soup.find('h1', 'widget storyContent title prefix_1 grid_8') + if elt is None: + self.real_article = False + return + self.title = elt.getText() + # byline / author + authorids = soup.find('span', {'itemprop': 'author'}) + self.byline = authorids.getText() if authorids else '' + # article date + self.date = soup.find('meta', {'name': 'date'})['content'] + #article content + div = soup.find('div', 'storyBody') + div = self.remove_non_content(div) + #Social Media Infobox & Author meta-data + map(lambda x: x.extract(), div.findAll('div', {'class': 'artAuthor'})) + map(lambda x: x.extract(), div.findAll('div', {'class': 'widget socialMedia socMedArtHead grid_6 prefix_1'})) + map(lambda x: x.extract(), div.findAll('div', {'class': 'citation-social-wrapper'})) + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) diff --git a/parsers/zeit.py b/parsers/zeit.py new file mode 100644 index 00000000..d0020452 --- /dev/null +++ b/parsers/zeit.py @@ -0,0 +1,36 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class ZeitParser(BaseParser): + SUFFIX = '?print=true' + domains = ['www.zeit.de'] + + feeder_pat = 
# Maps a crawled domain to the publication name shown in the UI.
# - 'www.zeit.de' appeared twice in the original literal; only the last
#   value ('Die Zeit') ever took effect, so that value is kept.
# - The FAZ parser crawls www.faz.net, so that domain is added; the old
#   'www.faz.de' key is kept for backward compatibility.
# - Entries for the remaining registered parser domains are added.
PublicationDict = {'www.bild.de': 'Bild',
                   'www.focus.de': 'Focus Online',
                   'www.stern.de': 'Der Stern',
                   'www.zeit.de': 'Die Zeit',
                   'www.sueddeutsche.de': 'Sueddeutsche Zeitung',
                   'www.spiegel.de': 'Der Spiegel',
                   'www.faz.net': 'FAZ',
                   'www.faz.de': 'FAZ',
                   'www.rp-online.de': 'RP Online',
                   'www.n-tv.de': 'n-tv',
                   'www.welt.de': 'Die Welt',
                   }
endblock title%} {% block content %} -
Hmm. If you are on this page, something went wrong.
-If you want to tell us what, you can contact us.
+Scheint so, als wäre die Seite nicht gefunden worden.
+Falls du weißt, was schief gelaufen ist, kannst du uns gerne kontaktieren.
{% endblock content%} diff --git a/website/frontend/templates/500.html b/website/frontend/templates/500.html index 71b72381..a184b576 100644 --- a/website/frontend/templates/500.html +++ b/website/frontend/templates/500.html @@ -2,13 +2,12 @@ {% load url from future %} -{% block title %}Error{% endblock title%} +{% block title %}Error 500{% endblock title%} {% block content %} --Hm. If you are on this page, something went wrong. -
If you want to tell us what, you can contact us. +
Wenn du auf dieser Seite bist, scheint etwas schief gelaufen zu sein.
+Falls du mehr darüber weißt, kannst du uns gerne kontaktieren.
{% endblock content%} diff --git a/website/frontend/templates/about.html b/website/frontend/templates/about.html index 4d52d4a4..bf94820a 100644 --- a/website/frontend/templates/about.html +++ b/website/frontend/templates/about.html @@ -5,79 +5,59 @@ {% block active_about %}class="active"{% endblock %} {% block content %} - --
In the age of rapid reporting and digital news, there is rarely a single "final" version of an article. -
NewsDiffs watches different versions of highly-placed articles on online news sites, starting with nytimes.com. +
In einer Zeit, in der es vor Eilmeldungen nur so wimmelt und Nachrichten online abrufbar sind, gibt es kaum noch "endgültige" Versionen von Artikeln. +
NewsDiffs beobachtet verschiedene Versionen von Artikeln der Titelseiten von großen Online-Zeitungen, wie z.B. Zeit Online. -
For better or worse, readers can now view "the making of the sausage" that historically was discreetly -tucked away from view with dead-tree editions. Some of those changes provoke criticism. -
NewsDiffs was born of the Knight Mozilla MIT hackathon on June 17, 2012.
+
So können Leser jetzt das "Making-of" eines Artikels, das früher hinter den Kulissen der Nachrichtenagenturen versteckt war, verfolgen. Einige dieser Änderungen provozieren Kritik. -
Updates to articles on major news web sites happen all the time.
-Often the changes to articles are simply minor, small edits that are tightening up (such as "most" to "many.")
-Sometimes the changes are the insertion or the deletion of a section.
-Sometimes the story changes as a result of a rapidly breaking news, such as the -death of Rodney King. The story grows and deepens over time as more information comes in. To the right is an example -of a story that evolved about the health of former Egyptian president -Hosni Mubarak, first when it was reported that he was "clinically dead" and then later that he had suffered a stroke. -
Another interesting example would have been the killing of Osama Bin Laden on May 1, 2011, which broke at 10:40 p.m. with a sparse report from Helene Cooper: +
Artikel aus Online-Zeitungen großer Nachrichtenportale werden ständig aktualisiert.
+Oft sind diese Änderungen geringfügig, da sie nur dazu dienen, den Stil des Artikels zu verbessern (z.B. wird aus einem "jene" ein "diese").
+Manchmal umfassen die Änderungen das Einfügen oder Löschen eines ganzen Abschnitts.
+Gelegentlich verändert sich der Artikel aufgrund von schnell hintereinander eintreffenden Eilmeldungen, wie z.B. bei Rodney +Kings Tod. Mit der Zeit wächst und vertieft sich der Bericht, sobald mehr Informationen bekannt werden. Ein weiteres Beispiel hierfür ist die Berichterstattung über den Gesundheitszustand +des ehemaligen ägyptischen Präsidenten Husni Mubarak. Zunächst wurde berichtet, er sei "klinisch tot", später, dass er einen Schlaganfall erlitten habe. +
Noch ein interessantes Beispiel ist die Tötung von Osama Bin Laden, über die am 01. Mai 2011 um 22:40 Uhr EDT die Berichterstattung mit einem dürftigen Bericht von Helene Cooper begann:
WASHINGTON — Osama bin Laden has been killed, a United States official said. President Obama is expected to make an announcement on Sunday night, almost 10 years after the Sept. 11 attacks on the World Trade Center and the Pentagon.-
Also interesting are the language changes that reflect subtle differences in connotation. For example, whether an election was "democratic" vs. "competitive." -
In some cases, we can see how a story can substantially change as more reporting comes in, such as in a story that -helped inspired this project: the article about the arrests of Occupy Wall Street protestors on October 1, 2011. Two versions, twenty -minutes apart, had substantially different first -paragraphs about the arrests of Occupy Wall Street protestors on the Brooklyn Bridge in October 2011. -The criticism it received was perhaps unfair, but it's hard to determine since the earlier version is -no longer publicly available.
+Außerdem interessant sind Änderungen der Sprache, die subtile Unterschiede der Konnotation nach sich ziehen. Eine Wahl z.B. kann als "democratic" oder "competitive" bezeichnet werden. +
In einigen Fällen lässt sich beobachten, wie eine Darstellung sich wesentlich ändert, je weiter die Berichterstattung voranschreitet. So war es bei dem Bericht, der die Entstehung von NewsDiffs.org ursprünglich inspirierte: der Artikel über die Festnahmen von Occupy Wall Street Demonstranten am 01. Oktober 2011. Zwei Versionen, zwischen denen nur 20 +Minuten verstrichen, hatten grundsätzlich verschiedene +Einleitungsparagraphen. +Die Kritik, die dieser Bericht erfuhr, war eventuell ungerecht, aber das ist schwer zu ermitteln, da die frühere Version nicht mehr öffentlich verfügbar ist.
-
Das diff ist ein in der Computerprogrammierung viel genutztes Tool, das die Unterschiede (engl.: differences) zwischen zwei Dateien aufzeigt. Es wird normalerweise dazu genutzt, die Unterschiede zwischen der jetzigen und einer früheren Version einer Datei sichtbar zu machen. Diese Idee der Versionskontrolle ist in der Softwareentwicklung altbekannt und sollte auch im Journalismus benutzt werden, da der Journalismus sich auf sich ständig weiterentwickelnde Versionen von Nachrichtenartikeln hinbewegt.
-
NewsDiffs regularly looks at the stories that are linked to (or have been linked to) -from the homepage of major online news publications, starting with nytimes.com and cnn.com. It parses them and stores them in a git repository. -
The records start June 17, 2012. -
Not all articles are stored. Only those with changes are displayed. NewsDiffs focuses mostly on ones that -are linked from the homepage. +
NewsDiffs überprüft regelmäßig die Artikel, die auf den Titelseiten von Onlineausgaben einiger großer deutscher Nachrichtenagenturen verlinkt sind (oder einmal verlinkt waren), u.a. +Zeit Online und Die Welt. Die Artikel werden von NewsDiffs geparst und in einem git Repository gespeichert. +
Diese Speicherung findet seit 2015 statt. +
Nicht alle Artikel der Online-Zeitungen werden gespeichert, NewsDiffs konzentriert sich auf die Artikel, die von der Titelseite verlinkt sind. Nur Artikel, an denen Änderungen vorgenommen wurden, werden angezeigt.
-
The NewsDiffs source code is available on Github. -
The front end used to view the differences is from the open-source Diff Match Patch library. -
The website is built on Django. -
The prettiness is courtesy of Twitter Bootstrap, which -has -been saving developers from themselves the world -over. - -
NewsDiffs is the product of a weekend of work from the Knight Mozilla MIT -hackathon, -by Eric Price, Jennifer 8. Lee and Greg Price. - -
Greg, who works at Tddium, has his masters in theoretical computer science from MIT and a bachelors in -mathematics from Harvard. (He also led the YouTomb project, which tracked videos removed from -YouTube). Eric is currently in his fourth year of a PhD in theoretical computer science from MIT. Jenny was a reporter at The New York Times for nine years, wonders what it's like to be a product manager and has been -tortured by missing semicolons.
+Der Quelltext zu NewsDiffs ist frei verfügbar auf GitHub. +
Das Frontend, das zum Einsehen der Unterschiede verwendet wird, ist aus der Open-Source Diff Match Patch Library. +
Die Webseite wurde mit Django gebaut. +
Das schöne Layout haben wir Twitter Bootstrap zu verdanken. + +
NewsDiffs.de ist im Sommersemester 2015 als Projekt einiger IMI-Studenten der HTW Berlin auf der Grundlage von NewsDiffs.org entstanden. +
Ziel war das Anpassen der Webseite an den deutschen Nachrichtenraum und der Ausbau durch neue Features. Dafür wurde zunächst NewsDiffs.org geklont, welches während des Knight Mozilla MIT +Hackathons 2012 +von Eric Price, Jennifer 8. Lee und Greg Price geschaffen wurde.
{% endblock %} diff --git a/website/frontend/templates/article_history.html b/website/frontend/templates/article_history.html index c8c5ec64..61a48a0d 100644 --- a/website/frontend/templates/article_history.html +++ b/website/frontend/templates/article_history.html @@ -1,19 +1,19 @@ {% extends 'template.html' %} -{% block title %}Article View{% endblock %} +{% block title %}Artikelansicht{% endblock %} {% block content %} {% include "find_by_uri.html" %} -
Headline | -Date/Time EST Archived | +Schlagzeile | +Datum/Zeit der Archivierung | Diff | {{version.title}} | {{version.date}} | {% if difflink %} -(Compare with previous) | +(Vergleich mit vorheriger Version) | {% else %}{% endif %} diff --git a/website/frontend/templates/article_history.xml b/website/frontend/templates/article_history.xml index 13906129..9940454b 100644 --- a/website/frontend/templates/article_history.xml +++ b/website/frontend/templates/article_history.xml @@ -1,6 +1,6 @@ |
Article | Version | Diff |
---|---|---|
Artikel | Version | Diff |
{{last_version.title}} ({{article.publication}}) {{last_version.byline}} |
{% for difflink, version in versions %}
{{version.date}} | {% if difflink %} -(Compare) |
(Vergleichen) | ||
Article | Version | Diff |
---|---|---|
Once Ubiquitous on Campus, ‘Paterno’ Is No Longer Uttered (NYT) By BILL PENNINGTON |
-Nov. 25, 2012, 12:15 a.m. | -(Compare) |
Nov. 24, 2012, 5:43 p.m. | -||
Sex attack on schoolgirl, 11, in Enfield park (BBC) |
-Nov. 25, 2012, 12:13 a.m. | -(Compare) |
Nov. 24, 2012, 5:42 p.m. | -||
Top Egyptian judicial body rips Morsy (CNN) Mohamed Fadel Fahmy and Reza Sayah, CNN |
-Nov. 25, 2012, 12:12 a.m. | -(Compare) |
Nov. 24, 2012, 6:44 p.m. | -(Compare) | |
Nov. 24, 2012, 5:41 p.m. | -||
Missouri Learns From Costly Turnovers in Loss to Slip Past V.C.U. (NYT) By THE ASSOCIATED PRESS |
-Nov. 25, 2012, 12:09 a.m. | -(Compare) |
Nov. 24, 2012, 10:17 p.m. | -||
Student Killed in Melee at Afghan University (NYT) By AZAM AHMED |
-Nov. 25, 2012, 12:09 a.m. | -(Compare) |
Nov. 24, 2012, 10:16 p.m. | -||
Turnovers Cost Florida State Against Gators (NYT) By THE ASSOCIATED PRESS |
-Nov. 25, 2012, 12:09 a.m. | -(Compare) |
Nov. 24, 2012, 10:16 p.m. | -||
Sunday talk show tip sheet (Politico) By Katie Glueck |
-Nov. 24, 2012, 10:22 p.m. | -(Compare) |
Nov. 24, 2012, 5:45 p.m. | -||
Hamas Claim Of Progress Complicates Talk of Truce (NYT) By JODI RUDOREN |
-Nov. 24, 2012, 10:22 p.m. | -(Compare) |
Nov. 24, 2012, 5:44 p.m. | -||
A Fragile Cease-Fire Achieved by Leaving Thorny Issues Unresolved (NYT) By MICHAEL R. GORDON |
-Nov. 24, 2012, 10:21 p.m. | -(Compare) |
Nov. 24, 2012, 5:43 p.m. | -||
Neuroscience: Under Attack (NYT) By ALISSA QUART |
-Nov. 24, 2012, 10:20 p.m. | -(Compare) |
Nov. 24, 2012, 5:43 p.m. | -||
Hector Camacho, 50, Boxer Known for His Quick Hands (NYT) By BRUCE WEBER |
-Nov. 24, 2012, 10:19 p.m. | -(Compare) |
Nov. 24, 2012, 5:41 p.m. | -||
Bill Press is keeping it ‘Current’ (Politico) By Mackenzie Weinger |
-Nov. 24, 2012, 10:19 p.m. | -(Compare) |
Nov. 24, 2012, 5:41 p.m. | -||
Woman killed by tree amid more storms (BBC) |
-Nov. 24, 2012, 10:19 p.m. | -(Compare) |
Nov. 24, 2012, 6:44 p.m. | -(Compare) | |
Nov. 24, 2012, 5:41 p.m. | -||
Away-day blues continue for Real Madrid (CNN) |
-Nov. 24, 2012, 10:19 p.m. | -(Compare) |
Nov. 24, 2012, 6:43 p.m. | -||
Egyptian Judges Challenge Morsi Over New Power (NYT) By DAVID D. KIRKPATRICK |
-Nov. 24, 2012, 10:18 p.m. | -(Compare) |
Nov. 24, 2012, 5:41 p.m. | -||
PATH Service to Resume Into Lower Manhattan (NYT) By MARC SANTORA |
-Nov. 24, 2012, 10:18 p.m. | -(Compare) |
Nov. 24, 2012, 5:40 p.m. | -||
Beating Their Rival, Buckeyes Finish Unbeaten, Untied and Unfulfilled (NYT) By TODD JONES |
-Nov. 24, 2012, 10:17 p.m. | -(Compare) |
Nov. 24, 2012, 6:44 p.m. | -(Compare) | |
Nov. 24, 2012, 5:40 p.m. | -||
Larry Hagman, ‘Dallas’ Villain With Sinister Smile, Dies at 81 (NYT) By ENID NEMY |
-Nov. 24, 2012, 6:46 p.m. | -(Compare) |
Nov. 24, 2012, 5:44 p.m. | -||
Odd Spin on Quarterback Controversy (NYT) By JOHN BRANCH |
-Nov. 24, 2012, 6:46 p.m. | -(Compare) |
Nov. 24, 2012, 5:44 p.m. | -||
Larry Hagman, the man behind iconic villain J.R. Ewing, dies (CNN) Chelsea J. Carter and Greg Botelho, CNN |
-Nov. 24, 2012, 6:46 p.m. | -(Compare) |
Nov. 24, 2012, 5:43 p.m. | -||
Goal to Go (NYT) By CHARLES SIEBERT |
-Nov. 24, 2012, 6:45 p.m. | -(Compare) |
Nov. 24, 2012, 5:41 p.m. | -||
NJ governor puts Sandy damage at $29.4 billion (CNN) Julia Talanova and Rande Iaboni, CNN |
-Nov. 24, 2012, 6:44 p.m. | -(Compare) |
Nov. 24, 2012, 5:41 p.m. | -||
On This, 2 Sides Agree: Fighting Hardened Positions (NYT) By JODI RUDOREN and ISABEL KERSHNER |
-Nov. 24, 2012, 6:44 p.m. | -(Compare) |
Nov. 24, 2012, 5:41 p.m. | -
Bald wird hier ein Kontaktformular stehen.
{% endblock %} diff --git a/website/frontend/templates/diffview.html b/website/frontend/templates/diffview.html index bd9e8ced..ef647b1f 100644 --- a/website/frontend/templates/diffview.html +++ b/website/frontend/templates/diffview.html @@ -47,44 +47,36 @@ {% endif %} -{% if prev %} - <= Previous revision + <= Vorherige Version {% else %} - No previous revision + Keine vorherige Version {% endif %} | - All changes + Alle Versionen | {% if next %} - Later revision => + Spätere Version => {% else %} - No later revision + Keine spätere Version {% endif %}
-These are some highlights of changes that have been -curated by hand. Do you see a good -one? Let us know.
+Hier haben wir ein paar besonders interessante Änderungen +zusammengestellt. Hast du eine interessante beim Stöbern gefunden, die du uns +mitteilen magst?