From 573fc22fc8d96763241bb149f6290ce819bd0c57 Mon Sep 17 00:00:00 2001 From: Anja Kammer Date: Wed, 15 Apr 2015 16:54:30 +0200 Subject: [PATCH 01/64] added gitignore and modified manage.py --- .gitignore | 1 + website/manage.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 63d5c677..f53ec90f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/.idea /articles *~ diff --git a/website/manage.py b/website/manage.py index 6fd85def..cceb7f2d 100755 --- a/website/manage.py +++ b/website/manage.py @@ -11,7 +11,7 @@ if __name__ == "__main__": os.environ.setdefault("DJANGO_SETTINGS_MODULE", "website.settings") - sys.path.append(os.path.dirname(os.getcwd())) + sys.path.append((os.getcwd())) from django.core.management import execute_from_command_line execute_from_command_line(sys.argv) From a02e0c1317ebb0ff891270343e2e391fc10c0325 Mon Sep 17 00:00:00 2001 From: Robert Piwonski Date: Sun, 26 Apr 2015 20:27:55 +0200 Subject: [PATCH 02/64] First Version of the Zeit Online Parser --- parsers/zeit.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 parsers/zeit.py diff --git a/parsers/zeit.py b/parsers/zeit.py new file mode 100644 index 00000000..1010b7b4 --- /dev/null +++ b/parsers/zeit.py @@ -0,0 +1,34 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class ZeitParser(BaseParser): + SUFFIX = '?print=true' + domains = ['www.zeit.de'] + + feeder_pat = '^http://www.zeit.de/news/\d' + feeder_pages = ['http://www.zeit.de/news/index/'] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('span', 'title') + elTopic = soup.find('span', 'supertitle') + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find('span', 'articlemeta-datetime').getText() + + div 
= soup.find('div', 'article-body') + if div is None: + # Hack for video articles + div = soup.find('div', 'emp-decription') + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) From 7e19cbaf4c89ffae289811d14cbaa0dc909b66bd Mon Sep 17 00:00:00 2001 From: Robert Piwonski Date: Sun, 26 Apr 2015 20:57:15 +0200 Subject: [PATCH 03/64] enhanced Baseparser --- parsers/baseparser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parsers/baseparser.py b/parsers/baseparser.py index ef2a9eb5..e746a7be 100644 --- a/parsers/baseparser.py +++ b/parsers/baseparser.py @@ -108,6 +108,7 @@ class BaseParser(object): # Used when finding articles to parse feeder_pat = None # Look for links matching this regular expression feeder_pages = [] # on these pages + feeder_div = None feeder_bs = BeautifulSoup #use this version of beautifulsoup for feed @@ -143,6 +144,8 @@ def feed_urls(cls): for feeder_url in cls.feeder_pages: html = grab_url(feeder_url) soup = cls.feeder_bs(html) + if cls.feeder_div is not None: + soup = soup.find('div', cls.feeder_div) # "or ''" to make None into str urls = [a.get('href') or '' for a in soup.findAll('a')] From d0e77a238d18b436731a121c8382d826d13ff87f Mon Sep 17 00:00:00 2001 From: Anja Kammer Date: Sun, 26 Apr 2015 23:17:10 +0200 Subject: [PATCH 04/64] modified frontend for new scrapers --- parsers/__init__.py | 13 ++++++++----- parsers/bild.py | 32 ++++++++++++++++++++++++++++++++ parsers/focus.py | 31 +++++++++++++++++++++++++++++++ parsers/zeit.py | 2 +- website/frontend/models.py | 3 +++ website/frontend/views.py | 2 +- 6 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 parsers/bild.py create mode 100644 parsers/focus.py diff --git a/parsers/__init__.py b/parsers/__init__.py index a6870dcf..7e589ac3 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -4,15 +4,18 @@ # - create a parser class in another 
file, based off (say) bbc.BBCParser # - add it to parsers (below) # Test with test_parser.py +#nyt.NYTParser +#cnn.CNNParser +#politico.PoliticoParser +#bbc.BBCParser +#washpo.WashPoParser # List of parsers to import and use based on parser.domains parsers = """ -nyt.NYTParser -cnn.CNNParser -politico.PoliticoParser -bbc.BBCParser -washpo.WashPoParser +zeit.ZeitParser +bild.BildParser +focus.FocusParser """.split() parser_dict = {} diff --git a/parsers/bild.py b/parsers/bild.py new file mode 100644 index 00000000..85a3429d --- /dev/null +++ b/parsers/bild.py @@ -0,0 +1,32 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class BildParser(BaseParser): + SUFFIX = '' + domains = ['www.bild.de'] + + feeder_pat = '^http://www.bild.de/(politik|regional|geld)' + feeder_pages = ['http://www.bild.de/'] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('span', 'headline') + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find(attrs = {'time' : 'datetime'}) + self.authorids = soup.find('div', attrs={'itemprop':'author'}) + self.authorid = self.authorids.getText() if self.authorids else '' + + div = soup.find('div', 'articleBody') + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) diff --git a/parsers/focus.py b/parsers/focus.py new file mode 100644 index 00000000..96d3d892 --- /dev/null +++ b/parsers/focus.py @@ -0,0 +1,31 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class FocusParser(BaseParser): + SUFFIX = '' + domains = ['www.focus.de'] + + feeder_pat = '^http://www.focus.de/(politik|finanzen|panorama|gesundheit|wissen)' + feeder_pages = ['http://www.focus.de/'] + + def 
_parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('h1', 'articleIDentH1') + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find('span', 'created').getText() + + + div = soup.find('div', 'article-body') + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) diff --git a/parsers/zeit.py b/parsers/zeit.py index 1010b7b4..dd8ca23c 100644 --- a/parsers/zeit.py +++ b/parsers/zeit.py @@ -15,7 +15,7 @@ def _parse(self, html): self.meta = soup.findAll('meta') elt = soup.find('span', 'title') - elTopic = soup.find('span', 'supertitle') + elTopic = soup.find('span', 'supertitle') if elt is None: self.real_article = False return diff --git a/website/frontend/models.py b/website/frontend/models.py index 3aa8a7ba..cdd0b548 100644 --- a/website/frontend/models.py +++ b/website/frontend/models.py @@ -22,6 +22,9 @@ def strip_prefix(string, prefix): 'www.bbc.co.uk': 'BBC', 'www.politico.com': 'Politico', 'www.washingtonpost.com': 'Washington Post', + 'www.zeit.de': 'Zeit Online', + 'www.bild.de': 'Bild', + 'www.focus.de': 'Focus Online', } ancient = datetime(1901, 1, 1) diff --git a/website/frontend/views.py b/website/frontend/views.py index e439492e..537f8ae3 100644 --- a/website/frontend/views.py +++ b/website/frontend/views.py @@ -105,7 +105,7 @@ def get_articles(source=None, distance=0): SOURCES = '''nytimes.com cnn.com politico.com washingtonpost.com -bbc.co.uk'''.split() +bbc.co.uk zeit.de bild.de focus.de'''.split() def is_valid_domain(domain): """Cheap method to tell whether a domain is being tracked.""" From aeec6f6337db6c08d429a0c5b052a11824719ef7 Mon Sep 17 00:00:00 2001 From: Anja Kammer Date: Mon, 27 Apr 2015 00:20:49 +0200 Subject: [PATCH 05/64] parsing failes 
if change occures --- parsers/__init__.py | 1 + parsers/spiegel.py | 30 ++++++++++++++++++++++++++++++ parsers/stern.py | 30 ++++++++++++++++++++++++++++++ parsers/welt.py | 32 ++++++++++++++++++++++++++++++++ website/frontend/models.py | 1 + website/frontend/views.py | 2 +- 6 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 parsers/spiegel.py create mode 100644 parsers/stern.py create mode 100644 parsers/welt.py diff --git a/parsers/__init__.py b/parsers/__init__.py index 7e589ac3..e19cc49b 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -16,6 +16,7 @@ zeit.ZeitParser bild.BildParser focus.FocusParser +spiegel.SpiegelParser """.split() parser_dict = {} diff --git a/parsers/spiegel.py b/parsers/spiegel.py new file mode 100644 index 00000000..fca50f2a --- /dev/null +++ b/parsers/spiegel.py @@ -0,0 +1,30 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class SpiegelParser(BaseParser): + SUFFIX = '' + domains = ['www.spiegel.de'] + + feeder_pat = '^http://www.spiegel.de/(politik|wirtschaft|panorama|netzwelt|gesundheit)/' + feeder_pages = ['http://www.spiegel.de/schlagzeilen/index.html'] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('h2', attrs={'class':'article-title'}) + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find(attrs = {'time' : 'datetime'}) + + div = soup.find('div', 'article-section') + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) diff --git a/parsers/stern.py b/parsers/stern.py new file mode 100644 index 00000000..ad2b172b --- /dev/null +++ b/parsers/stern.py @@ -0,0 +1,30 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag 
+ + +class SternParser(BaseParser): + SUFFIX = '' + domains = ['www.stern.de'] + + feeder_pat = '^http://www.stern.de/(politik|wirtschaft|panorama|lifestyle|wissen|digital)/' + feeder_pages = ['http://www.stern.de/news'] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('h2', attrs={'id':'div_article_headline'}) + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find('div', attrs = {'class' : 'datePublished'}) + + div = soup.find('span', attrs={'itemprop':'articleBody'}) + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) diff --git a/parsers/welt.py b/parsers/welt.py new file mode 100644 index 00000000..a9ac0505 --- /dev/null +++ b/parsers/welt.py @@ -0,0 +1,32 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class WeltParser(BaseParser): + SUFFIX = '?config=print' + domains = ['www.welt.de'] + + feeder_pat = '^http://www.welt.de/(politik|wirtschaft|panorama|geld|wissen|regional)/' + feeder_pages = ['http://www.welt.de/'] + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('h1') + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find('div', attrs = {'id' : 'currenttime'}) + self.authorids = soup.find('span', attrs={'itemprop':'author'}) + self.authorid = self.authorids.getText() if self.authorids else '' + + div = soup.find('div', 'storyBody') + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 
'p']) diff --git a/website/frontend/models.py b/website/frontend/models.py index cdd0b548..0d9cc2f0 100644 --- a/website/frontend/models.py +++ b/website/frontend/models.py @@ -25,6 +25,7 @@ def strip_prefix(string, prefix): 'www.zeit.de': 'Zeit Online', 'www.bild.de': 'Bild', 'www.focus.de': 'Focus Online', + 'www.welt.de': 'Die Welt', } ancient = datetime(1901, 1, 1) diff --git a/website/frontend/views.py b/website/frontend/views.py index 537f8ae3..771f81ab 100644 --- a/website/frontend/views.py +++ b/website/frontend/views.py @@ -105,7 +105,7 @@ def get_articles(source=None, distance=0): SOURCES = '''nytimes.com cnn.com politico.com washingtonpost.com -bbc.co.uk zeit.de bild.de focus.de'''.split() +bbc.co.uk zeit.de bild.de focus.de spiegel.de welt.de stern.de'''.split() def is_valid_domain(domain): """Cheap method to tell whether a domain is being tracked.""" From 49ee7b64d0ff9adada82601244d10967a39849d6 Mon Sep 17 00:00:00 2001 From: Anja Kammer Date: Mon, 27 Apr 2015 10:43:21 +0200 Subject: [PATCH 06/64] displayes just german platforms --- parsers/__init__.py | 6 +----- website/frontend/models.py | 8 ++------ website/frontend/views.py | 3 +-- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/parsers/__init__.py b/parsers/__init__.py index e19cc49b..f862a0cc 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -4,11 +4,6 @@ # - create a parser class in another file, based off (say) bbc.BBCParser # - add it to parsers (below) # Test with test_parser.py -#nyt.NYTParser -#cnn.CNNParser -#politico.PoliticoParser -#bbc.BBCParser -#washpo.WashPoParser # List of parsers to import and use based on parser.domains @@ -17,6 +12,7 @@ bild.BildParser focus.FocusParser spiegel.SpiegelParser +stern.SternParser """.split() parser_dict = {} diff --git a/website/frontend/models.py b/website/frontend/models.py index 0d9cc2f0..e1e5637e 100644 --- a/website/frontend/models.py +++ b/website/frontend/models.py @@ -17,15 +17,11 @@ def strip_prefix(string, 
prefix): string = string[len(prefix):] return string -PublicationDict = {'www.nytimes.com': 'NYT', - 'edition.cnn.com': 'CNN', - 'www.bbc.co.uk': 'BBC', - 'www.politico.com': 'Politico', - 'www.washingtonpost.com': 'Washington Post', - 'www.zeit.de': 'Zeit Online', +PublicationDict = {'www.zeit.de': 'Zeit Online', 'www.bild.de': 'Bild', 'www.focus.de': 'Focus Online', 'www.welt.de': 'Die Welt', + 'www.stern.de': 'Stern', } ancient = datetime(1901, 1, 1) diff --git a/website/frontend/views.py b/website/frontend/views.py index 771f81ab..fd4b6c0f 100644 --- a/website/frontend/views.py +++ b/website/frontend/views.py @@ -104,8 +104,7 @@ def get_articles(source=None, distance=0): return articles -SOURCES = '''nytimes.com cnn.com politico.com washingtonpost.com -bbc.co.uk zeit.de bild.de focus.de spiegel.de welt.de stern.de'''.split() +SOURCES = '''zeit.de bild.de focus.de spiegel.de welt.de stern.de'''.split() def is_valid_domain(domain): """Cheap method to tell whether a domain is being tracked.""" From 674ef0a8a352a01d873c5f960ad5954324d145a4 Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 16:44:03 +0200 Subject: [PATCH 07/64] translated article_history_missing.html --- website/frontend/templates/article_history_missing.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/frontend/templates/article_history_missing.html b/website/frontend/templates/article_history_missing.html index 16f4216e..8f3cdf1e 100644 --- a/website/frontend/templates/article_history_missing.html +++ b/website/frontend/templates/article_history_missing.html @@ -1,13 +1,13 @@ {% extends 'template.html' %} -{% block title %}Article View{% endblock %} +{% block title %}Artikelansicht{% endblock %} {% block content %} {% include "find_by_uri.html" %} -

Article Change Log

+

Änderungsübersicht

{{url}}

-

Alas! We don't seem to know anything about this article. Sorry! :(

+

Huch! Scheint so, als wüssten wir nichts über diesen Artikel. Sorry!

{% endblock %} From b1d1b7c5529e24e8fac34940ae9807fd9299c1cb Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 16:53:39 +0200 Subject: [PATCH 08/64] Update 404.html --- website/frontend/templates/404.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/frontend/templates/404.html b/website/frontend/templates/404.html index 68336dea..98fd677d 100644 --- a/website/frontend/templates/404.html +++ b/website/frontend/templates/404.html @@ -2,12 +2,12 @@ {% load url from future %} -{% block title %}Error{% endblock title%} +{% block title %}Fehler 404{% endblock title%} {% block content %} -

Oops, this doesn't look right

-

Hmm. If you are on this page, something went wrong.

-

If you want to tell us what, you can contact us.

+

Nanu, das sieht nicht richtig aus.

+

Scheint so, als wäre die Seite nicht gefunden worden.

+

Falls du weißt, was schief gelaufen ist, kannst du uns gerne kontaktieren.

{% endblock content%} From 5f49eb000f9669e78f83c224921e69ba1f519dd4 Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 16:57:47 +0200 Subject: [PATCH 09/64] translated 500.html --- website/frontend/templates/500.html | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/website/frontend/templates/500.html b/website/frontend/templates/500.html index 71b72381..a184b576 100644 --- a/website/frontend/templates/500.html +++ b/website/frontend/templates/500.html @@ -2,13 +2,12 @@ {% load url from future %} -{% block title %}Error{% endblock title%} +{% block title %}Error 500{% endblock title%} {% block content %} -

Oops, this doesn't look right

-

-Hm. If you are on this page, something went wrong. -

If you want to tell us what, you can contact us. +

Nanu, das sieht nicht richtig aus.

+

Wenn du auf dieser Seite bist, scheint etwas schief gelaufen zu sein.

+

Falls du mehr darüber weißt, kannst du uns gerne kontaktieren.

{% endblock content%} From 3919048097f9e8b371ba0c4a654d6aee169c70af Mon Sep 17 00:00:00 2001 From: Robert Piwonski Date: Tue, 28 Apr 2015 17:52:12 +0200 Subject: [PATCH 10/64] added Welt.de-Parser(seems to be working) --- parsers/welt.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 parsers/welt.py diff --git a/parsers/welt.py b/parsers/welt.py new file mode 100644 index 00000000..9e3878cf --- /dev/null +++ b/parsers/welt.py @@ -0,0 +1,33 @@ +from baseparser import BaseParser +from BeautifulSoup import BeautifulSoup, Tag + + +class WeltParser(BaseParser): + domains = ['www.welt.de'] + + feeder_pat = 'article\d*' + feeder_pages = ['http://www.welt.de/'] + #feeder_div = 'groupWrapper' + + def _parse(self, html): + soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, + fromEncoding='utf-8') + + self.meta = soup.findAll('meta') + elt = soup.find('h1', 'widget storyContent title prefix_1 grid_8') + if elt is None: + self.real_article = False + return + self.title = elt.getText() + self.byline = '' + self.date = soup.find('meta', {'name':'last-modified'})['content'] + + div = soup.find('div', 'storyBody') + if div is None: + # Hack for video articles + div = soup.find('div', 'emp-decription') + if div is None: + self.real_article = False + return + self.body = '\n'+'\n\n'.join([x.getText() for x in div.childGenerator() + if isinstance(x, Tag) and x.name == 'p']) From 6082ecd92f47954abfcd675de78446be101e8a8d Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 18:40:01 +0200 Subject: [PATCH 11/64] translated about.html --- website/frontend/templates/about.html | 95 +++++++++++---------------- 1 file changed, 39 insertions(+), 56 deletions(-) diff --git a/website/frontend/templates/about.html b/website/frontend/templates/about.html index 4d52d4a4..29313d3f 100644 --- a/website/frontend/templates/about.html +++ b/website/frontend/templates/about.html @@ -9,75 +9,58 @@ 
href="http://www.newsdiffs.org/diff/192021/192137/www.nytimes.com/2013/03/31/science/space/yvonne-brill-rocket-scientist-dies-at-88.html"> -

About NewsDiffs

+

Über NewsDiffs

-

Why NewsDiffs Exists
+
Warum NewsDiffs existiert
-

In the age of rapid reporting and digital news, there is rarely a single "final" version of an article. -

NewsDiffs watches different versions of highly-placed articles on online news sites, starting with nytimes.com. +

In einer Zeit, in der es vor Eilmeldungen nur so wimmelt und Nachrichten online abrufbar sind, gibt es kaum noch "endgültige" Versionen von Artikeln. +

NewsDiffs beobachtet verschiedene Versionen von Artikeln der Titelseiten von großen Online-Zeitungen, wie z.B. Zeit Online. -

For better or worse, readers can now view "the making of the sausage" that historically was discreetly -tucked away from view with dead-tree editions. Some of those changes provoke criticism. -

NewsDiffs was born of the Knight Mozilla MIT hackathon on June 17, 2012.

+

So können Leser jetzt das "Making-of" eines Artikels, das früher hinter den Kulissen der Nachrichtenagenturen versteckt war, verfolgen. Einige dieser Änderungen provozieren Kritik. -

What Types of Changes?
-

Updates to articles on major news web sites happen all the time.

-

Often the changes to articles are simply minor, small edits that are tightening up (such as "most" to "many.")

-

Sometimes the changes are the insertion or the deletion of a section.

-

Sometimes the story changes as a result of a rapidly breaking news, such as the -death of Rodney King. The story grows and deepens over time as more information comes in. To the right is an example -of a story that evolved about the health of former Egyptian president -Hosni Mubarak, first when it was reported that he was "clinically dead" and then later that he had suffered a stroke. -

Another interesting example would have been the killing of Osama Bin Laden on May 1, 2011, which broke at 10:40 p.m. with a sparse report from Helene Cooper: +

Was für Arten von Änderungen?
+

Artikel aus Online-Zeitungen großer Nachrichtenportale werden ständig aktualisiert.

+

Oft sind diese Änderungen geringfügig, da sie nur dazu dienen, den Stil des Artikels zu verbessern (z.B. wird aus einem "jene" ein "diese").

+

Manchmal umfassen die Änderungen das Einfügen oder Löschen eines ganzen Abschnitts.

+

Gelegentlich verändert sich der Artikel aufgrund von schnell hintereinander eintreffenden Eilmeldungen, wie z.B. bei Rodney +Kings Tod. Mit der Zeit wächst und vertieft sich der Bericht, sobald mehr Informationen bekannt werden. Ein weiteres Beispiel hierfür ist die Berichterstattung über den Gesundheitszustand +des ehemaligen ägyptischen Präsidenten Husni Mubarak. Zunächst wurde berichtet, er wäre "klinisch tot", später, dass er einen Schlaganfall erlitten habe. +

Noch ein interessantes Beispiel ist die Tötung von Osama Bin Laden, über die am 01. Mai 2011 um 22:40 Uhr EDT die Berichterstattung mit einem dürftigen Bericht von Helene Cooper begann:

WASHINGTON — Osama bin Laden has been killed, a United States official said. President Obama is expected to make an announcement on Sunday night, almost 10 years after the Sept. 11 attacks on the World Trade Center and the Pentagon.
-

Also interesting are the language changes that reflect subtle differences in connotation. For example, whether an election was "democratic" vs. "competitive." -

In some cases, we can see how a story can substantially change as more reporting comes in, such as in a story that -helped inspired this project: the article about the arrests of Occupy Wall Street protestors on October 1, 2011. Two versions, twenty -minutes apart, had substantially different first -paragraphs about the arrests of Occupy Wall Street protestors on the Brooklyn Bridge in October 2011. -The criticism it received was perhaps unfair, but it's hard to determine since the earlier version is -no longer publicly available.

+

Außerdem interessant sind Änderungen der Sprache, die subtile Unterschiede der Konnotation nach sich ziehen. Eine Wahl z.B. kann als "democratic" oder "competitive" bezeichnet werden. +

In einigen Fällen lässt sich beobachten, wie eine Darstellung sich wesentlich ändert, je weiter die Berichterstattung voranschreitet. So war es bei dem Bericht, der die Entstehung von NewsDiffs.org ursprünglich inspirierte: der Artikel über die Festnahmen von Occupy Wall Street Demonstranten am 01. Oktober 2011. Zwei Versionen, zwischen denen nur 20 +Minuten verstrichen, hatten grundsätzlich verschiedene +Einleitungsparagraphen. +Die Kritik, die dieser Bericht erfuhr, war eventuell ungerecht, aber das ist schwer zu ermitteln, da die frühere Version nicht mehr öffentlich verfügbar ist.

-

Why the name NewsDiffs?
+
Warum der Name "NewsDiffs"?
- A diff is a popular tool in computer programming that outputs the differences between two files. It is typically used to show the changes between one version of a file and a former version of the same file. This idea of version control is well known within software engineering, and should be used in journalism as journalism moves toward constantly evolving versions of news stories. - We have had many sessions at many newsy foo and bar camps on "Github for news." Well, this time, we literally put the news into git. +

Das diff ist ein in der Computerprogrammierung viel genutztes Tool, das die Unterschiede (engl.: differences) zwischen zwei Dateien aufzeigt. Es wird normalerweise dazu genutzt, die Unterschiede zwischen der jetzigen und einer früheren Version einer Datei sichtbar zu machen. Diese Idee der Versionskontrolle ist in der Softwareentwicklung altbekannt und sollte auch im Journalismus benutzt werden, da der Journalismus sich auf sich ständig weiterentwickelnde Versionen von Nachrichtenartikeln hinbewegt.

-

How NewsDiffs Works
-

NewsDiffs regularly looks at the stories that are linked to (or have been linked to) -from the homepage of major online news publications, starting with nytimes.com and cnn.com. It parses them and stores them in a git repository. -

The records start June 17, 2012. -

Not all articles are stored. Only those with changes are displayed. NewsDiffs focuses mostly on ones that -are linked from the homepage. +

Wie NewsDiffs funktioniert
+

NewsDiffs überprüft regelmäßig die Artikel, die auf den Titelseiten von Onlineausgaben einiger großer deutscher Nachrichtenagenturen verlinkt sind (oder einmal verlinkt waren), u.a. +Zeit Online und Die Welt. Die Artikel werden von NewsDiffs geparst und in einem git Repository gespeichert. +

Diese Speicherung findet seit 2015 statt. +

Nicht alle Artikel der Online-Zeitungen werden gespeichert, NewsDiffs konzentriert sich auf die Artikel, die von der Titelseite verlinkt sind. Nur Artikel, an denen Änderungen vorgenommen wurden, werden angezeigt.

-

What Tools Did You Use?
-

The NewsDiffs source code is available on Github. -

The front end used to view the differences is from the open-source Diff Match Patch library. -

The website is built on Django. -

The prettiness is courtesy of Twitter Bootstrap, which -has -been saving developers from themselves the world -over. +

Welche Tools habt ihr eingesetzt?
+

Der Quelltext zu NewsDiffs ist frei verfügbar auf GitHub. +

Das Frontend, das zum Einsehen der Unterschiede verwendet wird, ist aus der Open-Source Diff Match Patch Library. +

Die Webseite wurde mit Django gebaut. +

Das schöne Layout haben wir Twitter Bootstrap zu verdanken -

Who created NewsDiffs?
-

NewsDiffs is the product of a weekend of work from the Knight Mozilla MIT -hackathon, -by Eric Price, Jennifer 8. Lee and Greg Price. - -

Greg, who works at Tddium, has his masters in theoretical computer science from MIT and a bachelors in -mathematics from Harvard. (He also led the YouTomb project, which tracked videos removed from -YouTube). Eric is currently in his fourth year of a PhD in theoretical computer science from MIT. Jenny was a reporter at The New York Times for nine years, wonders what it's like to be a product manager and has been -tortured by missing semicolons.

+
Wer steckt hinter NewsDiffs?
+

NewsDiffs.de ist im Sommersemester 2015 als Projekt einiger IMI-Studenten der HTW Berlin auf der Grundlage von NewsDiffs.org entstanden. +

Ziel war das Anpassen der Webseite auf den deutschen Nachrichtenraum und der Ausbau durch neue Features. Dafür wurde zunächst NewsDiffs.org geklont, welches während des Knight Mozilla MIT +hackathons 2012 +von Eric Price, Jennifer 8. Lee und Greg Price geschaffen wurde.

{% endblock %} From 1ca3ea88964a41067ba05975dc8d969c1c5cac2a Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 19:30:27 +0200 Subject: [PATCH 12/64] translated article_history.html --- website/frontend/templates/article_history.html | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/website/frontend/templates/article_history.html b/website/frontend/templates/article_history.html index c8c5ec64..61a48a0d 100644 --- a/website/frontend/templates/article_history.html +++ b/website/frontend/templates/article_history.html @@ -1,19 +1,19 @@ {% extends 'template.html' %} -{% block title %}Article View{% endblock %} +{% block title %}Artikelansicht{% endblock %} {% block content %} {% include "find_by_uri.html" %} -

{{article.latest_version.title}} ({{article.publication}}), Change Log

+

{{article.latest_version.title}} ({{article.publication}}), Änderungsübersicht

{{article.url}}

-

{{article.latest_version.byline}} | First archived on {{article.initial_date}}

+

{{article.latest_version.byline}} | Zuerst archiviert am {{article.initial_date}}

- - + + @@ -22,7 +22,7 @@

{{article.latest_version.byline}} | First archived on {{article.initial_date

{% if difflink %} - + {% else %} {% endif %} From 7abf9ca5331f99dbc1b936cf1b226d7fbbb0ac2e Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 19:31:56 +0200 Subject: [PATCH 13/64] translated article_history.xml --- website/frontend/templates/article_history.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/frontend/templates/article_history.xml b/website/frontend/templates/article_history.xml index 13906129..9940454b 100644 --- a/website/frontend/templates/article_history.xml +++ b/website/frontend/templates/article_history.xml @@ -1,6 +1,6 @@ - {{article.latest_version.title}}, Change Log + {{article.latest_version.title}}, Änderungsübersicht http://{{request.META.HTTP_HOST}}{% url article_history_feed article.filename %} From 89e4aad3d3ba665f68ed0de5f4522d47ae296897 Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 19:33:17 +0200 Subject: [PATCH 14/64] translated browse.html --- website/frontend/templates/browse.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/frontend/templates/browse.html b/website/frontend/templates/browse.html index 36345f88..b2a96d5b 100644 --- a/website/frontend/templates/browse.html +++ b/website/frontend/templates/browse.html @@ -2,6 +2,6 @@ {% block active_browse %}class="active"{% endblock %} -{% block title %}Changes{% endblock %} +{% block title %}Änderungen{% endblock %} -{% block browse_fromline %}| {% if source %}{{source}}{% else %}All Sources{% endif %}{% endblock %} +{% block browse_fromline %}| {% if source %}{{source}}{% else %}Alle Quellen{% endif %}{% endblock %} From 416b2e78c7974786acf2374ec2e6e1f8691f152a Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 19:48:12 +0200 Subject: [PATCH 15/64] translated browse_base.html --- website/frontend/templates/browse_base.html | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/frontend/templates/browse_base.html 
b/website/frontend/templates/browse_base.html index 77e533c2..2ef856cb 100644 --- a/website/frontend/templates/browse_base.html +++ b/website/frontend/templates/browse_base.html @@ -6,7 +6,7 @@ {% include "find_by_uri.html" %} -

Changed Articles {% block browse_fromline %}{% endblock browse_fromline %}

-

Starting {{first_update|date:"F d, Y"}} (with occasional downtime)

+

Veränderte Artikel {% block browse_fromline %}{% endblock browse_fromline %}

+

Erstmals gespeichert {{first_update|date:"F d, Y"}} (mit vereinzelten Ausfallzeiten)

HeadlineDate/Time EST ArchivedSchlagzeileDatum/Zeit der Archivierung Diff
{{version.title}} {{version.date}}(Compare with previous)(Vergleich mit vorheriger Version)
- + {% for article, last_version, versions in articles %} {% for difflink, version in versions %} {% if difflink %} - + {% else %} {% endif %} From e600556a08e0c990e18e5d654184d6646bbcb5cf Mon Sep 17 00:00:00 2001 From: Relana Streckenbach Date: Tue, 28 Apr 2015 20:00:02 +0200 Subject: [PATCH 16/64] translated contact.html --- website/frontend/templates/contact.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/frontend/templates/contact.html b/website/frontend/templates/contact.html index 55bc0d8b..80e40498 100644 --- a/website/frontend/templates/contact.html +++ b/website/frontend/templates/contact.html @@ -2,10 +2,10 @@ {% block active_contact %}class="active"{% endblock %} -{% block title %}Contact{% endblock title%} +{% block title %}Kontakt{% endblock title%} {% block content %} -

Comments, Feedback, Criticism?

+

Kommentare, Feedback, Kritik?

- - - - - - - - - - - - - -
- -
-Find article by full URL: - - - - - - - - -

Changed Articles | All Sources

-

Starting November 24, 2012 (with occasional downtime)

-

- -

ArticleVersionDiff
ArtikelVersionDiff
{{last_version.title}} ({{article.publication}})
{{last_version.byline}}
{{version.date}}(Compare)
(Vergleichen)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ArticleVersionDiff
Once Ubiquitous on Campus, ‘Paterno’ Is No Longer Uttered (NYT)
By BILL PENNINGTON
Nov. 25, 2012, 12:15 a.m.(Compare)
Nov. 24, 2012, 5:43 p.m.
Sex attack on schoolgirl, 11, in Enfield park (BBC)
Nov. 25, 2012, 12:13 a.m.(Compare)
Nov. 24, 2012, 5:42 p.m.
Top Egyptian judicial body rips Morsy (CNN)
Mohamed Fadel Fahmy and Reza Sayah, CNN
Nov. 25, 2012, 12:12 a.m.(Compare)
Nov. 24, 2012, 6:44 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
Missouri Learns From Costly Turnovers in Loss to Slip Past V.C.U. (NYT)
By THE ASSOCIATED PRESS
Nov. 25, 2012, 12:09 a.m.(Compare)
Nov. 24, 2012, 10:17 p.m.
Student Killed in Melee at Afghan University (NYT)
By AZAM AHMED
Nov. 25, 2012, 12:09 a.m.(Compare)
Nov. 24, 2012, 10:16 p.m.
Turnovers Cost Florida State Against Gators (NYT)
By THE ASSOCIATED PRESS
Nov. 25, 2012, 12:09 a.m.(Compare)
Nov. 24, 2012, 10:16 p.m.
Sunday talk show tip sheet (Politico)
By Katie Glueck
Nov. 24, 2012, 10:22 p.m.(Compare)
Nov. 24, 2012, 5:45 p.m.
Hamas Claim Of Progress Complicates Talk of Truce (NYT)
By JODI RUDOREN
Nov. 24, 2012, 10:22 p.m.(Compare)
Nov. 24, 2012, 5:44 p.m.
A Fragile Cease-Fire Achieved by Leaving Thorny Issues Unresolved (NYT)
By MICHAEL R. GORDON
Nov. 24, 2012, 10:21 p.m.(Compare)
Nov. 24, 2012, 5:43 p.m.
Neuroscience: Under Attack (NYT)
By ALISSA QUART
Nov. 24, 2012, 10:20 p.m.(Compare)
Nov. 24, 2012, 5:43 p.m.
Hector Camacho, 50, Boxer Known for His Quick Hands (NYT)
By BRUCE WEBER
Nov. 24, 2012, 10:19 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
Bill Press is keeping it ‘Current’ (Politico)
By Mackenzie Weinger
Nov. 24, 2012, 10:19 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
Woman killed by tree amid more storms (BBC)
Nov. 24, 2012, 10:19 p.m.(Compare)
Nov. 24, 2012, 6:44 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
Away-day blues continue for Real Madrid (CNN)
Nov. 24, 2012, 10:19 p.m.(Compare)
Nov. 24, 2012, 6:43 p.m.
Egyptian Judges Challenge Morsi Over New Power (NYT)
By DAVID D. KIRKPATRICK
Nov. 24, 2012, 10:18 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
PATH Service to Resume Into Lower Manhattan (NYT)
By MARC SANTORA
Nov. 24, 2012, 10:18 p.m.(Compare)
Nov. 24, 2012, 5:40 p.m.
Beating Their Rival, Buckeyes Finish Unbeaten, Untied and Unfulfilled (NYT)
By TODD JONES
Nov. 24, 2012, 10:17 p.m.(Compare)
Nov. 24, 2012, 6:44 p.m.(Compare)
Nov. 24, 2012, 5:40 p.m.
Larry Hagman, ‘Dallas’ Villain With Sinister Smile, Dies at 81 (NYT)
By ENID NEMY
Nov. 24, 2012, 6:46 p.m.(Compare)
Nov. 24, 2012, 5:44 p.m.
Odd Spin on Quarterback Controversy (NYT)
By JOHN BRANCH
Nov. 24, 2012, 6:46 p.m.(Compare)
Nov. 24, 2012, 5:44 p.m.
Larry Hagman, the man behind iconic villain J.R. Ewing, dies (CNN)
Chelsea J. Carter and Greg Botelho, CNN
Nov. 24, 2012, 6:46 p.m.(Compare)
Nov. 24, 2012, 5:43 p.m.
Goal to Go (NYT)
By CHARLES SIEBERT
Nov. 24, 2012, 6:45 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
NJ governor puts Sandy damage at $29.4 billion (CNN)
Julia Talanova and Rande Iaboni, CNN
Nov. 24, 2012, 6:44 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
On This, 2 Sides Agree: Fighting Hardened Positions (NYT)
By JODI RUDOREN and ISABEL KERSHNER
Nov. 24, 2012, 6:44 p.m.(Compare)
Nov. 24, 2012, 5:41 p.m.
-1 - - -


- - - - - - - - - - - diff --git a/website/frontend/templates/examples.html b/website/frontend/templates/examples.html index e421d8ca..3e446b47 100644 --- a/website/frontend/templates/examples.html +++ b/website/frontend/templates/examples.html @@ -6,7 +6,7 @@

Highlights

Hier haben wir ein paar besonders interessante Änderungen zusammengestellt. Hast du eine interessante beim stöbern gefunden, die du uns -mitteilen magst?

+mitteilen magst?