From 3baadf0c80b87318d6d315df63c1d3b30b37ea52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Mon, 21 Sep 2015 14:13:21 -0300 Subject: [PATCH 1/4] Initial submission: crawler for Valor --- capture/crawler_valor.py | 203 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100755 capture/crawler_valor.py diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py new file mode 100755 index 0000000..b28805a --- /dev/null +++ b/capture/crawler_valor.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +from goose import Goose +import pymongo +from bs4 import BeautifulSoup +import requests +import re +import pandas as pd +import datetime +import zlib +import cPickle as CP +import cld +import sys +from requests.exceptions import ConnectionError, MissingSchema, Timeout +import bson +import settings +import logging_mc + +logger = logging_mc.get_logger( 'valor' ) + + +client = pymongo.MongoClient(settings.MONGOHOST, 27017) +MCDB = client.MCDB +ARTICLES = MCDB.articles # Article Collection +ARTICLES.ensure_index("source") + +def find_articles(): + """ + Get the urls of last news + :return: last news' urls of all categories + :rtype: set() + """ + urls = ['http://www.valor.com.br/ultimas-noticias/brasil', + 'http://www.valor.com.br/ultimas-noticias/politica', + 'http://www.valor.com.br/ultimas-noticias/financas', + 'http://www.valor.com.br/ultimas-noticias/empresas', + 'http://www.valor.com.br/ultimas-noticias/agro', + 'http://www.valor.com.br/ultimas-noticias/internacional', + 'http://www.valor.com.br/ultimas-noticias/opiniao', + 'http://www.valor.com.br/ultimas-noticias/legislacao', + 'http://www.valor.com.br/ultimas-noticias/carreira', + 'http://www.valor.com.br/ultimas-noticias/cultura'] + news_urls = list() + for INDEX_URL in urls: + index = requests.get(INDEX_URL).content + soup = BeautifulSoup(index, "lxml") + news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') + news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] + return set(news_urls ) + +def get_published_time(soup): + """ + Get the news' published datetime + :param soup: object with news html page + :type soup: BeautifulSoup object + :return: news published datetime + :rtype: string + """ + try: + time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text + except IndexError: + logger.error('wrong time tag') + return None + if time_tag is None: + return None + else: + try: + published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') + except ValueError: + logger.error('wrong date extraction') + return None + return published_time + +def extract_title(article): + """ + Extract the news title. + """ + + try: + title = article.title + except Exception as ex: + template = "An exception of type {0} occured during extraction of news title. Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return title + +def extract_content(article): + """ + Extract relevant information about news page + """ + + try: + body_content = article.cleaned_text + except Exception as ex: + template = "An exception of type {0} occured during extraction of news content. 
Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return body_content + +def detect_language(text): + """ + Detect the language of text using chromium_compact_language_detector + :param text: text to be analyzed + :return: {"name": portuguese, "pt"} + """ + name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8')) + return {"name": name, "code": code} + +def compress_content(html): + """ + Compresses and encodes html content so that it can be BSON encoded an store in mongodb + :param html: original html document + :return: compressed an b64 encoded document + """ + pickled = CP.dumps(html, CP.HIGHEST_PROTOCOL) + squished = zlib.compress(pickled) + encoded = bson.Binary(squished) # b64.urlsafe_b64encode(squished) + return encoded + +def decompress_content(compressed_html): + """ + Decompress data compressed by `compress_content` + :param compressed_html: compressed html document + :return: original html + """ + # unencoded = b64.urlsafe_b64decode(str(compressed_html)) + decompressed = zlib.decompress(compressed_html) + orig_html = CP.loads(decompressed) + return orig_html + + +def download_article(url): + """ + Download the html content of a news page + :param url: news page's url + :type url: string + :return: news page's content + :rtype: requests.models.Response + """ + article = { + 'link': url, + 'source': 'crawler_Valor', + } + logger.info("Downloading article: %s", url) + try: + response = requests.get(url, timeout=30) + except ConnectionError: + logger.error("Failed to fetch %s", url) + return + except Timeout: + logger.error("Timed out while fetching %s", url) + return + + encoding = response.encoding if response.encoding is not None else 'utf8' + dec_content = response.content.decode(encoding) + soup = BeautifulSoup(dec_content, "lxml") + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + news = extractor.extract(url=url) + + article['link_content'] = compress_content(dec_content) + article['compressed'] = True + article['language'] = detect_language(dec_content) + article['title'] = extract_title(news) + article['published'] = get_published_time(soup) + article['main_text'] = extract_content(news) + + return article + +if __name__ =='__main__': + for url in find_articles(): + print url + exists = list(ARTICLES.find({"link": url})) + if not exists: + article = download_article(url) + print 'download done' + ARTICLES.insert(article, w=1) + print 'salved' + else: + print 'it already exists' + + + + + + + + + + + + + + + + + + + + + + + From e5f5ee1a60894087eb423be239349e26db248d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Mon, 21 Sep 2015 21:32:35 -0300 Subject: [PATCH 2/4] I fixed the problem related to line breaking, to the new style string formatting, to print statements and to unnecessary imports, as suggested for @flavioamieiro. Please take a look at this code. 
--- capture/crawler_valor.py | 197 +++++++++++++++++---------------------- capture/logging_mc.py | 32 +++++++ 2 files changed, 117 insertions(+), 112 deletions(-) create mode 100644 capture/logging_mc.py diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py index b28805a..a587f7b 100755 --- a/capture/crawler_valor.py +++ b/capture/crawler_valor.py @@ -3,76 +3,72 @@ import pymongo from bs4 import BeautifulSoup import requests -import re -import pandas as pd import datetime import zlib import cPickle as CP import cld -import sys -from requests.exceptions import ConnectionError, MissingSchema, Timeout +from requests.exceptions import ConnectionError, Timeout import bson import settings import logging_mc logger = logging_mc.get_logger( 'valor' ) - client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB ARTICLES = MCDB.articles # Article Collection ARTICLES.ensure_index("source") def find_articles(): - """ - Get the urls of last news - :return: last news' urls of all categories - :rtype: set() - """ - urls = ['http://www.valor.com.br/ultimas-noticias/brasil', - 'http://www.valor.com.br/ultimas-noticias/politica', - 'http://www.valor.com.br/ultimas-noticias/financas', - 'http://www.valor.com.br/ultimas-noticias/empresas', - 'http://www.valor.com.br/ultimas-noticias/agro', - 'http://www.valor.com.br/ultimas-noticias/internacional', - 'http://www.valor.com.br/ultimas-noticias/opiniao', - 'http://www.valor.com.br/ultimas-noticias/legislacao', - 'http://www.valor.com.br/ultimas-noticias/carreira', - 'http://www.valor.com.br/ultimas-noticias/cultura'] - news_urls = list() - for INDEX_URL in urls: - index = requests.get(INDEX_URL).content - soup = BeautifulSoup(index, "lxml") - news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') - news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] - return set(news_urls ) + """ + Get the urls of last news + :return: last news' urls of all categories + :rtype: set() + """ + urls = ['http://www.valor.com.br/ultimas-noticias/brasil', + 'http://www.valor.com.br/ultimas-noticias/politica', + 'http://www.valor.com.br/ultimas-noticias/financas', + 'http://www.valor.com.br/ultimas-noticias/empresas', + 'http://www.valor.com.br/ultimas-noticias/agro', + 'http://www.valor.com.br/ultimas-noticias/internacional', + 'http://www.valor.com.br/ultimas-noticias/opiniao', + 'http://www.valor.com.br/ultimas-noticias/legislacao', + 'http://www.valor.com.br/ultimas-noticias/carreira', + 'http://www.valor.com.br/ultimas-noticias/cultura'] + news_urls = list() + for INDEX_URL in urls: + index = requests.get(INDEX_URL).content + soup = BeautifulSoup(index, "lxml") + news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') + news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] + return set(news_urls ) def get_published_time(soup): - """ - Get the news' published datetime - :param soup: object with news html page + """ + Get the news' published datetime + :param soup: object with news html page :type soup: BeautifulSoup object :return: news published datetime :rtype: string - """ - try: - time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text - except IndexError: - logger.error('wrong time tag') - return None - if time_tag is None: - return None - else: - try: - 
published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') - except ValueError: - logger.error('wrong date extraction') - return None - return published_time + """ + try: + time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text + except IndexError: + logger.error('wrong time tag') + return None + if time_tag is None: + return None + else: + try: + published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') + except ValueError: + logger.error('wrong date extraction') + return None + return published_time def extract_title(article): """ - Extract the news title. + Extract the news title. """ try: @@ -86,7 +82,7 @@ def extract_title(article): def extract_content(article): """ - Extract relevant information about news page + Extract relevant information about news page """ try: @@ -100,7 +96,7 @@ def extract_content(article): def detect_language(text): """ - Detect the language of text using chromium_compact_language_detector + Detect the language of text using chromium_compact_language_detector :param text: text to be analyzed :return: {"name": portuguese, "pt"} """ @@ -131,73 +127,50 @@ def decompress_content(compressed_html): def download_article(url): - """ - Download the html content of a news page + """ + Download the html content of a news page :param url: news page's url :type url: string :return: news page's content - :rtype: requests.models.Response - """ - article = { + :rtype: requests.models.Response + """ + article = { 'link': url, 'source': 'crawler_Valor', } - logger.info("Downloading article: %s", url) - try: - response = requests.get(url, timeout=30) - except ConnectionError: - logger.error("Failed to fetch %s", url) - return - except Timeout: - logger.error("Timed out while fetching %s", url) - return - - encoding = response.encoding if response.encoding is not None else 'utf8' - dec_content = response.content.decode(encoding) - soup = BeautifulSoup(dec_content, "lxml") - extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) - news = extractor.extract(url=url) - - article['link_content'] = compress_content(dec_content) - article['compressed'] = True - article['language'] = detect_language(dec_content) - article['title'] = extract_title(news) - article['published'] = get_published_time(soup) - article['main_text'] = extract_content(news) - - return article - -if __name__ =='__main__': - for url in find_articles(): - print url - exists = list(ARTICLES.find({"link": url})) - if not exists: - article = download_article(url) - print 'download done' - ARTICLES.insert(article, w=1) - print 'salved' - else: - print 'it already exists' - - - - - - - - - - - - - - - - - - - - - - - + logger.info("Downloading article: {0}".format(url)) + try: + response = requests.get(url, timeout=30) + except ConnectionError: + logger.error("Failed to fetch {0}".format(url)) + return + except Timeout: + logger.error("Timed out while fetching {0}".format(url)) + return + + encoding = response.encoding if response.encoding is not None else 'utf8' + dec_content = response.content.decode(encoding) + soup = BeautifulSoup(dec_content, "lxml") + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + news = extractor.extract(url=url) + + article['link_content'] = compress_content(dec_content) + article['compressed'] = True + article['language'] = detect_language(dec_content) + article['title'] = extract_title(news) + article['published'] = get_published_time(soup) + 
article['main_text'] = extract_content(news) + + return article + +if __name__ =='__main__': + for url in find_articles(): + logger.info("url: {0}".format(url)) + exists = list(ARTICLES.find({"link": url})) + if not exists: + article = download_article(url) + logger.info("Download done") + ARTICLES.insert(article, w=1) + logger.info("Saved") + else: + logger.info("It already exists") \ No newline at end of file diff --git a/capture/logging_mc.py b/capture/logging_mc.py new file mode 100644 index 0000000..564821a --- /dev/null +++ b/capture/logging_mc.py @@ -0,0 +1,32 @@ +import logging +from logging.handlers import RotatingFileHandler + +def get_logger( source ): + """ + Responsable for save logs of operations + :return: logger configured based on source + :rtype: logging.getLogger( source) + + """ + + + logger = logging.getLogger(source) + logger.setLevel(logging.DEBUG) + +# create stream handler and set level to debug + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.DEBUG) + file_handler = RotatingFileHandler( '/tmp/mediacloud_{0}.log'.format(source), maxBytes=5e6, backupCount=3) + +# create formatter + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# add formatter to stream_handler + stream_handler.setFormatter(formatter) + file_handler.setFormatter(formatter) + +# add stream_handler to logger + logger.addHandler(stream_handler) # uncomment for console output of messages + logger.addHandler(file_handler) + + return logger \ No newline at end of file From 53e12276903f798c90297f1d1f5c973fba4944f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Tue, 22 Sep 2015 02:16:49 -0300 Subject: [PATCH 3/4] crawler for ZH --- capture/crawler_zh.py | 174 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 capture/crawler_zh.py diff --git a/capture/crawler_zh.py b/capture/crawler_zh.py new file mode 100644 index 0000000..c5631ec --- /dev/null +++ b/capture/crawler_zh.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +from goose import Goose +import pymongo +from bs4 import BeautifulSoup +import requests +import datetime +import zlib +import cPickle as CP +import cld +from requests.exceptions import ConnectionError, Timeout +import bson +import settings +import logging_mc +import re + +logger = logging_mc.get_logger( 'ZH' ) + +client = pymongo.MongoClient(settings.MONGOHOST, 27017) +MCDB = client.MCDB +ARTICLES = MCDB.articles # Article Collection +ARTICLES.ensure_index("source") + +def find_articles(): + """ + Get the urls of last news + :return: last news' urls of all categories + :rtype: set() + """ + urls = ['http://zh.clicrbs.com.br/rs/noticias/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/entretenimento/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/esportes/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/porto-alegre/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/vida-e-estilo/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/ultimas-noticias/'] + news_urls = list() + for INDEX_URL in urls: + index = requests.get(INDEX_URL).content + soup = BeautifulSoup(index, "lxml") + news_index = soup.find_all(attrs={'class': re.compile(r".*\bmateria-manchete\b.*")}) + news_urls = news_urls + ['' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] + return set(news_urls ) + +def get_published_time(soup): + """ + Get the news' published datetime + :param soup: object with news html page + :type soup: BeautifulSoup 
object + :return: news published datetime + :rtype: string + """ + try: + time_tag = soup.find('div', class_='meta__date').text + except IndexError: + logger.error('wrong time tag') + return None + if time_tag is None: + return None + else: + try: + match = re.search(r'\d{2}/\d{2}/\d{4} - \d{2}h\d{2}min', time_tag.encode('utf8')) + published_time = datetime.datetime.strptime(match.group(), '%d/%m/%Y - %Hh%Mmin') + except ValueError: + logger.error('wrong date extraction') + return None + return published_time + +def extract_title(article): + """ + Extract the news title. + """ + + try: + title = article.title + except Exception as ex: + template = "An exception of type {0} occured during extraction of news title. Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return title + +def extract_content(article): + """ + Extract relevant information about news page + """ + + try: + body_content = article.cleaned_text + except Exception as ex: + template = "An exception of type {0} occured during extraction of news content. Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return body_content + +def detect_language(text): + """ + Detect the language of text using chromium_compact_language_detector + :param text: text to be analyzed + :return: {"name": portuguese, "pt"} + """ + name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8')) + return {"name": name, "code": code} + +def compress_content(html): + """ + Compresses and encodes html content so that it can be BSON encoded an store in mongodb + :param html: original html document + :return: compressed an b64 encoded document + """ + pickled = CP.dumps(html, CP.HIGHEST_PROTOCOL) + squished = zlib.compress(pickled) + encoded = bson.Binary(squished) # b64.urlsafe_b64encode(squished) + return encoded + +def decompress_content(compressed_html): + """ + Decompress data compressed by `compress_content` + :param compressed_html: compressed html document + :return: original html + """ + # unencoded = b64.urlsafe_b64decode(str(compressed_html)) + decompressed = zlib.decompress(compressed_html) + orig_html = CP.loads(decompressed) + return orig_html + + +def download_article(url): + """ + Download the html content of a news page + :param url: news page's url + :type url: string + :return: news page's content + :rtype: requests.models.Response + """ + article = { + 'link': url, + 'source': 'crawler_ZH', + } + logger.info("Downloading article: {0}".format(url)) + try: + response = requests.get(url, timeout=30) + except ConnectionError: + logger.error("Failed to fetch {0}".format(url)) + return + except Timeout: + logger.error("Timed out while fetching {0}".format(url)) + return + + encoding = response.encoding if response.encoding is not None else 'utf8' + dec_content = response.content.decode(encoding) + soup = BeautifulSoup(dec_content, "lxml") + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + news = extractor.extract(url=url) + + article['link_content'] = compress_content(dec_content) + article['compressed'] = True + article['language'] = detect_language(dec_content) + article['title'] = extract_title(news) + article['published'] = get_published_time(soup) + article['main_text'] = extract_content(news) + + return article + +if __name__ =='__main__': + for url in find_articles(): + logger.info("url: {0}".format(url)) + exists = list(ARTICLES.find({"link": url})) + if not 
exists: + article = download_article(url) + logger.info("Download done") + ARTICLES.insert(article, w=1) + logger.info("Saved") + else: + logger.info("It already exists") \ No newline at end of file From b644a5f618a032b16814279108b184609d316128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Tue, 22 Sep 2015 20:14:04 -0300 Subject: [PATCH 4/4] I fixed de problems related to new linecharacter at the end of the files and to some unnecessary spaces. --- capture/crawler_valor.py | 12 ++++++------ capture/crawler_zh.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py index a587f7b..8412b8d 100755 --- a/capture/crawler_valor.py +++ b/capture/crawler_valor.py @@ -12,7 +12,7 @@ import settings import logging_mc -logger = logging_mc.get_logger( 'valor' ) +logger = logging_mc.get_logger('valor') client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB @@ -40,8 +40,8 @@ def find_articles(): index = requests.get(INDEX_URL).content soup = BeautifulSoup(index, "lxml") news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') - news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] - return set(news_urls ) + news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup(art.encode('utf8'),"lxml").find('a').attrs['href'] for art in news_index] + return set(news_urls) def get_published_time(soup): """ @@ -60,7 +60,7 @@ def get_published_time(soup): return None else: try: - published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') + published_time = datetime.datetime.strptime(time_tag.encode('utf8'), '%d/%m/%Y às %Hh%M') except ValueError: logger.error('wrong date extraction') return None @@ -151,7 +151,7 @@ def download_article(url): encoding = response.encoding if response.encoding is not None else 'utf8' dec_content = response.content.decode(encoding) soup = BeautifulSoup(dec_content, "lxml") - extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) news = extractor.extract(url=url) article['link_content'] = compress_content(dec_content) @@ -173,4 +173,4 @@ def download_article(url): ARTICLES.insert(article, w=1) logger.info("Saved") else: - logger.info("It already exists") \ No newline at end of file + logger.info("It already exists") diff --git a/capture/crawler_zh.py b/capture/crawler_zh.py index c5631ec..fefd971 100644 --- a/capture/crawler_zh.py +++ b/capture/crawler_zh.py @@ -13,7 +13,7 @@ import logging_mc import re -logger = logging_mc.get_logger( 'ZH' ) +logger = logging_mc.get_logger('ZH') client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB @@ -37,8 +37,8 @@ def find_articles(): index = requests.get(INDEX_URL).content soup = BeautifulSoup(index, "lxml") news_index = soup.find_all(attrs={'class': re.compile(r".*\bmateria-manchete\b.*")}) - news_urls = news_urls + ['' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] - return set(news_urls ) + news_urls = news_urls + ['' + BeautifulSoup(art.encode('utf8'), "lxml").find('a').attrs['href'] for art in news_index] + return set(news_urls) def get_published_time(soup): """ @@ -171,4 +171,4 @@ def download_article(url): ARTICLES.insert(article, w=1) logger.info("Saved") else: - logger.info("It already exists") \ No 
newline at end of file + logger.info("It already exists")
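
A note on the duplicate check shared by both crawlers: list(ARTICLES.find({"link": url})) followed by a separate insert fetches full documents just to test existence and leaves a window where two concurrent runs can store the same link twice. If the collection can carry a unique index on "link", the existence check and the write collapse into a single upsert. The sketch below is an assumption, not part of this series: it presumes pymongo 2.x (consistent with the ensure_index/insert calls in these patches), presumes no duplicate links are already stored (otherwise the unique index build fails), and reuses find_articles/download_article from crawler_valor.py as they appear in patch 4.

    # -*- coding: utf-8 -*-
    # Sketch only: unique index on "link" and the upsert flow are assumptions,
    # not part of the current schema; download_article() returns None on fetch
    # errors, which is handled explicitly here.
    import pymongo
    import settings
    import logging_mc
    from crawler_valor import find_articles, download_article

    logger = logging_mc.get_logger('valor')

    client = pymongo.MongoClient(settings.MONGOHOST, 27017)
    ARTICLES = client.MCDB.articles
    ARTICLES.ensure_index("source")
    ARTICLES.ensure_index("link", unique=True)  # assumed: no duplicate links stored yet

    if __name__ == '__main__':
        for url in find_articles():
            if ARTICLES.find_one({"link": url}):
                logger.info("It already exists: {0}".format(url))
                continue
            article = download_article(url)
            if article is None:
                # ConnectionError/Timeout already logged inside download_article
                continue
            # Upsert keyed on the link, so a concurrent run cannot insert the
            # same URL twice; the unique index turns any remaining race into a
            # DuplicateKeyError instead of a silent duplicate document.
            ARTICLES.update({"link": url}, {"$set": article}, upsert=True)
            logger.info("Saved: {0}".format(url))

The same pattern would drop straight into crawler_zh.py, since its __main__ block is identical apart from the source name, and it also removes the need to guard against inserting None when a download fails.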