From 3baadf0c80b87318d6d315df63c1d3b30b37ea52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Mon, 21 Sep 2015 14:13:21 -0300 Subject: [PATCH 1/4] Initial submission: crawler for Valor --- capture/crawler_valor.py | 203 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100755 capture/crawler_valor.py diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py new file mode 100755 index 0000000..b28805a --- /dev/null +++ b/capture/crawler_valor.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +from goose import Goose +import pymongo +from bs4 import BeautifulSoup +import requests +import re +import pandas as pd +import datetime +import zlib +import cPickle as CP +import cld +import sys +from requests.exceptions import ConnectionError, MissingSchema, Timeout +import bson +import settings +import logging_mc + +logger = logging_mc.get_logger( 'valor' ) + + +client = pymongo.MongoClient(settings.MONGOHOST, 27017) +MCDB = client.MCDB +ARTICLES = MCDB.articles # Article Collection +ARTICLES.ensure_index("source") + +def find_articles(): + """ + Get the urls of last news + :return: last news' urls of all categories + :rtype: set() + """ + urls = ['http://www.valor.com.br/ultimas-noticias/brasil', + 'http://www.valor.com.br/ultimas-noticias/politica', + 'http://www.valor.com.br/ultimas-noticias/financas', + 'http://www.valor.com.br/ultimas-noticias/empresas', + 'http://www.valor.com.br/ultimas-noticias/agro', + 'http://www.valor.com.br/ultimas-noticias/internacional', + 'http://www.valor.com.br/ultimas-noticias/opiniao', + 'http://www.valor.com.br/ultimas-noticias/legislacao', + 'http://www.valor.com.br/ultimas-noticias/carreira', + 'http://www.valor.com.br/ultimas-noticias/cultura'] + news_urls = list() + for INDEX_URL in urls: + index = requests.get(INDEX_URL).content + soup = BeautifulSoup(index, "lxml") + news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') + news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] + return set(news_urls ) + +def get_published_time(soup): + """ + Get the news' published datetime + :param soup: object with news html page + :type soup: BeautifulSoup object + :return: news published datetime + :rtype: string + """ + try: + time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text + except IndexError: + logger.error('wrong time tag') + return None + if time_tag is None: + return None + else: + try: + published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') + except ValueError: + logger.error('wrong date extraction') + return None + return published_time + +def extract_title(article): + """ + Extract the news title. + """ + + try: + title = article.title + except Exception as ex: + template = "An exception of type {0} occured during extraction of news title. Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return title + +def extract_content(article): + """ + Extract relevant information about news page + """ + + try: + body_content = article.cleaned_text + except Exception as ex: + template = "An exception of type {0} occured during extraction of news content. 
Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return body_content + +def detect_language(text): + """ + Detect the language of text using chromium_compact_language_detector + :param text: text to be analyzed + :return: {"name": portuguese, "pt"} + """ + name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8')) + return {"name": name, "code": code} + +def compress_content(html): + """ + Compresses and encodes html content so that it can be BSON encoded an store in mongodb + :param html: original html document + :return: compressed an b64 encoded document + """ + pickled = CP.dumps(html, CP.HIGHEST_PROTOCOL) + squished = zlib.compress(pickled) + encoded = bson.Binary(squished) # b64.urlsafe_b64encode(squished) + return encoded + +def decompress_content(compressed_html): + """ + Decompress data compressed by `compress_content` + :param compressed_html: compressed html document + :return: original html + """ + # unencoded = b64.urlsafe_b64decode(str(compressed_html)) + decompressed = zlib.decompress(compressed_html) + orig_html = CP.loads(decompressed) + return orig_html + + +def download_article(url): + """ + Download the html content of a news page + :param url: news page's url + :type url: string + :return: news page's content + :rtype: requests.models.Response + """ + article = { + 'link': url, + 'source': 'crawler_Valor', + } + logger.info("Downloading article: %s", url) + try: + response = requests.get(url, timeout=30) + except ConnectionError: + logger.error("Failed to fetch %s", url) + return + except Timeout: + logger.error("Timed out while fetching %s", url) + return + + encoding = response.encoding if response.encoding is not None else 'utf8' + dec_content = response.content.decode(encoding) + soup = BeautifulSoup(dec_content, "lxml") + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + news = extractor.extract(url=url) + + article['link_content'] = compress_content(dec_content) + article['compressed'] = True + article['language'] = detect_language(dec_content) + article['title'] = extract_title(news) + article['published'] = get_published_time(soup) + article['main_text'] = extract_content(news) + + return article + +if __name__ =='__main__': + for url in find_articles(): + print url + exists = list(ARTICLES.find({"link": url})) + if not exists: + article = download_article(url) + print 'download done' + ARTICLES.insert(article, w=1) + print 'salved' + else: + print 'it already exists' + + + + + + + + + + + + + + + + + + + + + + + From e5f5ee1a60894087eb423be239349e26db248d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Mon, 21 Sep 2015 21:32:35 -0300 Subject: [PATCH 2/4] I fixed the problem related to line breaking, to the new style string formatting, to print statements and to unnecessary imports, as suggested for @flavioamieiro. Please take a look at this code. 
--- capture/crawler_valor.py | 197 +++++++++++++++++---------------------- capture/logging_mc.py | 32 +++++++ 2 files changed, 117 insertions(+), 112 deletions(-) create mode 100644 capture/logging_mc.py diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py index b28805a..a587f7b 100755 --- a/capture/crawler_valor.py +++ b/capture/crawler_valor.py @@ -3,76 +3,72 @@ import pymongo from bs4 import BeautifulSoup import requests -import re -import pandas as pd import datetime import zlib import cPickle as CP import cld -import sys -from requests.exceptions import ConnectionError, MissingSchema, Timeout +from requests.exceptions import ConnectionError, Timeout import bson import settings import logging_mc logger = logging_mc.get_logger( 'valor' ) - client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB ARTICLES = MCDB.articles # Article Collection ARTICLES.ensure_index("source") def find_articles(): - """ - Get the urls of last news - :return: last news' urls of all categories - :rtype: set() - """ - urls = ['http://www.valor.com.br/ultimas-noticias/brasil', - 'http://www.valor.com.br/ultimas-noticias/politica', - 'http://www.valor.com.br/ultimas-noticias/financas', - 'http://www.valor.com.br/ultimas-noticias/empresas', - 'http://www.valor.com.br/ultimas-noticias/agro', - 'http://www.valor.com.br/ultimas-noticias/internacional', - 'http://www.valor.com.br/ultimas-noticias/opiniao', - 'http://www.valor.com.br/ultimas-noticias/legislacao', - 'http://www.valor.com.br/ultimas-noticias/carreira', - 'http://www.valor.com.br/ultimas-noticias/cultura'] - news_urls = list() - for INDEX_URL in urls: - index = requests.get(INDEX_URL).content - soup = BeautifulSoup(index, "lxml") - news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') - news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] - return set(news_urls ) + """ + Get the urls of last news + :return: last news' urls of all categories + :rtype: set() + """ + urls = ['http://www.valor.com.br/ultimas-noticias/brasil', + 'http://www.valor.com.br/ultimas-noticias/politica', + 'http://www.valor.com.br/ultimas-noticias/financas', + 'http://www.valor.com.br/ultimas-noticias/empresas', + 'http://www.valor.com.br/ultimas-noticias/agro', + 'http://www.valor.com.br/ultimas-noticias/internacional', + 'http://www.valor.com.br/ultimas-noticias/opiniao', + 'http://www.valor.com.br/ultimas-noticias/legislacao', + 'http://www.valor.com.br/ultimas-noticias/carreira', + 'http://www.valor.com.br/ultimas-noticias/cultura'] + news_urls = list() + for INDEX_URL in urls: + index = requests.get(INDEX_URL).content + soup = BeautifulSoup(index, "lxml") + news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') + news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] + return set(news_urls ) def get_published_time(soup): - """ - Get the news' published datetime - :param soup: object with news html page + """ + Get the news' published datetime + :param soup: object with news html page :type soup: BeautifulSoup object :return: news published datetime :rtype: string - """ - try: - time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text - except IndexError: - logger.error('wrong time tag') - return None - if time_tag is None: - return None - else: - try: - 
published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') - except ValueError: - logger.error('wrong date extraction') - return None - return published_time + """ + try: + time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text + except IndexError: + logger.error('wrong time tag') + return None + if time_tag is None: + return None + else: + try: + published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') + except ValueError: + logger.error('wrong date extraction') + return None + return published_time def extract_title(article): """ - Extract the news title. + Extract the news title. """ try: @@ -86,7 +82,7 @@ def extract_title(article): def extract_content(article): """ - Extract relevant information about news page + Extract relevant information about news page """ try: @@ -100,7 +96,7 @@ def extract_content(article): def detect_language(text): """ - Detect the language of text using chromium_compact_language_detector + Detect the language of text using chromium_compact_language_detector :param text: text to be analyzed :return: {"name": portuguese, "pt"} """ @@ -131,73 +127,50 @@ def decompress_content(compressed_html): def download_article(url): - """ - Download the html content of a news page + """ + Download the html content of a news page :param url: news page's url :type url: string :return: news page's content - :rtype: requests.models.Response - """ - article = { + :rtype: requests.models.Response + """ + article = { 'link': url, 'source': 'crawler_Valor', } - logger.info("Downloading article: %s", url) - try: - response = requests.get(url, timeout=30) - except ConnectionError: - logger.error("Failed to fetch %s", url) - return - except Timeout: - logger.error("Timed out while fetching %s", url) - return - - encoding = response.encoding if response.encoding is not None else 'utf8' - dec_content = response.content.decode(encoding) - soup = BeautifulSoup(dec_content, "lxml") - extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) - news = extractor.extract(url=url) - - article['link_content'] = compress_content(dec_content) - article['compressed'] = True - article['language'] = detect_language(dec_content) - article['title'] = extract_title(news) - article['published'] = get_published_time(soup) - article['main_text'] = extract_content(news) - - return article - -if __name__ =='__main__': - for url in find_articles(): - print url - exists = list(ARTICLES.find({"link": url})) - if not exists: - article = download_article(url) - print 'download done' - ARTICLES.insert(article, w=1) - print 'salved' - else: - print 'it already exists' - - - - - - - - - - - - - - - - - - - - - - - + logger.info("Downloading article: {0}".format(url)) + try: + response = requests.get(url, timeout=30) + except ConnectionError: + logger.error("Failed to fetch {0}".format(url)) + return + except Timeout: + logger.error("Timed out while fetching {0}".format(url)) + return + + encoding = response.encoding if response.encoding is not None else 'utf8' + dec_content = response.content.decode(encoding) + soup = BeautifulSoup(dec_content, "lxml") + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + news = extractor.extract(url=url) + + article['link_content'] = compress_content(dec_content) + article['compressed'] = True + article['language'] = detect_language(dec_content) + article['title'] = extract_title(news) + article['published'] = get_published_time(soup) + 
article['main_text'] = extract_content(news) + + return article + +if __name__ =='__main__': + for url in find_articles(): + logger.info("url: {0}".format(url)) + exists = list(ARTICLES.find({"link": url})) + if not exists: + article = download_article(url) + logger.info("Download done") + ARTICLES.insert(article, w=1) + logger.info("Saved") + else: + logger.info("It already exists") \ No newline at end of file diff --git a/capture/logging_mc.py b/capture/logging_mc.py new file mode 100644 index 0000000..564821a --- /dev/null +++ b/capture/logging_mc.py @@ -0,0 +1,32 @@ +import logging +from logging.handlers import RotatingFileHandler + +def get_logger( source ): + """ + Responsable for save logs of operations + :return: logger configured based on source + :rtype: logging.getLogger( source) + + """ + + + logger = logging.getLogger(source) + logger.setLevel(logging.DEBUG) + +# create stream handler and set level to debug + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.DEBUG) + file_handler = RotatingFileHandler( '/tmp/mediacloud_{0}.log'.format(source), maxBytes=5e6, backupCount=3) + +# create formatter + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# add formatter to stream_handler + stream_handler.setFormatter(formatter) + file_handler.setFormatter(formatter) + +# add stream_handler to logger + logger.addHandler(stream_handler) # uncomment for console output of messages + logger.addHandler(file_handler) + + return logger \ No newline at end of file From 53e12276903f798c90297f1d1f5c973fba4944f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Tue, 22 Sep 2015 02:16:49 -0300 Subject: [PATCH 3/4] crawler for ZH --- capture/crawler_zh.py | 174 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 capture/crawler_zh.py diff --git a/capture/crawler_zh.py b/capture/crawler_zh.py new file mode 100644 index 0000000..c5631ec --- /dev/null +++ b/capture/crawler_zh.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +from goose import Goose +import pymongo +from bs4 import BeautifulSoup +import requests +import datetime +import zlib +import cPickle as CP +import cld +from requests.exceptions import ConnectionError, Timeout +import bson +import settings +import logging_mc +import re + +logger = logging_mc.get_logger( 'ZH' ) + +client = pymongo.MongoClient(settings.MONGOHOST, 27017) +MCDB = client.MCDB +ARTICLES = MCDB.articles # Article Collection +ARTICLES.ensure_index("source") + +def find_articles(): + """ + Get the urls of last news + :return: last news' urls of all categories + :rtype: set() + """ + urls = ['http://zh.clicrbs.com.br/rs/noticias/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/entretenimento/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/esportes/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/porto-alegre/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/vida-e-estilo/ultimas-noticias/', + 'http://zh.clicrbs.com.br/rs/ultimas-noticias/'] + news_urls = list() + for INDEX_URL in urls: + index = requests.get(INDEX_URL).content + soup = BeautifulSoup(index, "lxml") + news_index = soup.find_all(attrs={'class': re.compile(r".*\bmateria-manchete\b.*")}) + news_urls = news_urls + ['' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] + return set(news_urls ) + +def get_published_time(soup): + """ + Get the news' published datetime + :param soup: object with news html page + :type soup: BeautifulSoup 
object + :return: news published datetime + :rtype: string + """ + try: + time_tag = soup.find('div', class_='meta__date').text + except IndexError: + logger.error('wrong time tag') + return None + if time_tag is None: + return None + else: + try: + match = re.search(r'\d{2}/\d{2}/\d{4} - \d{2}h\d{2}min', time_tag.encode('utf8')) + published_time = datetime.datetime.strptime(match.group(), '%d/%m/%Y - %Hh%Mmin') + except ValueError: + logger.error('wrong date extraction') + return None + return published_time + +def extract_title(article): + """ + Extract the news title. + """ + + try: + title = article.title + except Exception as ex: + template = "An exception of type {0} occured during extraction of news title. Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return title + +def extract_content(article): + """ + Extract relevant information about news page + """ + + try: + body_content = article.cleaned_text + except Exception as ex: + template = "An exception of type {0} occured during extraction of news content. Arguments:\n{1!r}" + message = template.format(type(ex).__name__, ex.args) + logger.exception(message) + return None + return body_content + +def detect_language(text): + """ + Detect the language of text using chromium_compact_language_detector + :param text: text to be analyzed + :return: {"name": portuguese, "pt"} + """ + name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8')) + return {"name": name, "code": code} + +def compress_content(html): + """ + Compresses and encodes html content so that it can be BSON encoded an store in mongodb + :param html: original html document + :return: compressed an b64 encoded document + """ + pickled = CP.dumps(html, CP.HIGHEST_PROTOCOL) + squished = zlib.compress(pickled) + encoded = bson.Binary(squished) # b64.urlsafe_b64encode(squished) + return encoded + +def decompress_content(compressed_html): + """ + Decompress data compressed by `compress_content` + :param compressed_html: compressed html document + :return: original html + """ + # unencoded = b64.urlsafe_b64decode(str(compressed_html)) + decompressed = zlib.decompress(compressed_html) + orig_html = CP.loads(decompressed) + return orig_html + + +def download_article(url): + """ + Download the html content of a news page + :param url: news page's url + :type url: string + :return: news page's content + :rtype: requests.models.Response + """ + article = { + 'link': url, + 'source': 'crawler_ZH', + } + logger.info("Downloading article: {0}".format(url)) + try: + response = requests.get(url, timeout=30) + except ConnectionError: + logger.error("Failed to fetch {0}".format(url)) + return + except Timeout: + logger.error("Timed out while fetching {0}".format(url)) + return + + encoding = response.encoding if response.encoding is not None else 'utf8' + dec_content = response.content.decode(encoding) + soup = BeautifulSoup(dec_content, "lxml") + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + news = extractor.extract(url=url) + + article['link_content'] = compress_content(dec_content) + article['compressed'] = True + article['language'] = detect_language(dec_content) + article['title'] = extract_title(news) + article['published'] = get_published_time(soup) + article['main_text'] = extract_content(news) + + return article + +if __name__ =='__main__': + for url in find_articles(): + logger.info("url: {0}".format(url)) + exists = list(ARTICLES.find({"link": url})) + if not 
exists: + article = download_article(url) + logger.info("Download done") + ARTICLES.insert(article, w=1) + logger.info("Saved") + else: + logger.info("It already exists") \ No newline at end of file From b644a5f618a032b16814279108b184609d316128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?= Date: Tue, 22 Sep 2015 20:14:04 -0300 Subject: [PATCH 4/4] I fixed de problems related to new linecharacter at the end of the files and to some unnecessary spaces. --- capture/crawler_valor.py | 12 ++++++------ capture/crawler_zh.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py index a587f7b..8412b8d 100755 --- a/capture/crawler_valor.py +++ b/capture/crawler_valor.py @@ -12,7 +12,7 @@ import settings import logging_mc -logger = logging_mc.get_logger( 'valor' ) +logger = logging_mc.get_logger('valor') client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB @@ -40,8 +40,8 @@ def find_articles(): index = requests.get(INDEX_URL).content soup = BeautifulSoup(index, "lxml") news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2') - news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] - return set(news_urls ) + news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup(art.encode('utf8'),"lxml").find('a').attrs['href'] for art in news_index] + return set(news_urls) def get_published_time(soup): """ @@ -60,7 +60,7 @@ def get_published_time(soup): return None else: try: - published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M') + published_time = datetime.datetime.strptime(time_tag.encode('utf8'), '%d/%m/%Y às %Hh%M') except ValueError: logger.error('wrong date extraction') return None @@ -151,7 +151,7 @@ def download_article(url): encoding = response.encoding if response.encoding is not None else 'utf8' dec_content = response.content.decode(encoding) soup = BeautifulSoup(dec_content, "lxml") - extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) + extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) news = extractor.extract(url=url) article['link_content'] = compress_content(dec_content) @@ -173,4 +173,4 @@ def download_article(url): ARTICLES.insert(article, w=1) logger.info("Saved") else: - logger.info("It already exists") \ No newline at end of file + logger.info("It already exists") diff --git a/capture/crawler_zh.py b/capture/crawler_zh.py index c5631ec..fefd971 100644 --- a/capture/crawler_zh.py +++ b/capture/crawler_zh.py @@ -13,7 +13,7 @@ import logging_mc import re -logger = logging_mc.get_logger( 'ZH' ) +logger = logging_mc.get_logger('ZH') client = pymongo.MongoClient(settings.MONGOHOST, 27017) MCDB = client.MCDB @@ -37,8 +37,8 @@ def find_articles(): index = requests.get(INDEX_URL).content soup = BeautifulSoup(index, "lxml") news_index = soup.find_all(attrs={'class': re.compile(r".*\bmateria-manchete\b.*")}) - news_urls = news_urls + ['' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index] - return set(news_urls ) + news_urls = news_urls + ['' + BeautifulSoup(art.encode('utf8'), "lxml").find('a').attrs['href'] for art in news_index] + return set(news_urls) def get_published_time(soup): """ @@ -171,4 +171,4 @@ def download_article(url): ARTICLES.insert(article, w=1) logger.info("Saved") else: - logger.info("It already exists") \ No 
newline at end of file + logger.info("It already exists")
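
A note on the duplicate check shared by both crawlers: list(ARTICLES.find({"link": url})) followed by a separate insert fetches full documents just to test existence and leaves a window where two concurrent runs can store the same link twice. If the collection can carry a unique index on "link", the existence check and the write collapse into a single upsert. The sketch below is an assumption, not part of this series: it presumes pymongo 2.x (consistent with the ensure_index/insert calls in these patches), presumes no duplicate links are already stored (otherwise the unique index build fails), and reuses find_articles/download_article from crawler_valor.py as they appear in patch 4.

    # -*- coding: utf-8 -*-
    # Sketch only: unique index on "link" and the upsert flow are assumptions,
    # not part of the current schema; download_article() returns None on fetch
    # errors, which is handled explicitly here.
    import pymongo
    import settings
    import logging_mc
    from crawler_valor import find_articles, download_article

    logger = logging_mc.get_logger('valor')

    client = pymongo.MongoClient(settings.MONGOHOST, 27017)
    ARTICLES = client.MCDB.articles
    ARTICLES.ensure_index("source")
    ARTICLES.ensure_index("link", unique=True)  # assumed: no duplicate links stored yet

    if __name__ == '__main__':
        for url in find_articles():
            if ARTICLES.find_one({"link": url}):
                logger.info("It already exists: {0}".format(url))
                continue
            article = download_article(url)
            if article is None:
                # ConnectionError/Timeout already logged inside download_article
                continue
            # Upsert keyed on the link, so a concurrent run cannot insert the
            # same URL twice; the unique index turns any remaining race into a
            # DuplicateKeyError instead of a silent duplicate document.
            ARTICLES.update({"link": url}, {"$set": article}, upsert=True)
            logger.info("Saved: {0}".format(url))

The same pattern would drop straight into crawler_zh.py, since its __main__ block is identical apart from the source name, and it also removes the need to guard against inserting None when a download fails.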