From c2add64b3eccc1d83d3fe131d643fddbe418b7f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Code=C3=A7o=20Coelho?= Date: Thu, 26 Jun 2014 10:19:13 -0300 Subject: [PATCH 1/4] merged master --- capture/twitter/GeoLoc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capture/twitter/GeoLoc.py b/capture/twitter/GeoLoc.py index 1fd48ec..0f6cda1 100644 --- a/capture/twitter/GeoLoc.py +++ b/capture/twitter/GeoLoc.py @@ -9,7 +9,7 @@ from dateutil import parser -FORMAT = '%(asctime)s - %(levelname)s - %(message)s' +FORMAT = "%(asctime)s - %(levelname)s - %(message)s" logging.basicConfig(filename='tweet_geoloc.log', format=FORMAT, level=logging.DEBUG) # Initialize connection From dfdbce62cfaf2e60785864568a15c320bebb3773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Code=C3=A7o=20Coelho?= Date: Thu, 26 Jun 2014 10:28:04 -0300 Subject: [PATCH 2/4] Added pypln corpus name to settings; added nlp import line to downloader. --- capture/downloader.py | 2 +- capture/nlp.py | 5 +++-- capture/settings.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/capture/downloader.py b/capture/downloader.py index 587b84a..4d7a9a9 100755 --- a/capture/downloader.py +++ b/capture/downloader.py @@ -28,8 +28,8 @@ from pymongo.errors import DuplicateKeyError import bson from dateutil.parser import parse - import settings +from nlp import send_to_pypln sys.path.append('/'.join(os.getcwd().split("/")[:-1])) diff --git a/capture/nlp.py b/capture/nlp.py index d365e8b..5e7e71d 100644 --- a/capture/nlp.py +++ b/capture/nlp.py @@ -21,11 +21,12 @@ ARTICLES.ensure_index([("pypln_url", pymongo.ASCENDING)], sparse=True) -def get_corpus(corpus_name='MC_articles'): + +def get_corpus(): """ Return the existing Mediacloud corpus or create it and return. - :rtype : Corpus object """ + corpus_name = settings.CORPUS_NAME try: article_corpus = pypln.add_corpus(name=corpus_name, description='MediaCloud Articles') except RuntimeError: diff --git a/capture/settings.py b/capture/settings.py index d5b339d..03a988b 100644 --- a/capture/settings.py +++ b/capture/settings.py @@ -18,4 +18,4 @@ ########## PYPLNHOST = "http://fgv.pypln.org/" PYPLN_CREDENTIALS = ("mediacloud2", "senha do mediacloud") - +CORPUS_NAME = "MC_articles" From dfeb6163753fef4acc52560fdf4e78633f642a48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Code=C3=A7o=20Coelho?= Date: Thu, 26 Jun 2014 10:45:30 -0300 Subject: [PATCH 3/4] Added pagination to sending to avoid cursor timeout --- capture/load_into_pypln.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/capture/load_into_pypln.py b/capture/load_into_pypln.py index 3b9a3ca..9dd980e 100644 --- a/capture/load_into_pypln.py +++ b/capture/load_into_pypln.py @@ -19,10 +19,9 @@ articles = client.MCDB.articles - -def load(corpus_name, skip, limit): - corpus = nlp.get_corpus(corpus_name) - +def load(skip, limit=0): + corpus = nlp.get_corpus() + articles_sent = 0 filter_ = {'pypln_url': {'$exists': False}} find_kwargs = {'sort': [("_id", pymongo.DESCENDING)]} @@ -30,14 +29,20 @@ def load(corpus_name, skip, limit): find_kwargs.update({'skip': skip}) if limit: find_kwargs.update({'limit': limit}) + if limit == 0: + count = articles.count() + else: + count = limit + while articles_sent < count: + cursor = articles.find(filter_, skip=articles_sent, limit=100, **find_kwargs) + for article in cursor: + pypln_document = nlp.send_to_pypln(article, corpus) + _id = article['_id'] + articles.update({'_id': _id}, + {'$set': {"pypln_url": pypln_document.url}}) + sys.stdout.write('inserted document with id {} into PyPLN\n'.format(_id)) + articles_sent += 1 - cursor = articles.find(filter_, **find_kwargs) - for article in cursor: - pypln_document = nlp.send_to_pypln(article, corpus) - _id = article['_id'] - articles.update({'_id': _id}, - {'$set': {"pypln_url": pypln_document.url}}) - sys.stdout.write('inserted document with id {} into PyPLN\n'.format(_id)) if __name__=="__main__": @@ -45,13 +50,13 @@ def load(corpus_name, skip, limit): parser = argparse.ArgumentParser(description=("Load MediaCloud documents" "into a PyPLN instance")) - parser.add_argument("-c", "--corpus_name", type=str, metavar="NAME", - default="MC_articles", - help="Uploads documents to a corpus named NAME") + # parser.add_argument("-c", "--corpus_name", type=str, metavar="NAME", + # default="MC_articles", + # help="Uploads documents to a corpus named NAME") parser.add_argument("-l", "--limit", metavar='N', type=int, default=0, help="Adds limit=N to the mongo query") parser.add_argument("-s", "--skip", metavar='N', type=int, default=0, help="Adds skip=N to the mongo query") args = parser.parse_args() - load(args.corpus_name, args.skip, args.limit) + load(args.skip, args.limit) From 335b7a162090a0449f685bc4fe98d3c81dd6833d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20Code=C3=A7o=20Coelho?= Date: Thu, 26 Jun 2014 10:59:04 -0300 Subject: [PATCH 4/4] Load into Pypln working, albeit very slowly. --- capture/load_into_pypln.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/capture/load_into_pypln.py b/capture/load_into_pypln.py index 9dd980e..49e79be 100644 --- a/capture/load_into_pypln.py +++ b/capture/load_into_pypln.py @@ -1,4 +1,4 @@ -#-*- coding:utf-8 -*- +# -*- coding:utf-8 -*- u""" Created on 03/04/14 by fccoelho @@ -40,23 +40,23 @@ def load(skip, limit=0): _id = article['_id'] articles.update({'_id': _id}, {'$set': {"pypln_url": pypln_document.url}}) - sys.stdout.write('inserted document with id {} into PyPLN\n'.format(_id)) + sys.stdout.write('inserted document {} of {}, with id {} into PyPLN\n'.format(articles_sent, count, _id)) articles_sent += 1 - -if __name__=="__main__": +if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser(description=("Load MediaCloud documents" - "into a PyPLN instance")) + "into a PyPLN instance")) # parser.add_argument("-c", "--corpus_name", type=str, metavar="NAME", # default="MC_articles", # help="Uploads documents to a corpus named NAME") parser.add_argument("-l", "--limit", metavar='N', type=int, default=0, - help="Adds limit=N to the mongo query") + help="Adds limit=N to the mongo query") parser.add_argument("-s", "--skip", metavar='N', type=int, default=0, - help="Adds skip=N to the mongo query") + help="Adds skip=N to the mongo query") args = parser.parse_args() load(args.skip, args.limit)