From 666fa78e2ad19c7d7cbd7aed372cbaa49d4395da Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Wed, 7 Oct 2015 13:16:15 -0400 Subject: [PATCH] fixes #80 --- README.md | 13 +++++++++++++ setup.py | 4 ++-- utils/{archive.py => twarc-archive.py} | 9 ++++----- 3 files changed, 19 insertions(+), 7 deletions(-) rename utils/{archive.py => twarc-archive.py} (93%) diff --git a/README.md b/README.md index 7f0ef66e..de0e572a 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,19 @@ fetch the full JSON for each tweet and write it to stdout as line-oriented JSON: twarc.py --hydrate ids.txt > tweets.json +## Archive + +In addition to `twarc.py`, when you install twarc you will also get a +`twarc-archive.py` command line tool. This uses twarc as a library to +periodically collect data matching a particular search query. It's useful if you +don't necessarily want to collect tweets as they happen with the streaming +API, and are content to run it periodically (perhaps every day from cron) to collect +what you can. The script will keep the files organized, and is smart enough to +use the most recent file to determine when it can stop collecting so there are +no duplicates. 
+ + twarc-archive.py + ## Use as a Library If you want you can use twarc programatically as a library to collect diff --git a/setup.py b/setup.py index a9be4e5e..7aabb086 100644 --- a/setup.py +++ b/setup.py @@ -28,12 +28,12 @@ def run(self): setup( name='twarc', - version='0.3.3', + version='0.3.4', url='http://github.com/edsu/twarc', author='Ed Summers', author_email='ehs@pobox.com', py_modules=['twarc', ], - scripts=['twarc.py'], + scripts=['twarc.py', 'utils/twarc-archive.py'], description='command line utility to archive Twitter search results as line-oriented-json', cmdclass={'test': PyTest}, install_requires=dependencies, diff --git a/utils/archive.py b/utils/twarc-archive.py similarity index 93% rename from utils/archive.py rename to utils/twarc-archive.py index b819f85f..1455cc31 100755 --- a/utils/archive.py +++ b/utils/twarc-archive.py @@ -8,7 +8,7 @@ So for example if you want to search for tweets mentioning "ferguson" you can run it: - ./archive.py ferguson /mnt/tweets/ferguson + % twarc-archive.py ferguson /mnt/tweets/ferguson The first time you run this it will search twitter for tweets matching "ferguson" and write them to a file: @@ -17,16 +17,15 @@ When you run the exact same command again: - ./archive.py ferguson /mnt/tweets/ferguson + % twarc-archive.py ferguson /mnt/tweets/ferguson it will get the first tweet id in tweets-0001.json and use it to write another file which includes any new tweets since that tweet: /mnt/tweets/ferguson/tweets-0002.json -This functionality was initially part of twarc.py itself (not in a utility). -If it proves useful perhaps it can go back in. But for now twarc.py writes -to stdout to let you manage your data the way you want to. +This functionality was initially part of twarc.py itself, but has been split out +into a separate utility. """