Add optional Google Sheets support.
- Add ability to write to Google Sheets and local CSV.
- Google Sheets writing respects Google API write limits for unpaid accounts.
- Update README.
- Version bump.
ruebot committed Nov 11, 2024
1 parent 04f09ee commit a3e9136
Showing 5 changed files with 170 additions and 60 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ __pycache__
 
 .env
 *.csv
+*.json
10 changes: 10 additions & 0 deletions README.md
@@ -9,16 +9,26 @@ A Python utility to capture a thread and comments for a given Reddit thread.
 
 ## Usage
 
+Local CSV
 ```
 crevettes thread_id
 ```
 
+Google Sheets and Local CSV.
+```
+crevettes thread_id google_sheets_folder_id path/to/json/keyfile
+```
+
 ## Example
 
 ```
 crevettes 1bq51lp
 ```
 
+```
+crevettes 1gnex8a 1Nr1xMEN1WTxrwNfyFz4fg1fTyB3oCAcg
+```
 
 You will need a `.env` file populated with the following:
 
 ```
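Per `main.py` in this commit, the Sheets writer is only initialized when a keyfile path is passed as the third argument, so the second example above writes CSV only. A full Google Sheets run would look like the following sketch, where the keyfile path is an illustrative placeholder:

```
crevettes 1gnex8a 1Nr1xMEN1WTxrwNfyFz4fg1fTyB3oCAcg path/to/keyfile.json
```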
7 changes: 5 additions & 2 deletions pyproject.toml
@@ -4,14 +4,17 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "crevettes"
-version = "0.0.4"
+version = "0.0.5"
 readme = "README.md"
 requires-python = ">=3.9.9"
 description = "A Python utility to capture a thread and comments for a given Reddit thread."
 authors = [
   { name="Nick Ruest", email="[email protected]" }
 ]
-dependencies = ["praw", "python-dotenv"]
+dependencies = [
+    "praw>=7.8.1",
+    "python-dotenv"
+]
 
 [tool.setuptools]
 packages = ["crevettes"]
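Note that the new Sheets code path in `harvester.py` imports `gspread` and `oauth2client`, which this dependency list does not declare; until it does, they would need installing by hand (assuming the standard PyPI package names):

```
pip install gspread oauth2client
```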
189 changes: 136 additions & 53 deletions src/crevettes/harvester.py
@@ -1,10 +1,13 @@
 import csv
 import os
 import re
+import time
 from datetime import datetime
 
+import gspread
 import praw
 from dotenv import load_dotenv
+from oauth2client.service_account import ServiceAccountCredentials
 
 # Load Reddit API environment variables.
 load_dotenv()
@@ -17,86 +20,166 @@
 )
 
 
+# Google Sheets setup.
+def setup_google_sheets(sheet_name, folder_id, keyfile_path):
+    scope = [
+        "https://www.googleapis.com/auth/spreadsheets",
+        "https://www.googleapis.com/auth/drive",
+    ]
+    creds = ServiceAccountCredentials.from_json_keyfile_name(keyfile_path, scope)
+    client = gspread.authorize(creds)
+
+    # Create a new Google Sheet in the given folder.
+    folder = client.create(sheet_name, folder_id=folder_id)
+    sheet = folder.get_worksheet(0)
+    return sheet
+
+
 # Fetch a given Reddit thread by thread id.
-def fetch_reddit_thread(thread_id):
+def fetch_reddit_thread(thread_id, comment_limit=None):
     submission = reddit.submission(id=thread_id)
     submission.comments.replace_more(limit=None)
-    comments = submission.comments.list()
 
-    return submission, comments
+    all_comments = []
+
+    for comment in submission.comments.list():
+        all_comments.append(comment)
+        # Only break if a comment_limit is set and the limit is reached.
+        if comment_limit and len(all_comments) >= comment_limit:
+            break
+        # Delay between API calls to prevent hitting rate limits.
+        time.sleep(0.5)
+
+    return submission, all_comments
 
 
 # Clean titles for csv filenames.
 def clean_thread_title(title):
     title = title.lower()
     title = re.sub(r"[^\w\s-]", "", title)
     title = re.sub(r"\s+", "-", title)
 
     return title
 
 
-# Write thread metadata to csv.
-def write_to_csv(submission, comments, thread_id):
+def resize_sheet_if_needed(sheet, total_rows):
+    """Resize the Google Sheet if the required number of rows exceeds the current limit."""
+    current_row_count = sheet.row_count
+    if total_rows > current_row_count:
+        new_row_count = max(total_rows, current_row_count * 2)
+        sheet.resize(rows=new_row_count)
+        print(f"Resized sheet to {new_row_count} rows")
+
+
+def find_last_row(sheet):
+    """Find the last non-empty row in the sheet."""
+    str_values = sheet.col_values(1)
+    return len(str_values) if str_values else 0
+
+
+def write_to_csv_and_sheets(submission, comments, thread_id, sheet=None):
     clean_title = clean_thread_title(submission.title)
     csv_filename = f"reddit-{thread_id}--{clean_title}.csv"
 
+    # Define the header row.
+    header_row = [
+        "timestamp",
+        "thread_id",
+        "thread_title",
+        "thread_submitter",
+        "thread_body",
+        "thread_timestamp",
+        "thread_vote_count",
+        "comment_id",
+        "comment_username",
+        "comment_body",
+        "comment_reply_to_id",
+        "comment_vote_count",
+    ]
+
     # Open the CSV file and write the headers.
     with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
         writer = csv.writer(file)
-        writer.writerow(
-            [
-                "timestamp",
-                "thread_id",
-                "thread_title",
-                "thread_submitter",
-                "thread_body",
-                "thread_timestamp",
-                "thread_vote_count",
-                "comment_id",
-                "comment_username",
-                "comment_body",
-                "comment_reply_to_id",
-                "comment_vote_count"
-            ]
-        )
+        writer.writerow(header_row)
+
+        # Prepare data for the batch update, starting with the header row.
+        batch_data = [header_row]
 
-        writer.writerow(
-            [
-                datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
-                submission.id,
-                submission.title,
-                (submission.author.name if submission.author else "Deleted"),
-                submission.selftext,
-                datetime.fromtimestamp(submission.created_utc).strftime(
-                    "%Y-%m-%d %H:%M:%S"
-                ),
-                submission.score,
-                "",
-                "",
-                "",
-                "",
-                "",
-            ]
-        )
+        # Write the submission (thread) metadata to CSV and batch data.
+        thread_row = [
+            datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
+            submission.id,
+            submission.title,
+            (submission.author.name if submission.author else "Deleted"),
+            submission.selftext,
+            datetime.fromtimestamp(submission.created_utc).strftime(
+                "%Y-%m-%d %H:%M:%S"
+            ),
+            submission.score,
+            "",
+            "",
+            "",
+            "",
+            "",
+        ]
+        writer.writerow(thread_row)
+        batch_data.append(thread_row)
 
-        for comment in comments:
-            writer.writerow(
-                [
-                    datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
-                    submission.id,
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
-                    comment.id,
-                    (comment.author.name if comment.author else "Deleted"),
-                    comment.body,
-                    (
-                        comment.parent_id.split("_")[1]
-                        if comment.parent_id != submission.id
-                        else ""
-                    ),
-                    comment.score
-                ]
-            )
+        # Write the comments data to CSV and batch data.
+        for comment in comments:
+            comment_row = [
+                datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
+                submission.id,
+                "",
+                "",
+                "",
+                "",
+                "",
+                comment.id,
+                (comment.author.name if comment.author else "Deleted"),
+                comment.body,
+                (
+                    comment.parent_id.split("_")[1]
+                    if comment.parent_id != submission.id
+                    else ""
+                ),
+                comment.score,
+            ]
+            writer.writerow(comment_row)
+            batch_data.append(comment_row)
+
+    # Write to Google Sheets if the sheet is not None.
+    if sheet:
+        try:
+            # Reduce chunk size to prevent hitting limits.
+            CHUNK_SIZE = 50
+            for i in range(0, len(batch_data), CHUNK_SIZE):
+                chunk = batch_data[i : i + CHUNK_SIZE]
+
+                last_row = find_last_row(sheet)
+                required_rows = last_row + len(chunk)
+                resize_sheet_if_needed(sheet, required_rows)
+
+                requests = [
+                    {
+                        "range": f"A{last_row + 1}:L{last_row + len(chunk)}",
+                        "values": chunk,
+                    }
+                ]
+                body = {"valueInputOption": "USER_ENTERED", "data": requests}
+
+                sheet.spreadsheet.values_batch_update(body)
+                time.sleep(1)
+
+        except Exception as e:
+            print(f"Error during batch update: {e}")
 
     print(f"CSV saved as {csv_filename}")
+
+
+if __name__ == "__main__":
+    sheet_name = "Reddit Thread Data"
+    folder_id = "YOUR_FOLDER_ID"
+    keyfile_path = "YOUR_KEYFILE_PATH"
+    sheet = setup_google_sheets(sheet_name, folder_id, keyfile_path)
+    thread_id = "YOUR_THREAD_ID"
+    submission, comments = fetch_reddit_thread(thread_id, comment_limit=None)
+
+    write_to_csv_and_sheets(submission, comments, thread_id, sheet)
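For reference, a minimal sketch of driving the new functions directly from Python, assuming `gspread` and `oauth2client` are installed and that the placeholder folder ID and keyfile path below are replaced with real values:

```python
from crevettes import harvester

# Placeholder values; substitute a real Google Drive folder ID and
# service-account JSON keyfile path. The thread ID is the README example.
thread_id = "1gnex8a"
folder_id = "YOUR_FOLDER_ID"
keyfile_path = "service-account.json"

# Fetch the thread, capping the harvest at 100 comments.
submission, comments = harvester.fetch_reddit_thread(thread_id, comment_limit=100)

# Create the destination sheet, then write to both CSV and Sheets;
# passing sheet=None instead skips the Sheets write and produces CSV only.
sheet = harvester.setup_google_sheets(f"reddit-{thread_id}", folder_id, keyfile_path)
harvester.write_to_csv_and_sheets(submission, comments, thread_id, sheet)
```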
23 changes: 18 additions & 5 deletions src/crevettes/main.py
@@ -4,18 +4,31 @@
 
 
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: crevettes thread_id")
+    if len(sys.argv) < 3 or len(sys.argv) > 4:
+        print("Usage: crevettes thread_id folder_id [keyfile_path]")
         sys.exit(1)
 
-    # Get thread ID from command-line argument.
+    # Get thread ID and folder ID from command-line arguments.
     thread_id = sys.argv[1]
+    folder_id = sys.argv[2]
+
+    # Check if the optional JSON key file path is provided.
+    keyfile_path = sys.argv[3] if len(sys.argv) == 4 else None
 
     # Fetch thread and comments.
    submission, comments = harvester.fetch_reddit_thread(thread_id)
 
-    # Write thread metadata to csv.
-    harvester.write_to_csv(submission, comments, thread_id)
+    # Generate a clean title for both CSV and Google Sheets.
+    clean_title = harvester.clean_thread_title(submission.title)
+    gsheets_title = f"reddit-{thread_id}--{clean_title}"
+
+    # Initialize Google Sheet only if JSON key file path is provided.
+    sheet = None
+    if keyfile_path:
+        sheet = harvester.setup_google_sheets(gsheets_title, folder_id, keyfile_path)
+
+    # Write thread metadata to both CSV and Google Sheets if the sheet is set.
+    harvester.write_to_csv_and_sheets(submission, comments, thread_id, sheet)
 
 
 if __name__ == "__main__":
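One behavioral detail of the new argument check: `folder_id` is positionally required even for CSV-only runs (it is simply unused when no keyfile is given, since `sheet` stays `None`), so the bare `crevettes thread_id` form no longer passes this check. An illustrative CSV-only invocation, with a placeholder folder ID:

```
crevettes 1bq51lp some_folder_id
```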
