Add optional Google Sheets support.
- Add ability to write to Google Sheets and local CSV.
- Google Sheets writing respects Google API write limits for unpaid accounts.
- Update README.
- Version bump.
ruebot committed Nov 11, 2024
1 parent 04f09ee commit a3e9136
Showing 5 changed files with 170 additions and 60 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ __pycache__
 
 .env
 *.csv
+*.json
10 changes: 10 additions & 0 deletions README.md
@@ -9,16 +9,26 @@ A Python utility to capture a thread and comments for a given Reddit thread.
 
 ## Usage
 
+Local CSV
 ```
 crevettes thread_id
 ```
 
+Google Sheets and Local CSV.
+```
+crevettes thread_id google_sheets_folder_id path/to/json/keyfile
+```
+
 ## Example
 
 ```
 crevettes 1bq51lp
 ```
 
+```
+crevettes 1gnex8a 1Nr1xMEN1WTxrwNfyFz4fg1fTyB3oCAcg
+```
 
 You will need a `.env` file populated with the following:
 
 ```
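Per `main.py` in this commit, the Sheets writer is only initialized when a keyfile path is passed as the third argument, so the second example above writes CSV only. A full Google Sheets run would look like the following sketch, where the keyfile path is an illustrative placeholder:

```
crevettes 1gnex8a 1Nr1xMEN1WTxrwNfyFz4fg1fTyB3oCAcg path/to/keyfile.json
```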
7 changes: 5 additions & 2 deletions pyproject.toml
@@ -4,14 +4,17 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "crevettes"
-version = "0.0.4"
+version = "0.0.5"
 readme = "README.md"
 requires-python = ">=3.9.9"
 description = "A Python utility to capture a thread and comments for a given Reddit thread."
 authors = [
   { name="Nick Ruest", email="[email protected]" }
 ]
-dependencies = ["praw", "python-dotenv"]
+dependencies = [
+    "praw>=7.8.1",
+    "python-dotenv"
+]
 
 [tool.setuptools]
 packages = ["crevettes"]
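Note that the new Sheets code path in `harvester.py` imports `gspread` and `oauth2client`, which this dependency list does not declare; until it does, they would need installing by hand (assuming the standard PyPI package names):

```
pip install gspread oauth2client
```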
189 changes: 136 additions & 53 deletions src/crevettes/harvester.py
@@ -1,10 +1,13 @@
 import csv
 import os
 import re
+import time
 from datetime import datetime
 
+import gspread
 import praw
 from dotenv import load_dotenv
+from oauth2client.service_account import ServiceAccountCredentials
 
 # Load Reddit API environment variables.
 load_dotenv()
@@ -17,86 +20,166 @@
 )
 
 
+# Google Sheets setup.
+def setup_google_sheets(sheet_name, folder_id, keyfile_path):
+    scope = [
+        "https://www.googleapis.com/auth/spreadsheets",
+        "https://www.googleapis.com/auth/drive",
+    ]
+    creds = ServiceAccountCredentials.from_json_keyfile_name(keyfile_path, scope)
+    client = gspread.authorize(creds)
+
+    # Create a new Google Sheet in the given folder.
+    folder = client.create(sheet_name, folder_id=folder_id)
+    sheet = folder.get_worksheet(0)
+    return sheet
+
+
 # Fetch a given Reddit thread by thread id.
-def fetch_reddit_thread(thread_id):
+def fetch_reddit_thread(thread_id, comment_limit=None):
     submission = reddit.submission(id=thread_id)
     submission.comments.replace_more(limit=None)
-    comments = submission.comments.list()
 
-    return submission, comments
+    all_comments = []
+
+    for comment in submission.comments.list():
+        all_comments.append(comment)
+        # Only break if a comment_limit is set and the limit is reached.
+        if comment_limit and len(all_comments) >= comment_limit:
+            break
+        # Delay between API calls to prevent hitting rate limits.
+        time.sleep(0.5)
+
+    return submission, all_comments
 
 
 # Clean titles for csv filenames.
 def clean_thread_title(title):
     title = title.lower()
     title = re.sub(r"[^\w\s-]", "", title)
     title = re.sub(r"\s+", "-", title)
 
     return title
 
 
-# Write thread metadata to csv.
-def write_to_csv(submission, comments, thread_id):
+def resize_sheet_if_needed(sheet, total_rows):
+    """Resize the Google Sheet if the required number of rows exceeds the current limit."""
+    current_row_count = sheet.row_count
+    if total_rows > current_row_count:
+        new_row_count = max(total_rows, current_row_count * 2)
+        sheet.resize(rows=new_row_count)
+        print(f"Resized sheet to {new_row_count} rows")
+
+
+def find_last_row(sheet):
+    """Find the last non-empty row in the sheet."""
+    str_values = sheet.col_values(1)
+    return len(str_values) if str_values else 0
+
+
+def write_to_csv_and_sheets(submission, comments, thread_id, sheet=None):
     clean_title = clean_thread_title(submission.title)
     csv_filename = f"reddit-{thread_id}--{clean_title}.csv"
 
+    # Define the header row.
+    header_row = [
+        "timestamp",
+        "thread_id",
+        "thread_title",
+        "thread_submitter",
+        "thread_body",
+        "thread_timestamp",
+        "thread_vote_count",
+        "comment_id",
+        "comment_username",
+        "comment_body",
+        "comment_reply_to_id",
+        "comment_vote_count",
+    ]
+
     # Open the CSV file and write the headers.
     with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
         writer = csv.writer(file)
-        writer.writerow(
-            [
-                "timestamp",
-                "thread_id",
-                "thread_title",
-                "thread_submitter",
-                "thread_body",
-                "thread_timestamp",
-                "thread_vote_count",
-                "comment_id",
-                "comment_username",
-                "comment_body",
-                "comment_reply_to_id",
-                "comment_vote_count"
-            ]
-        )
+        writer.writerow(header_row)
+
+        # Prepare data for the batch update, starting with the header row.
+        batch_data = [header_row]
 
-        writer.writerow(
-            [
-                datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
-                submission.id,
-                submission.title,
-                (submission.author.name if submission.author else "Deleted"),
-                submission.selftext,
-                datetime.fromtimestamp(submission.created_utc).strftime(
-                    "%Y-%m-%d %H:%M:%S"
-                ),
-                submission.score,
-                "",
-                "",
-                "",
-                "",
-                "",
-            ]
-        )
+        # Write the submission (thread) metadata to CSV and batch data.
+        thread_row = [
+            datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
+            submission.id,
+            submission.title,
+            (submission.author.name if submission.author else "Deleted"),
+            submission.selftext,
+            datetime.fromtimestamp(submission.created_utc).strftime(
+                "%Y-%m-%d %H:%M:%S"
+            ),
+            submission.score,
+            "",
+            "",
+            "",
+            "",
+            "",
+        ]
+        writer.writerow(thread_row)
+        batch_data.append(thread_row)
 
-        for comment in comments:
-            writer.writerow(
-                [
-                    datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
-                    submission.id,
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
-                    comment.id,
-                    (comment.author.name if comment.author else "Deleted"),
-                    comment.body,
-                    (
-                        comment.parent_id.split("_")[1]
-                        if comment.parent_id != submission.id
-                        else ""
-                    ),
-                    comment.score
-                ]
-            )
+        # Write the comments data to CSV and batch data.
+        for comment in comments:
+            comment_row = [
+                datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
+                submission.id,
+                "",
+                "",
+                "",
+                "",
+                "",
+                comment.id,
+                (comment.author.name if comment.author else "Deleted"),
+                comment.body,
+                (
+                    comment.parent_id.split("_")[1]
+                    if comment.parent_id != submission.id
+                    else ""
+                ),
+                comment.score,
+            ]
+            writer.writerow(comment_row)
+            batch_data.append(comment_row)
+
+    # Write to Google Sheets if the sheet is not None.
+    if sheet:
+        try:
+            # Reduce chunk size to prevent hitting limits.
+            CHUNK_SIZE = 50
+            for i in range(0, len(batch_data), CHUNK_SIZE):
+                chunk = batch_data[i : i + CHUNK_SIZE]
+
+                last_row = find_last_row(sheet)
+                required_rows = last_row + len(chunk)
+                resize_sheet_if_needed(sheet, required_rows)
+
+                requests = [
+                    {
+                        "range": f"A{last_row + 1}:L{last_row + len(chunk)}",
+                        "values": chunk,
+                    }
+                ]
+                body = {"valueInputOption": "USER_ENTERED", "data": requests}
+
+                sheet.spreadsheet.values_batch_update(body)
+                time.sleep(1)
+
+        except Exception as e:
+            print(f"Error during batch update: {e}")
 
     print(f"CSV saved as {csv_filename}")
+
+
+if __name__ == "__main__":
+    sheet_name = "Reddit Thread Data"
+    folder_id = "YOUR_FOLDER_ID"
+    keyfile_path = "YOUR_KEYFILE_PATH"
+    sheet = setup_google_sheets(sheet_name, folder_id, keyfile_path)
+    thread_id = "YOUR_THREAD_ID"
+    submission, comments = fetch_reddit_thread(thread_id, comment_limit=None)
+
+    write_to_csv_and_sheets(submission, comments, thread_id, sheet)
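For reference, a minimal sketch of driving the new functions directly from Python, assuming `gspread` and `oauth2client` are installed and that the placeholder folder ID and keyfile path below are replaced with real values:

```python
from crevettes import harvester

# Placeholder values; substitute a real Google Drive folder ID and
# service-account JSON keyfile path. The thread ID is the README example.
thread_id = "1gnex8a"
folder_id = "YOUR_FOLDER_ID"
keyfile_path = "service-account.json"

# Fetch the thread, capping the harvest at 100 comments.
submission, comments = harvester.fetch_reddit_thread(thread_id, comment_limit=100)

# Create the destination sheet, then write to both CSV and Sheets;
# passing sheet=None instead skips the Sheets write and produces CSV only.
sheet = harvester.setup_google_sheets(f"reddit-{thread_id}", folder_id, keyfile_path)
harvester.write_to_csv_and_sheets(submission, comments, thread_id, sheet)
```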
23 changes: 18 additions & 5 deletions src/crevettes/main.py
@@ -4,18 +4,31 @@
 
 
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: crevettes thread_id")
+    if len(sys.argv) < 3 or len(sys.argv) > 4:
+        print("Usage: crevettes thread_id folder_id [keyfile_path]")
         sys.exit(1)
 
-    # Get thread ID from command-line argument.
+    # Get thread ID and folder ID from command-line arguments.
     thread_id = sys.argv[1]
+    folder_id = sys.argv[2]
+
+    # Check if the optional JSON key file path is provided.
+    keyfile_path = sys.argv[3] if len(sys.argv) == 4 else None
 
     # Fetch thread and comments.
    submission, comments = harvester.fetch_reddit_thread(thread_id)
 
-    # Write thread metadata to csv.
-    harvester.write_to_csv(submission, comments, thread_id)
+    # Generate a clean title for both CSV and Google Sheets.
+    clean_title = harvester.clean_thread_title(submission.title)
+    gsheets_title = f"reddit-{thread_id}--{clean_title}"
+
+    # Initialize Google Sheet only if JSON key file path is provided.
+    sheet = None
+    if keyfile_path:
+        sheet = harvester.setup_google_sheets(gsheets_title, folder_id, keyfile_path)
+
+    # Write thread metadata to both CSV and Google Sheets if the sheet is set.
+    harvester.write_to_csv_and_sheets(submission, comments, thread_id, sheet)
 
 
 if __name__ == "__main__":
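One behavioral detail of the new argument check: `folder_id` is positionally required even for CSV-only runs (it is simply unused when no keyfile is given, since `sheet` stays `None`), so the bare `crevettes thread_id` form no longer passes this check. An illustrative CSV-only invocation, with a placeholder folder ID:

```
crevettes 1bq51lp some_folder_id
```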
