Commit 3158e94

Reddit Stash code initial

rhnfzl committed Aug 25, 2024
1 parent d6a67a7 commit 3158e94
Showing 5 changed files with 358 additions and 2 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/reddit_scraper.yml
@@ -0,0 +1,45 @@
name: Reddit Stash Scraper

on:
  schedule:
    - cron: "0 23 * * *"  # Adjust according to your desired schedule (e.g., 23:00 UTC for midnight CET)
  workflow_dispatch:  # Allows manual triggering of the workflow

jobs:
  run-script:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download existing Reddit data from Dropbox
        env:
          DROPBOX_TOKEN: ${{ secrets.DROPBOX_TOKEN }}
        run: |
          python dropbox_utils.py --download  # The --download flag tells the script which operation to run

      - name: Run Reddit Stash Script
        env:
          REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
          REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
          REDDIT_USERNAME: ${{ secrets.REDDIT_USERNAME }}
          REDDIT_PASSWORD: ${{ secrets.REDDIT_PASSWORD }}
        run: |
          python reddit_stash.py  # Run the Reddit processing script

      - name: Upload updated Reddit data to Dropbox
        env:
          DROPBOX_TOKEN: ${{ secrets.DROPBOX_TOKEN }}
        run: |
          python dropbox_utils.py --upload  # The --upload flag selects the upload operation
66 changes: 64 additions & 2 deletions README.md
@@ -1,2 +1,64 @@
# Reddit Stash

Reddit Stash is a Python script that automatically saves your Reddit saved posts and comments to Dropbox. It uses GitHub Actions to run the script on a daily schedule.

## Features
- Automatically retrieves saved posts and comments from Reddit.
- Saves the content as markdown files.
- Uploads the files to Dropbox for secure storage.
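
The Dropbox folder ends up mirroring the local `reddit/` directory: one markdown file per saved item, grouped by subreddit. The IDs below are illustrative:
```
reddit/
├── AskReddit/
│   └── 1f0abcd.md
└── learnpython/
    └── kxyz123.md
```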

## Setup

### Prerequisites
- Python 3.10
- A Dropbox account with an API token.
- Reddit API credentials.

### Installation

1. Clone this repository:
```
git clone https://github.com/rhnfzl/reddit-stash.git
cd reddit-stash
```

2. Install the required Python packages:
```
pip install -r requirements.txt
```
3. Set the following environment variables in your OS before running the script. First create a Reddit app at https://old.reddit.com/prefs/apps/, then use its client ID and client secret together with your Reddit username and password.
macOS and Linux:
```
export REDDIT_CLIENT_ID='your_client_id'
export REDDIT_CLIENT_SECRET='your_client_secret'
export REDDIT_USERNAME='your_username'
export REDDIT_PASSWORD='your_password'
```
Windows (Command Prompt; do not quote the values, or the quotes become part of the variable):
```
set REDDIT_CLIENT_ID=your_client_id
set REDDIT_CLIENT_SECRET=your_client_secret
set REDDIT_USERNAME=your_username
set REDDIT_PASSWORD=your_password
```
You can verify that the variables are set (macOS/Linux; on Windows use `echo %REDDIT_CLIENT_ID%` and so on):
```
echo $REDDIT_CLIENT_ID
echo $REDDIT_CLIENT_SECRET
echo $REDDIT_USERNAME
echo $REDDIT_PASSWORD
```
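As an additional sanity check, you can mirror the startup check in `reddit_stash.py` with a short Python snippet (a minimal sketch; run it in the same shell session where you exported the variables):
```
import os

required = ['REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET',
            'REDDIT_USERNAME', 'REDDIT_PASSWORD']
missing = [name for name in required if not os.getenv(name)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
print("All Reddit credentials are set.")
```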
4. Run the script manually:
```
python reddit_stash.py
```
Alternatively, rely on the automation: a GitHub Actions workflow, defined in `.github/workflows/reddit_scraper.yml`, runs the script daily at 23:00 UTC (midnight CET) and uploads the files to Dropbox. For the scheduled run, add the same four Reddit values plus `DROPBOX_TOKEN` as repository secrets in GitHub. Either way, each saved item is written to `reddit/<subreddit>/<item_id>.md`.
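
The workflow also syncs with Dropbox through `dropbox_utils.py`. With `DROPBOX_TOKEN` set in your environment, you can run the same two steps locally:
```
export DROPBOX_TOKEN='your_dropbox_token'
python dropbox_utils.py --download   # pull previously saved files before a run
python dropbox_utils.py --upload     # push newly saved files after a run
```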
### Contributing
Feel free to open issues or submit pull requests if you have any improvements or bug fixes.
89 changes: 89 additions & 0 deletions dropbox_utils.py
@@ -0,0 +1,89 @@
import os
import re
import sys
import dropbox

def sanitize_filename(filename):
    """Sanitize the filename to be Dropbox-compatible."""
    # Replace forbidden characters, including control characters
    sanitized_name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', filename)
    sanitized_name = sanitized_name.strip()  # Remove leading and trailing spaces
    reserved_names = {"CON", "PRN", "AUX", "NUL", "COM1", "LPT1", "COM2", "LPT2", "COM3", "LPT3",
                      "COM4", "LPT4", "COM5", "LPT5", "COM6", "LPT6", "COM7", "LPT7", "COM8", "LPT8",
                      "COM9", "LPT9"}
    if sanitized_name.upper() in reserved_names:
        sanitized_name = "_" + sanitized_name  # Prefix with underscore to avoid reserved names

    return sanitized_name
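
# Examples (hypothetical inputs):
#   sanitize_filename('what?is:this.md') -> 'what_is_this.md'
#   sanitize_filename('CON')             -> '_CON' (reserved device name on Windows)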

def list_dropbox_files(dbx, dropbox_folder):
    """List all files in the specified Dropbox folder."""
    file_names = set()
    try:
        result = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            for entry in result.entries:
                if isinstance(entry, dropbox.files.FileMetadata):
                    file_names.add(entry.path_lower)
            if not result.has_more:
                break
            # Results are paginated; fetch the next page using the cursor
            result = dbx.files_list_folder_continue(result.cursor)
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to list files in Dropbox folder {dropbox_folder}: {err}")
    return file_names

def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
    """Upload all files in the specified local directory to Dropbox without overwriting."""
    dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))

    # List all files currently in the Dropbox folder
    existing_files = list_dropbox_files(dbx, dropbox_folder)

    for root, dirs, files in os.walk(local_directory):
        for file_name in files:
            # Skip .DS_Store and other hidden files
            if file_name.startswith('.'):
                continue

            file_path = os.path.join(root, file_name)

            # Build the Dropbox path from the sanitized basename so that
            # sanitizing only renames the file, never a directory component
            sanitized_name = sanitize_filename(file_name)
            rel_dir = os.path.relpath(root, local_directory)
            rel_path = sanitized_name if rel_dir == '.' else os.path.join(rel_dir, sanitized_name)
            dropbox_path = f"{dropbox_folder}/{rel_path.replace(os.path.sep, '/')}"

            if dropbox_path.lower() in existing_files:
                continue  # Already present in Dropbox; skip to avoid overwriting

            try:
                with open(file_path, "rb") as f:
                    dbx.files_upload(f.read(), dropbox_path)
            except dropbox.exceptions.ApiError as e:
                print(f"Failed to upload {file_path} to Dropbox: {e}")

def download_directory_from_dropbox(dbx, dropbox_folder, local_directory):
    """Download all files in the specified Dropbox folder to the local directory."""
    try:
        result = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            for entry in result.entries:
                if isinstance(entry, dropbox.files.FileMetadata):
                    local_path = os.path.join(local_directory, entry.path_lower[len(dropbox_folder):].lstrip('/'))
                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
                    with open(local_path, "wb") as f:
                        metadata, res = dbx.files_download(entry.path_lower)
                        f.write(res.content)
            if not result.has_more:
                break
            result = dbx.files_list_folder_continue(result.cursor)
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}")

if __name__ == "__main__":
    token = os.getenv('DROPBOX_TOKEN')
    if not token:
        sys.exit("The DROPBOX_TOKEN environment variable is not set.")

    dbx = dropbox.Dropbox(token)
    local_dir = 'reddit/'  # Local directory for Reddit data
    dropbox_folder = "/reddit"  # Dropbox folder where Reddit data is stored

    if '--download' in sys.argv:
        download_directory_from_dropbox(dbx, dropbox_folder, local_dir)
    elif '--upload' in sys.argv:
        upload_directory_to_dropbox(local_dir, dropbox_folder)
    else:
        sys.exit("Usage: python dropbox_utils.py [--download | --upload]")
157 changes: 157 additions & 0 deletions reddit_stash.py
@@ -0,0 +1,157 @@
import os
import sys
import time
from datetime import datetime

import praw
import prawcore
from praw.models import Submission, Comment
from tqdm import tqdm

# Reddit API configuration
client_id = os.getenv('REDDIT_CLIENT_ID')
client_secret = os.getenv('REDDIT_CLIENT_SECRET')
username = os.getenv('REDDIT_USERNAME')
password = os.getenv('REDDIT_PASSWORD')

if not all([client_id, client_secret, username, password]):
    raise Exception("One or more environment variables for Reddit API are missing.")

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    username=username,
    password=password,
    user_agent='Reddit Saved Saver by /u/complexrexton'
)

print("Fetching...")

try:
    saved = reddit.user.me().saved(limit=1000)
except Exception:
    sys.exit("Failed to fetch your saved posts; check that your Reddit API environment variables are set correctly.")

top_dir = 'reddit/'

if not os.path.exists(top_dir):
    os.mkdir(top_dir)

def process_comments(comments, f, depth=0):
    """Process all comments and visualize depth using indentation."""
    for i, comment in enumerate(comments):
        if isinstance(comment, Comment):
            # Write the comment with indentation based on depth
            indent = ' ' * depth
            f.write(f'{indent}### Comment {i+1} by /u/{comment.author.name if comment.author else "[deleted]"}\n')
            f.write(f'{indent}- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n')
            f.write(f'{indent}{comment.body}\n\n')

            # Process replies recursively without a depth limit
            if comment.replies:
                process_comments(comment.replies, f, depth + 1)

            f.write(f'{indent}---\n\n')

def dynamic_sleep(processed_count, content_length):
    """
    Dynamically adjust sleep time based on the number of submissions processed
    and the cumulative content length processed.

    Parameters:
        processed_count (int): The number of submissions processed so far.
        content_length (int): The estimated cumulative length of the content processed.

    Returns:
        float: The number of seconds to sleep.
    """
    base_sleep_time = 1  # Base time to start with

    # Adjust sleep based on the number of submissions processed
    if processed_count > 100:
        sleep_time = base_sleep_time * 2
    elif processed_count > 50:
        sleep_time = base_sleep_time * 1.5
    else:
        sleep_time = base_sleep_time

    # Further adjust sleep based on the content length
    if content_length > 10000:  # Large content length threshold
        sleep_time *= 2
    elif content_length > 5000:  # Moderate content length threshold
        sleep_time *= 1.5

    return sleep_time
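
# Examples (hypothetical inputs):
#   dynamic_sleep(120, 12000) -> 4.0 seconds (both thresholds exceeded: 1 * 2 * 2)
#   dynamic_sleep(10, 1000)   -> 1 second (base rate)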

def lazy_load_comments(submission):
    """Lazily load comments instead of replacing all at once."""
    try:
        for comment in submission.comments.list():
            yield comment
    except prawcore.exceptions.TooManyRequests:
        print("Rate limit exceeded. Sleeping for 120 seconds...")
        time.sleep(120)  # Sleep for 120 seconds before retrying
        yield from lazy_load_comments(submission)  # Retry the request
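
# Note: on a retry the generator restarts from the first comment, so comments
# yielded before the rate limit was hit may be emitted twice in the output.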

def save_comment_and_context(comment, f):
    """Save a comment and its context."""
    f.write('---\n')
    f.write(f'Comment by /u/{comment.author.name if comment.author else "[deleted]"}\n')
    f.write(f'- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n')
    f.write(f'{comment.body}\n\n')
    f.write('---\n\n')

    # Fetch and save the parent post or comment for context
    parent = comment.parent()
    if isinstance(parent, Submission):
        f.write(f'## Context: Post by /u/{parent.author.name if parent.author else "[deleted]"}\n')
        f.write(f'- **Title:** {parent.title}\n')
        f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n')
        if parent.is_self:
            f.write(f'{parent.selftext}\n\n')
        else:
            f.write(f'[Link to post content]({parent.url})\n\n')
    elif isinstance(parent, Comment):
        f.write(f'## Context: Parent Comment by /u/{parent.author.name if parent.author else "[deleted]"}\n')
        f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n')
        f.write(f'{parent.body}\n\n')

processed_count = 0  # Counter to keep track of processed submissions

for saved_item in tqdm(saved, desc="Processing Saved Items"):
    # Determine the save location based on subreddit
    sub_dir = top_dir + saved_item.subreddit.display_name + '/'
    if not os.path.exists(sub_dir):
        os.mkdir(sub_dir)

    file_path = sub_dir + saved_item.id + '.md'

    # Check if the file already exists to avoid overwriting
    if os.path.exists(file_path):
        print(f"File {file_path} already exists. Skipping to prevent overwriting.")
        continue

    with open(file_path, 'w', encoding="utf-8") as f:
        if isinstance(saved_item, Submission):
            # Save the post and its comments
            f.write('---\n')
            f.write(f'id: {saved_item.id}\n')
            f.write(f'subreddit: /r/{saved_item.subreddit.display_name}\n')
            f.write(f'timestamp: {datetime.utcfromtimestamp(saved_item.created_utc)}\n')
            f.write(f'author: /u/{saved_item.author.name if saved_item.author else "[deleted]"}\n')
            f.write(f'permalink: https://reddit.com{saved_item.permalink}\n')
            f.write('---\n\n')
            f.write(f'# {saved_item.title}\n\n')
            f.write(f'**Upvotes:** {saved_item.score} | **Permalink:** [Link](https://reddit.com{saved_item.permalink})\n\n')
            if saved_item.is_self:
                f.write(saved_item.selftext if saved_item.selftext else '[Deleted Post]')
            else:
                f.write(saved_item.url if saved_item.url else '[Deleted Post]')
            f.write('\n\n## Comments:\n\n')
            lazy_comments = lazy_load_comments(saved_item)
            process_comments(lazy_comments, f)
        elif isinstance(saved_item, Comment):
            # Save the comment and its context
            save_comment_and_context(saved_item, f)

    processed_count += 1  # Increment the processed counter
    # Sleep in proportion to the amount of content just processed
    time.sleep(dynamic_sleep(processed_count, len(saved_item.body if isinstance(saved_item, Comment) else saved_item.selftext or saved_item.url)))

print("All saved items have been processed.")
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
praw
tqdm
dropbox
