Skip to content

Commit

Permalink
Added the LOG mechanism to accelerate the process without downloading…
Browse files Browse the repository at this point in the history
… the entire directory for processing
  • Loading branch information
rhnfzl committed Aug 31, 2024
1 parent fc4d989 commit 4ac09b4
Show file tree
Hide file tree
Showing 9 changed files with 407 additions and 171 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,6 @@ cython_debug/
archive/
reddit/
*.md
reddit_acvhive*
*.json
reddit_acvhive*
file_log.json
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ After adding all secrets: ![Repository Secrets](resources/repositiory_secrets.pn
3. **Manually Trigger the Workflow**:
- Go to the **Actions** tab > Select the **Reddit Stash Workflow** from the list on the left > Click **Run workflow** > Select the branch `main` > Click the green **Run workflow** button. The workflow will then be triggered, and you can monitor its progress in the Actions tab. Upon successful completion, you should see the Reddit folder in your Dropbox.

4. The workflow should run automatically at midnight CET.

#### Local Installation

1. **Clone this repository**:
Expand Down Expand Up @@ -131,19 +133,22 @@ The `settings.ini` file in the root directory of the project allows you to confi
save_directory = reddit/ # your system save directory
dropbox_directory = /reddit # your dropbox directory
save_type = ALL # Options: 'ALL' to save all activity, 'SAVED' to save only saved posts/comments
check_type = LOG # Options: 'LOG' to use the logging file to verify file existence, 'DIR' to verify file existence based on the downloaded directory.
[Configuration]
client_id = None # Can be set here or via environment variables
client_secret = None # Can be set here or via environment variables
username = None # Can be set here or via environment variables
password = None # Can be set here or via environment variables
```
save_directory: Specifies the directory where the Reddit content will be saved, modify it to the location you want it to be in.
dropbox_directory : Specifies the folder where the Reddit content will be saved on dropbox, modify it to the location you want it to be in.
save_type: Determines what user activity is saved, accepts these two values:
* `ALL`: Saves all posts and comments made by the user, along with the saved posts and comments with it's context.
* `SAVED`: Saves only the posts and comments the user has saved on Reddit with it's context.

* save_directory: Specifies the directory where the Reddit content will be saved, modify it to the location you want it to be in.
* dropbox_directory : Specifies the folder where the Reddit content will be saved on dropbox, modify it to the location you want it to be in.
* save_type: Determines what user activity is saved, accepts these two values:
    * `ALL`: Saves all posts and comments made by the user, along with the saved posts and comments with their context.
    * `SAVED`: Saves only the posts and comments the user has saved on Reddit with their context.
* check_type: Determines whether the file-existence check uses the log file only or the downloaded directory.
    * `LOG`: Uses only the log file to check file existence; faster processing. Recommended for the GitHub Actions setup.
    * `DIR`: Uses the saved/downloaded directory to check file existence; slower processing. Recommended for the local setup.
Note: You can still use environment variables as a fallback or override for the Reddit API credentials if they are not set in the settings.ini file.

#### Setting Up Reddit Environment Variables
Expand Down
171 changes: 133 additions & 38 deletions dropbox_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,68 @@
import sys
import dropbox
import requests
import hashlib
import configparser
from dropbox.exceptions import ApiError
from dropbox.files import FileMetadata

# Import the validate_and_set_directory function from utils
from utils.file_path_validate import validate_and_set_directory


class DropboxContentHasher:
    """Compute Dropbox's ``content_hash`` for a byte stream.

    The hash is the SHA-256 of the concatenated SHA-256 digests of each
    4 MiB block of the input, matching Dropbox's reference implementation.
    Feed data with :meth:`update`, then call :meth:`digest` or
    :meth:`hexdigest` exactly once; the object cannot be reused afterwards.
    """

    # Dropbox hashes the input in fixed 4 MiB blocks.
    BLOCK_SIZE = 4 * 1024 * 1024

    def __init__(self):
        # Hash over the per-block digests (the final content hash).
        self._overall_hasher = hashlib.sha256()
        # Hash of the block currently being filled.
        self._block_hasher = hashlib.sha256()
        # Number of bytes absorbed into the current block so far.
        self._block_pos = 0

        self.digest_size = self._overall_hasher.digest_size

    def update(self, new_data):
        """Absorb *new_data* (a byte string) into the running hash."""
        if self._overall_hasher is None:
            raise AssertionError(
                "can't use this object anymore; you already called digest()")

        assert isinstance(new_data, bytes), (
            "Expecting a byte string, got {!r}".format(new_data))

        offset = 0
        total = len(new_data)
        while offset < total:
            # A block just filled up: fold its digest into the overall
            # hash and start a fresh block.
            if self._block_pos == self.BLOCK_SIZE:
                self._overall_hasher.update(self._block_hasher.digest())
                self._block_hasher = hashlib.sha256()
                self._block_pos = 0

            # Absorb as much as fits in the current block.
            take = min(self.BLOCK_SIZE - self._block_pos, total - offset)
            self._block_hasher.update(new_data[offset:offset + take])
            self._block_pos += take
            offset += take

    def _finish(self):
        """Fold in the final partial block and invalidate the object."""
        if self._overall_hasher is None:
            raise AssertionError(
                "can't use this object anymore; you already called digest() or hexdigest()")

        if self._block_pos > 0:
            self._overall_hasher.update(self._block_hasher.digest())
            self._block_hasher = None
        result = self._overall_hasher
        self._overall_hasher = None  # Make sure we can't use this object anymore.
        return result

    def digest(self):
        """Return the final content hash as raw bytes (single use)."""
        return self._finish().digest()

    def hexdigest(self):
        """Return the final content hash as a hex string (single use)."""
        return self._finish().hexdigest()


def refresh_dropbox_token():
refresh_token = os.getenv('DROPBOX_REFRESH_TOKEN')
client_id = os.getenv('DROPBOX_APP_KEY')
Expand Down Expand Up @@ -41,6 +99,9 @@ def refresh_dropbox_token():
# Fetch the dropbox_folder from the settings.ini file with a fallback
dropbox_folder = config_parser.get('Settings', 'dropbox_directory', fallback='/reddit')

# Fetch the check_type from the settings.ini file with a fallback
check_type = config_parser.get('Settings', 'check_type', fallback='LOG').upper()

def sanitize_filename(filename):
"""Sanitize the filename to be Dropbox-compatible."""
sanitized_name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', filename) # Also remove control characters
Expand All @@ -53,28 +114,39 @@ def sanitize_filename(filename):

return sanitized_name

def list_dropbox_files(dbx, dropbox_folder):
"""List all files in the specified Dropbox folder."""
file_names = set()
def calculate_local_content_hash(file_path):
    """Return the Dropbox content hash (hex string) for a local file.

    Streams the file in 1 MiB chunks so memory use stays bounded even
    for large files.
    """
    hasher = DropboxContentHasher()
    with open(file_path, 'rb') as fh:
        # iter() with a sentinel stops cleanly at EOF (read() returns b'').
        for chunk in iter(lambda: fh.read(1024 * 1024), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def list_dropbox_files_with_hashes(dbx, dropbox_folder):
    """List all files in the specified Dropbox folder along with their content hashes.

    Returns a dict mapping each file's lower-cased Dropbox path to its
    ``content_hash``. On an API error a message is printed and whatever
    was collected so far is returned.
    """
    file_metadata = {}
    try:
        listing = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            # Folders and other entry types are ignored; only files carry
            # a content_hash.
            file_metadata.update(
                (entry.path_lower, entry.content_hash)
                for entry in listing.entries
                if isinstance(entry, FileMetadata)
            )
            if not listing.has_more:
                break
            # Fetch the next page of the (paginated) listing.
            listing = dbx.files_list_folder_continue(listing.cursor)
    except ApiError as err:
        print(f"Failed to list files in Dropbox folder {dropbox_folder}: {err}")
    return file_metadata

def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
"""Uploads all files in the specified local directory to Dropbox without overwriting."""
"""Uploads all files in the specified local directory to Dropbox, replacing only changed files."""
dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))

# List all files currently in the Dropbox folder
existing_files = list_dropbox_files(dbx, dropbox_folder)
# List all files currently in the Dropbox folder along with their content hashes
dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder)

uploaded_count = 0
uploaded_size = 0
Expand All @@ -93,61 +165,84 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
# Adjust for sanitized name
dropbox_path = dropbox_path.replace(file_name, sanitized_name)

if dropbox_path.lower() in existing_files:
local_content_hash = calculate_local_content_hash(file_path)

# Check if the file exists and is the same on Dropbox
if dropbox_path.lower() in dropbox_files and dropbox_files[dropbox_path.lower()] == local_content_hash:
skipped_count += 1
continue

# Upload the file since it doesn't exist or has changed
try:
with open(file_path, "rb") as f:
file_size = os.path.getsize(file_path)
dbx.files_upload(f.read(), dropbox_path)
dbx.files_upload(f.read(), dropbox_path, mode=dropbox.files.WriteMode.overwrite)
uploaded_count += 1
uploaded_size += file_size
except dropbox.exceptions.ApiError as e:
except ApiError as e:
print(f"Failed to upload {file_path} to Dropbox: {e}")

print(f"Upload completed. {uploaded_count} files uploaded ({uploaded_size / (1024 * 1024):.2f} MB).")
print(f"{skipped_count} files were skipped (already existed).")
print(f"{skipped_count} files were skipped (already existed or unchanged).")

def download_directory_from_dropbox(dbx, dropbox_folder, local_directory):
    """Downloads all files in the specified Dropbox folder to the local
    directory, replacing only changed files.

    A file is skipped when it already exists locally and its Dropbox
    content hash matches the local file's hash; otherwise it is
    (re-)downloaded. Prints a summary of downloaded and skipped counts.
    """
    downloaded_count = 0
    downloaded_size = 0
    skipped_count = 0

    # List all files currently in the Dropbox folder along with their content hashes
    dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder)

    try:
        for dropbox_path, dropbox_hash in dropbox_files.items():
            # Mirror the Dropbox folder layout under local_directory.
            local_path = os.path.join(local_directory, dropbox_path[len(dropbox_folder):].lstrip('/'))

            # Skip files that already exist locally with identical content.
            if os.path.exists(local_path):
                local_content_hash = calculate_local_content_hash(local_path)
                if local_content_hash == dropbox_hash:
                    skipped_count += 1
                    continue

            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            with open(local_path, "wb") as f:
                metadata, res = dbx.files_download(dropbox_path)
                f.write(res.content)
            downloaded_count += 1
            downloaded_size += metadata.size
    except ApiError as err:
        print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}")

    print(f"Download completed. {downloaded_count} files downloaded ({downloaded_size / (1024 * 1024):.2f} MB).")
    print(f"{skipped_count} files were skipped (already existed or unchanged).")

def download_log_file_from_dropbox(dbx, dropbox_folder, local_directory):
    """Fetch only ``file_log.json`` from the Dropbox folder.

    Writes the file into *local_directory*, creating the directory if
    needed. On an API error a message is printed instead of raising.
    """
    target_path = os.path.join(local_directory, 'file_log.json')
    remote_path = f"{dropbox_folder}/file_log.json"

    try:
        metadata, res = dbx.files_download(remote_path)
        # Ensure the destination directory exists before writing.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with open(target_path, "wb") as fh:
            fh.write(res.content)
        print(f"Log file downloaded successfully to {target_path}.")
    except ApiError as err:
        print(f"Failed to download the log file from Dropbox: {err}")

if __name__ == "__main__":
    # Dropbox access tokens expire, so mint a fresh one before any API call.
    refresh_dropbox_token()
    dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))

    wants_download = '--download' in sys.argv
    wants_upload = '--upload' in sys.argv

    if wants_download:
        # check_type decides how much to pull down: just the log file
        # (fast) or the whole mirrored directory (slow but exhaustive).
        if check_type == 'LOG':
            print("Downloading only the log file as check_type is LOG.")
            download_log_file_from_dropbox(dbx, dropbox_folder, local_dir)
        elif check_type == 'DIR':
            print("Downloading the entire directory as check_type is DIR.")
            download_directory_from_dropbox(dbx, dropbox_folder, local_dir)
        else:
            raise ValueError(f"Unknown check_type: {check_type}")
    elif wants_upload:
        upload_directory_to_dropbox(local_dir, dropbox_folder)
6 changes: 5 additions & 1 deletion reddit_stash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from utils.file_path_validate import validate_and_set_directory
from utils.file_operations import save_user_activity
from utils.env_config import load_config_and_env
from utils.log_utils import load_file_log, log_file, is_file_logged

# Load configuration
config_parser = configparser.ConfigParser()
Expand All @@ -27,8 +28,11 @@
)

if __name__ == "__main__":
# Load the log file from the save directory
file_log = load_file_log(save_directory)

# Process user activity (submissions, comments, and saved items) and get statistics
processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory)
processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory, file_log)

# Print final statistics of processing
print(f"Processing completed. {processed_count} items processed, {skipped_count} items skipped.")
Expand Down
1 change: 1 addition & 0 deletions settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
save_directory = reddit/
dropbox_directory = /reddit
save_type = ALL
check_type = LOG

[Configuration]
client_id = None
Expand Down
Loading

0 comments on commit 4ac09b4

Please sign in to comment.