-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
48ea7d1
commit 66679b4
Showing
67 changed files
with
42,328 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import os | ||
import requests | ||
import json | ||
from pathlib import Path | ||
import time | ||
|
||
# Constants
API_URL = "https://openrouter.ai/api/v1/chat/completions"
# SECURITY: never commit API keys to source control — the key that was
# hard-coded here is exposed in git history and must be revoked.
# Read the key from the environment instead (set OPENROUTER_API_KEY).
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'   # input: cleaned filings
OUTPUT_DIR = 'src/data/output-responses'                  # output: saved API replies
REQUEST_INTERVAL = 60  # seconds to wait between requests to manage API rate limit
|
||
def make_request(text):
    """POST *text* (with the analysis prompt appended) to the OpenRouter chat API.

    Args:
        text: The filing text to analyze; the instruction prompt is appended to it.

    Returns:
        The decoded JSON response (dict) on HTTP 200, otherwise None after
        printing the error status and body.
    """
    prompt = """
    "Please provide a comprehensive financial analysis including year-over-year growth, key financial ratios, and a detailed discussion on expenses and revenue sources. Compare these figures to industry averages and discuss any significant deviations."
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = json.dumps({
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [{"role": "user", "content": text + prompt}]
    })
    # BUG FIX: the original call had no timeout, so a stalled connection
    # would hang the batch forever. 120s is generous for one completion.
    response = requests.post(API_URL, headers=headers, data=payload, timeout=120)
    if response.status_code == 200:
        return response.json()
    print(f"Failed to fetch data: {response.status_code}, {response.text}")
    return None
|
||
def save_response(response, filename):
    """Write *response* to OUTPUT_DIR/<filename> as pretty-printed JSON."""
    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)  # create output tree on first use
    destination = out_dir / filename
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(response, handle, indent=4)
    print(f"Response saved to {destination}")
|
||
def process_text_files():
    """Run every cleaned .txt filing through the API and persist each reply.

    Walks CLEANED_TEXT_DIR recursively; for each file a response is requested,
    saved as <stem>_response.json, and a rate-limit pause is observed.
    """
    for txt_path in Path(CLEANED_TEXT_DIR).rglob('*.txt'):
        as_string = str(txt_path)
        print(f"Processing file: {as_string}")
        with open(as_string, 'r', encoding='utf-8') as handle:
            contents = handle.read()

        result = make_request(contents)
        if not result:
            continue  # request failed; move on to the next filing
        print("Received response:", result)
        # Derive the output filename from the source file's stem.
        save_response(result, f"{txt_path.stem}_response.json")
        time.sleep(REQUEST_INTERVAL)  # respect the API rate limit
|
||
# Entry point: process every cleaned filing found under CLEANED_TEXT_DIR.
if __name__ == '__main__':
    process_text_files()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
import os | ||
from bs4 import BeautifulSoup | ||
from pathlib import Path | ||
|
||
# Define the root directory where the original filings are stored and the directory to save cleaned text
ROOT_DIRECTORY = 'src/data/sec-edgar-filings'            # input: raw EDGAR downloads
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'  # output: stripped plain text

# Ensure the directory for cleaned text exists
# NOTE(review): this runs at import time (a module-level side effect);
# process_file() also creates per-file subdirectories, so this only
# guarantees the top-level output directory up front.
os.makedirs(CLEANED_TEXT_DIR, exist_ok=True)
|
||
# Function to clean and extract text using Beautiful Soup | ||
def clean_html(content):
    """Strip markup from an HTML document and return its visible text.

    Non-content tags (script/style/meta) are removed first. Text is pulled
    from the <div class="document"> container when present, falling back to
    <body>, and joined into single-space-separated words.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # Drop tags that never carry displayable text.
    for tag in soup(["script", "style", "meta"]):
        tag.decompose()

    # Prefer the main content container; fall back to the whole body.
    # (Adjust the selector if the observed document structure differs.)
    container = soup.find('div', attrs={'class': 'document'}) or soup.body
    if not container:
        return ''
    return ' '.join(container.stripped_strings)
|
||
# Function to read file, clean content, and save the cleaned text | ||
def process_file(file_path, output_dir):
    """Clean one filing and write the result under *output_dir*.

    The file's path relative to ROOT_DIRECTORY is mirrored inside
    *output_dir*. Errors are reported and swallowed so a single bad
    file does not stop the batch.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            raw = source.read()
        text = clean_html(raw)

        # Mirror the original directory layout inside the output tree.
        destination = os.path.join(
            output_dir, os.path.relpath(file_path, ROOT_DIRECTORY))
        os.makedirs(os.path.dirname(destination), exist_ok=True)

        with open(destination, 'w', encoding='utf-8') as sink:
            sink.write(text)
        print(f"Saved cleaned text to {destination}")
    except Exception as e:
        # Best-effort batch processing: log the failure and continue.
        print(f"Failed to process file {file_path}: {e}")
|
||
# Main function to walk through the directory structure and process each filing | ||
def process_filings(root_dir, output_dir): | ||
pathlist = Path(root_dir).rglob('*.txt') # Find all .txt files recursively | ||
for path in pathlist: | ||
file_path = str(path) | ||
print(f"Processing file: {file_path}") | ||
process_file(file_path, output_dir) | ||
|
||
# Entry point: clean all downloaded filings into CLEANED_TEXT_DIR.
if __name__ == '__main__':
    process_filings(ROOT_DIRECTORY, CLEANED_TEXT_DIR)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# NOTE(review): a commented-out earlier draft of this cleaning script
# (extract_text / save_cleaned_text / process_filings) previously lived
# here. It duplicated the live implementation above and has been removed;
# recover it from version-control history if it is ever needed again.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
# NOTE: 'json' removed (it is a standard-library module, not a pip package,
# so installing it would fail); duplicate 'sec_edgar_downloader' removed;
# 'bs4' replaced by its canonical distribution name 'beautifulsoup4';
# 'requests' added because the API script imports it.
beautifulsoup4
flask
matplotlib
pandas
plotly
requests
sec_edgar_downloader
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-17-000070/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-18-000145/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-19-000119/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-20-000096/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-21-000105/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-22-000108/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
src/data/cleaned-sec-edgar-filings/AAPL/10-K/0000320193-23-000106/full-submission.txt
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
Empty file.
Empty file.
Empty file.
Oops, something went wrong.