
Completed till inference.
siddharth7113 committed May 3, 2024
1 parent 48ea7d1 commit 66679b4
Showing 67 changed files with 42,328 additions and 1 deletion.
58 changes: 58 additions & 0 deletions API_request.py
@@ -0,0 +1,58 @@
import os
import requests
import json
from pathlib import Path
import time

# Constants
API_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')  # Read the key from the environment; never hard-code secrets in source
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'
OUTPUT_DIR = 'src/data/output-responses'
REQUEST_INTERVAL = 60 # seconds to wait between requests to manage API rate limit

def make_request(text):
    prompt = """
    Please provide a comprehensive financial analysis including year-over-year growth, key financial ratios, and a detailed discussion on expenses and revenue sources. Compare these figures to industry averages and discuss any significant deviations.
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    data = json.dumps({
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [{"role": "user", "content": text + prompt}]
    })
    response = requests.post(API_URL, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        return None

def save_response(response, filename):
    """Save the response data to a JSON file."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Ensure the output directory exists
    file_path = os.path.join(OUTPUT_DIR, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(response, f, indent=4)
    print(f"Response saved to {file_path}")

def process_text_files():
    pathlist = Path(CLEANED_TEXT_DIR).rglob('*.txt')  # Find all .txt files recursively
    for path in pathlist:
        file_path = str(path)
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        response = make_request(text_content)
        if response:
            print("Received response:", response)
            # Create a filename from the path to save the response
            response_filename = f"{path.stem}_response.json"
            save_response(response, response_filename)
        time.sleep(REQUEST_INTERVAL)  # Respect the API rate limit between requests

if __name__ == '__main__':
    process_text_files()
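One note on rate limiting: the fixed REQUEST_INTERVAL sleep above is the simplest approach. A minimal alternative sketch that retries on HTTP 429 with exponential backoff; the helper name post_with_backoff and the retry/delay values are illustrative assumptions, not part of this commit:

import time
import requests

def post_with_backoff(url, headers, data, max_retries=5, base_delay=2.0):
    """POST with exponential backoff on HTTP 429 (rate limit) responses."""
    for attempt in range(max_retries):
        response = requests.post(url, headers=headers, data=data)
        if response.status_code != 429:
            return response
        delay = base_delay * (2 ** attempt)  # 2s, 4s, 8s, ...
        print(f"Rate limited; retrying in {delay:.0f}s (attempt {attempt + 1}/{max_retries})")
        time.sleep(delay)
    return response  # Give up after max_retries; the caller checks status_code

make_request could call this in place of requests.post without other changes.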
141 changes: 141 additions & 0 deletions pre_processing.py
@@ -0,0 +1,141 @@
import os
from bs4 import BeautifulSoup
from pathlib import Path

# Define the root directory where the original filings are stored and the directory to save cleaned text
ROOT_DIRECTORY = 'src/data/sec-edgar-filings'
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'

# Ensure the directory for cleaned text exists
os.makedirs(CLEANED_TEXT_DIR, exist_ok=True)

# Function to clean and extract text using Beautiful Soup
def clean_html(content):
    soup = BeautifulSoup(content, 'html.parser')

    # Remove script, style, and meta tags as they do not contain relevant text
    for script_or_style in soup(["script", "style", "meta"]):
        script_or_style.decompose()

    # Attempt to find the main content div by inspecting common tags used for main content
    # Modify this according to the actual document structure observed
    main_content = soup.find('div', attrs={'class': 'document'})
    if not main_content:
        main_content = soup.body  # Fall back to the entire body if the specific div is not found

    # Extract text and reduce whitespace
    text = ' '.join(main_content.stripped_strings if main_content else [])
    return text

# Function to read file, clean content, and save the cleaned text
def process_file(file_path, output_dir):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        cleaned_text = clean_html(content)

        # Construct a new path in the cleaned directory with the same file structure
        relative_path = os.path.relpath(file_path, ROOT_DIRECTORY)
        new_path = os.path.join(output_dir, relative_path)
        os.makedirs(os.path.dirname(new_path), exist_ok=True)

        with open(new_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)
        print(f"Saved cleaned text to {new_path}")
    except Exception as e:
        print(f"Failed to process file {file_path}: {e}")

# Main function to walk through the directory structure and process each filing
def process_filings(root_dir, output_dir):
    pathlist = Path(root_dir).rglob('*.txt')  # Find all .txt files recursively
    for path in pathlist:
        file_path = str(path)
        print(f"Processing file: {file_path}")
        process_file(file_path, output_dir)

if __name__ == '__main__':
    process_filings(ROOT_DIRECTORY, CLEANED_TEXT_DIR)
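A quick self-contained sanity check of clean_html; the sample HTML below is invented for illustration and is not from the SEC filings:

sample = """
<html><body>
  <script>var junk = 1;</script>
  <div class="document">
    <h1>Results</h1>
    <p>Net revenue grew 12% year over year.</p>
  </div>
</body></html>
"""
print(clean_html(sample))
# Prints: Results Net revenue grew 12% year over year.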

# import os
# from bs4 import BeautifulSoup
# from pathlib import Path

# # Define the root directory where original filings are stored and the directory to save cleaned text
# ROOT_DIRECTORY = 'src/data/sec-edgar-filings'
# CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'

# # Ensure the directory for cleaned text exists
# os.makedirs(CLEANED_TEXT_DIR, exist_ok=True)

# # Function to extract text using Beautiful Soup
# def extract_text(file_path):
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             content = file.read()
#         soup = BeautifulSoup(content, 'html.parser')
#         text = soup.get_text()
#         return text
#     except Exception as e:
#         print(f"Failed to process file {file_path}: {e}")
#         return None

# # Function to save the cleaned text to a new file
# def save_cleaned_text(text, original_path):
#     try:
#         # Construct a new path in the cleaned directory with the same file structure
#         relative_path = os.path.relpath(original_path, ROOT_DIRECTORY)
#         new_path = os.path.join(CLEANED_TEXT_DIR, relative_path)
#         os.makedirs(os.path.dirname(new_path), exist_ok=True)
#         with open(new_path, 'w', encoding='utf-8') as file:
#             file.write(text)
#         print(f"Saved cleaned text to {new_path}")
#     except Exception as e:
#         print(f"Failed to save cleaned text for {original_path}: {e}")

# # Main function to process all filings
# def process_filings():
#     pathlist = Path(ROOT_DIRECTORY).rglob('*.txt')  # Find all .txt files recursively
#     for path in pathlist:
#         file_path = str(path)
#         print(f"Processing file: {file_path}")
#         extracted_text = extract_text(file_path)
#         if extracted_text:
#             save_cleaned_text(extracted_text, file_path)

# if __name__ == '__main__':
#     process_filings()
6 changes: 5 additions & 1 deletion requirements.txt
@@ -1,3 +1,7 @@
pandas
flask
sec_edgar_downloader
requests
matplotlib
plotly
beautifulsoup4
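To install the dependencies, the usual one-liner applies (a standard sketch assuming pip; a virtual environment is optional but sensible):

pip install -r requirements.txt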

Large diffs are not rendered by default.
Empty file.
Empty file.
Empty file.
Empty file.