
Completed till inference.
siddharth7113 committed May 3, 2024
1 parent 48ea7d1 commit 66679b4
Showing 67 changed files with 42,328 additions and 1 deletion.
58 changes: 58 additions & 0 deletions API_request.py
@@ -0,0 +1,58 @@
import os
import requests
import json
from pathlib import Path
import time

# Constants
API_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')  # Read the key from the environment; never hard-code secrets in source
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'
OUTPUT_DIR = 'src/data/output-responses'
REQUEST_INTERVAL = 60 # seconds to wait between requests to manage API rate limit

def make_request(text):
    prompt = """
    Please provide a comprehensive financial analysis including year-over-year growth, key financial ratios, and a detailed discussion on expenses and revenue sources. Compare these figures to industry averages and discuss any significant deviations.
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    data = json.dumps({
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [{"role": "user", "content": text + prompt}]
    })
    response = requests.post(API_URL, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        return None

def save_response(response, filename):
    """Save the response data to a JSON file."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Ensure the output directory exists
    file_path = os.path.join(OUTPUT_DIR, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(response, f, indent=4)
    print(f"Response saved to {file_path}")

def process_text_files():
    pathlist = Path(CLEANED_TEXT_DIR).rglob('*.txt')  # Find all .txt files recursively
    for path in pathlist:
        file_path = str(path)
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        response = make_request(text_content)
        if response:
            print("Received response:", response)
            # Create a filename from the path to save the response
            response_filename = f"{path.stem}_response.json"
            save_response(response, response_filename)
        time.sleep(REQUEST_INTERVAL)  # Respect the API rate limit between requests

if __name__ == '__main__':
    process_text_files()
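One note on rate limiting: the fixed REQUEST_INTERVAL sleep above is the simplest approach. A minimal alternative sketch that retries on HTTP 429 with exponential backoff; the helper name post_with_backoff and the retry/delay values are illustrative assumptions, not part of this commit:

import time
import requests

def post_with_backoff(url, headers, data, max_retries=5, base_delay=2.0):
    """POST with exponential backoff on HTTP 429 (rate limit) responses."""
    for attempt in range(max_retries):
        response = requests.post(url, headers=headers, data=data)
        if response.status_code != 429:
            return response
        delay = base_delay * (2 ** attempt)  # 2s, 4s, 8s, ...
        print(f"Rate limited; retrying in {delay:.0f}s (attempt {attempt + 1}/{max_retries})")
        time.sleep(delay)
    return response  # Give up after max_retries; the caller checks status_code

make_request could call this in place of requests.post without other changes.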
141 changes: 141 additions & 0 deletions pre_processing.py
@@ -0,0 +1,141 @@
import os
from bs4 import BeautifulSoup
from pathlib import Path

# Define the root directory where the original filings are stored and the directory to save cleaned text
ROOT_DIRECTORY = 'src/data/sec-edgar-filings'
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'

# Ensure the directory for cleaned text exists
os.makedirs(CLEANED_TEXT_DIR, exist_ok=True)

# Function to clean and extract text using Beautiful Soup
def clean_html(content):
    soup = BeautifulSoup(content, 'html.parser')

    # Remove script, style, and meta tags as they do not contain relevant text
    for script_or_style in soup(["script", "style", "meta"]):
        script_or_style.decompose()

    # Attempt to find the main content div by inspecting common tags used for main content
    # Modify this according to the actual document structure observed
    main_content = soup.find('div', attrs={'class': 'document'})
    if not main_content:
        main_content = soup.body  # Fall back to the entire body if the specific div is not found

    # Extract text and reduce whitespace
    text = ' '.join(main_content.stripped_strings if main_content else [])
    return text

# Function to read file, clean content, and save the cleaned text
def process_file(file_path, output_dir):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        cleaned_text = clean_html(content)

        # Construct a new path in the cleaned directory with the same file structure
        relative_path = os.path.relpath(file_path, ROOT_DIRECTORY)
        new_path = os.path.join(output_dir, relative_path)
        os.makedirs(os.path.dirname(new_path), exist_ok=True)

        with open(new_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)
        print(f"Saved cleaned text to {new_path}")
    except Exception as e:
        print(f"Failed to process file {file_path}: {e}")

# Main function to walk through the directory structure and process each filing
def process_filings(root_dir, output_dir):
    pathlist = Path(root_dir).rglob('*.txt')  # Find all .txt files recursively
    for path in pathlist:
        file_path = str(path)
        print(f"Processing file: {file_path}")
        process_file(file_path, output_dir)

if __name__ == '__main__':
    process_filings(ROOT_DIRECTORY, CLEANED_TEXT_DIR)
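A quick self-contained sanity check of clean_html; the sample HTML below is invented for illustration and is not from the SEC filings:

sample = """
<html><body>
  <script>var junk = 1;</script>
  <div class="document">
    <h1>Results</h1>
    <p>Net revenue grew 12% year over year.</p>
  </div>
</body></html>
"""
print(clean_html(sample))
# Prints: Results Net revenue grew 12% year over year.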

# import os
# from bs4 import BeautifulSoup
# from pathlib import Path

# # Define the root directory where original filings are stored and the directory to save cleaned text
# ROOT_DIRECTORY = 'src/data/sec-edgar-filings'
# CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'

# # Ensure the directory for cleaned text exists
# os.makedirs(CLEANED_TEXT_DIR, exist_ok=True)

# # Function to extract text using Beautiful Soup
# def extract_text(file_path):
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             content = file.read()
#         soup = BeautifulSoup(content, 'html.parser')
#         text = soup.get_text()
#         return text
#     except Exception as e:
#         print(f"Failed to process file {file_path}: {e}")
#         return None

# # Function to save the cleaned text to a new file
# def save_cleaned_text(text, original_path):
#     try:
#         # Construct a new path in the cleaned directory with the same file structure
#         relative_path = os.path.relpath(original_path, ROOT_DIRECTORY)
#         new_path = os.path.join(CLEANED_TEXT_DIR, relative_path)
#         os.makedirs(os.path.dirname(new_path), exist_ok=True)
#         with open(new_path, 'w', encoding='utf-8') as file:
#             file.write(text)
#         print(f"Saved cleaned text to {new_path}")
#     except Exception as e:
#         print(f"Failed to save cleaned text for {original_path}: {e}")

# # Main function to process all filings
# def process_filings():
#     pathlist = Path(ROOT_DIRECTORY).rglob('*.txt')  # Find all .txt files recursively
#     for path in pathlist:
#         file_path = str(path)
#         print(f"Processing file: {file_path}")
#         extracted_text = extract_text(file_path)
#         if extracted_text:
#             save_cleaned_text(extracted_text, file_path)

# if __name__ == '__main__':
#     process_filings()
6 changes: 5 additions & 1 deletion requirements.txt
@@ -1,3 +1,7 @@
pandas
flask
sec_edgar_downloader
requests
matplotlib
plotly
beautifulsoup4
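To install the dependencies, the usual one-liner applies (a standard sketch assuming pip; a virtual environment is optional but sensible):

pip install -r requirements.txt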

Large diffs are not rendered by default.
Empty file.
Empty file.
Empty file.
Empty file.