Inference obtained from LLM
siddharth7113 committed May 5, 2024
1 parent 9b299f0 commit e6a66f5
Showing 205 changed files with 5,966 additions and 88 deletions.
63 changes: 63 additions & 0 deletions API_jsontotxt.py
@@ -0,0 +1,63 @@
import os
import requests
from pathlib import Path
import time
import json

# Constants
API_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')  # Read the key from the environment rather than hardcoding it
CLEANED_TEXT_DIR = 'src/data/feature'
OUTPUT_DIR = 'src/data/output-responses'
REQUEST_INTERVAL = 10 # seconds to wait between requests to manage API rate limit

def make_request(text):
    prompt = """Generate a detailed financial analysis for the provided data. Please structure your response in a clear and organized manner..."""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    data = json.dumps({
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [{"role": "user", "content": text + prompt}],
        "max_tokens": 4000  # Control the maximum output length
    })
    response = requests.post(API_URL, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        return None

def save_response(response, output_path):
    os.makedirs(output_path.parent, exist_ok=True)  # Ensure the output directory exists
    with open(output_path, 'w', encoding='utf-8') as f:
        if 'choices' in response and len(response['choices']) > 0:
            text = response['choices'][0].get('message', {}).get('content', '')
            f.write(text)
    print(f"Response saved to {output_path}")

def process_json_files():
    input_path = Path(CLEANED_TEXT_DIR)
    output_path_root = Path(OUTPUT_DIR)

    for path in input_path.rglob('*.json'):
        relative_path = path.relative_to(input_path)
        output_path = output_path_root / relative_path
        output_path = output_path.with_suffix('.txt')  # Ensure the file extension is .txt for plain text

        # Check if the response file already exists
        if not output_path.exists():
            print(f"Processing file: {path}")
            with open(path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            text_content = ' '.join([str(value) for key, value in data.items() if isinstance(value, str)])
            response = make_request(text_content)
            if response:
                save_response(response, output_path)
            time.sleep(REQUEST_INTERVAL)  # Respect the API rate limit
        else:
            print(f"Skipping {path} as output already exists.")

if __name__ == '__main__':
    process_json_files()
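
For illustration only (not part of the commit): a sketch of a retry wrapper one could place around make_request to ride out transient API failures. The helper name and back-off values are hypothetical.

def make_request_with_retries(text, retries=3, backoff=30):
    # Hypothetical helper: retry the call a few times, waiting longer after each failure.
    for attempt in range(retries):
        result = make_request(text)
        if result is not None:
            return result
        time.sleep(backoff * (attempt + 1))
    return None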
58 changes: 0 additions & 58 deletions API_request.py

This file was deleted.

91 changes: 91 additions & 0 deletions feature_extraction.py
@@ -0,0 +1,91 @@
import os
import re
from pathlib import Path
import json

# Define the input and output directories
input_dir = 'src/data/processed-numeric-contexts'
output_dir = 'src/data/feature'

def extract_features(text):
    """ Extracts financial terms and their contextual data from the text """
    features = {}
    # Define patterns for each feature of interest
    patterns = {
        'revenue': r'\brevenue\b[\s\S]{0,500}',
        'expenses': r'\bexpenses\b[\s\S]{0,500}',
        'net_income': r'\bnet income\b[\s\S]{0,500}',
        'assets': r'\bassets\b[\s\S]{0,500}',
        'liabilities': r'\bliabilities\b[\s\S]{0,500}',
        'equity': r'\bequity\b[\s\S]{0,500}',
        'cash_flow': r'\bcash flow\b[\s\S]{0,500}',
        'operating_margin': r'\boperating margin\b[\s\S]{0,500}',
        'gross_margin': r'\bgross margin\b[\s\S]{0,500}',
        'ebitda': r'\bebitda\b[\s\S]{0,500}',
        'accumulated_depreciation': r'\baccumulated depreciation\b[\s\S]{0,500}',
        'capital_expenditure': r'\bcapital expenditure\b[\s\S]{0,500}',
        'debt': r'\bdebt\b[\s\S]{0,500}',
        'share_repurchase': r'\bshare repurchase\b[\s\S]{0,500}',
        'dividend_payout': r'\bdividend payout\b[\s\S]{0,500}',
        'financial_ratios': r'\b(debt-to-equity ratio|return on equity)\b[\s\S]{0,500}',
        'earnings_per_share': r'\bearnings per share\b[\s\S]{0,500}',
        'tax_rate': r'\btax rate\b[\s\S]{0,500}',
        'segment_revenue': r'\bsegment revenue\b[\s\S]{0,500}',
        'geographic_information': r'\bgeographic\b[\s\S]{0,500}',
        'investment_gains_losses': r'\b(investment gains|investment losses)\b[\s\S]{0,500}',
        'regulatory_changes': r'\bregulatory changes\b[\s\S]{0,500}',
        'legal_issues': r'\blegal issues\b[\s\S]{0,500}',
        'accrued_liabilities': r'\baccrued liabilities\b[\s\S]{0,500}',
        'common_stock': r'\bcommon stock\b[\s\S]{0,500}',
        'capital_stock': r'\bcapital stock\b[\s\S]{0,500}',
        'subsequent_events': r'\bsubsequent events\b[\s\S]{0,500}',
        'noncurrent_assets': r'\bnoncurrent assets\b[\s\S]{0,500}',
        'fair_value_measurements': r'\bfair value measurements\b[\s\S]{0,500}',
        'level_1_assets': r'\blevel 1 assets\b[\s\S]{0,500}',
        'level_2_assets': r'\blevel 2 assets\b[\s\S]{0,500}',
        'level_3_assets': r'\blevel 3 assets\b[\s\S]{0,500}',
        'debt_securities': r'\bdebt securities\b[\s\S]{0,500}',
        'bank_deposits': r'\bbank deposits\b[\s\S]{0,500}',
        'corporate_debt': r'\bcorporate debt\b[\s\S]{0,500}',
        'government_bonds': r'\bgovernment bonds\b[\s\S]{0,500}',
        'mortgage_backed_securities': r'\bmortgage-backed securities\b[\s\S]{0,500}',
        'asset_backed_securities': r'\basset-backed securities\b[\s\S]{0,500}',
        'hedging_activities': r'\bhedging activities\b[\s\S]{0,500}',
        'foreign_exchange_contracts': r'\bforeign exchange contracts\b[\s\S]{0,500}',
        'designated_hedging': r'\bdesignated hedging\b[\s\S]{0,500}',
        'nondesignated_hedging': r'\bnondesignated hedging\b[\s\S]{0,500}',
        'cash_flow_hedges': r'\bcash flow hedges\b[\s\S]{0,500}',
        'derivative_instruments': r'\bderivative instruments\b[\s\S]{0,500}',
        'geographic_concentration_risk': r'\bgeographic concentration risk\b[\s\S]{0,500}'
    }

    for key, pattern in patterns.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            features[key] = ' '.join(matches)
    return features

def process_files():
    """ Process each file in the directory, extract features, and save them in a structured format """
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

                features = extract_features(text)

                # Define the output path
                relative_path = os.path.relpath(root, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                os.makedirs(output_path, exist_ok=True)

                output_file_path = os.path.join(output_path, f'{Path(file).stem}_features.json')
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    json.dump(features, f, indent=4)

                print(f'Features extracted and saved for {file}')

if __name__ == '__main__':
    process_files()
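
For illustration only (not part of the commit): a minimal check of what extract_features returns for a short, hypothetical snippet.

sample = "Total revenue was $394.3 billion, while net income reached $99.8 billion."
feats = extract_features(sample)
# feats contains the keys 'revenue' and 'net_income'; each value holds the matched
# term plus up to 500 following characters of context, joined across all matches.
print(feats['revenue'])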
60 changes: 60 additions & 0 deletions lemmatization.py
@@ -0,0 +1,60 @@
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from pathlib import Path
import re

# Ensure you have the necessary NLTK resources downloaded
nltk.download('punkt')
nltk.download('wordnet')

def process_text(text):
    """Apply lemmatization to the text."""
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

def extract_numeric_contexts(text, window_size=5):
    """Extract context windows around numeric data in the text."""
    tokens = word_tokenize(text)
    numeric_contexts = []
    numeric_regex = r'\d+[\d,]*\.?\d*'

    for i, token in enumerate(tokens):
        if re.match(numeric_regex, token):
            left_context = tokens[max(i - window_size, 0):i]
            right_context = tokens[i + 1:min(i + window_size + 1, len(tokens))]
            context = left_context + [token] + right_context
            numeric_contexts.append(' '.join(context))

    return ' '.join(numeric_contexts)

def process_files(input_dir, output_dir):
    """Process files from input_dir and save processed contexts to output_dir, mirroring the directory structure."""
    input_path = Path(input_dir)
    for path in input_path.rglob('*.txt'):
        relative_path = path.relative_to(input_path)
        output_path = Path(output_dir) / relative_path

        print(f"Processing file: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        # Lemmatize the text, then extract the numeric contexts
        lemmatized_text = process_text(text_content)
        contexts = extract_numeric_contexts(lemmatized_text)

        # Ensure the output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(contexts)
        print(f"Processed contexts saved to {output_path}")

# Define paths
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'
OUTPUT_DIR = 'src/data/processed-numeric-contexts'

if __name__ == '__main__':
    process_files(CLEANED_TEXT_DIR, OUTPUT_DIR)
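
For illustration only (not part of the commit): a hypothetical input showing the context windows that extract_numeric_contexts keeps around each number.

text = "Revenue grew to 1,200 million in 2023 from 1,050 million in 2022."
print(extract_numeric_contexts(text, window_size=3))
# Each numeric token is kept with up to three tokens on either side,
# so the output begins with "Revenue grew to 1,200 million in 2023 ...".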
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ flask
sec_edgar_downloader
matplotlib
plotly
bs4
nltk
59 changes: 59 additions & 0 deletions src/analysis/llm_analysis.py
@@ -0,0 +1,59 @@
# import os
# import requests
# import json
# from pathlib import Path

# # Constants
# API_URL = "https://openrouter.ai/api/v1/chat/completions"
# OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')  # Read the key from the environment rather than hardcoding it

# INPUT_DIR = 'src/analysis/pre-analysis_combined'
# CSV_OUTPUT_DIR = 'src/analysis/csv'
# CODE_OUTPUT_DIR = 'src/analysis/code'
# TEXT_OUTPUT_DIR = 'src/analysis/text'

# def make_api_call(text_content):
#     prompt = f"""
#     Analyze the financial data provided and give a json file containing csv, python and text:
#     1. Generate a CSV file with key financial metrics for plotting.
#     2. Provide Python code for creating plots and animations based on the CSV.
#     3. Provide key financial insights in text format that can be displayed on a web page.
#     """
#     headers = {
#         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
#         "Content-Type": "application/json"
#     }
#     data = json.dumps({
#         "model": "nousresearch/nous-capybara-7b:free",
#         "messages": [
#             {"role": "user", "content": text_content + prompt}
#         ]
#     })
#     response = requests.post(API_URL, headers=headers, data=data)  # 'data' is already a JSON string, so pass it via data= rather than json=
#     return response.json()

# def save_output(data, output_path, file_type):
#     Path(output_path).mkdir(parents=True, exist_ok=True)
#     file_path = Path(output_path) / f"{file_type}.txt"
#     with open(file_path, 'w', encoding='utf-8') as file:
#         file.write(data)
#     print(f"Output saved to {file_path}")

# def process_files():
#     input_path = Path(INPUT_DIR)
#     for text_file in input_path.rglob('*.txt'):
#         print(f"Processing: {text_file}")
#         with open(text_file, 'r', encoding='utf-8') as file:
#             text_content = file.read()

#         response = make_api_call(text_content)
#         if response:
#             # Assuming the API returns a structured JSON with keys for different types of outputs
#             if 'csv' in response:
#                 save_output(response['csv'], CSV_OUTPUT_DIR, text_file.stem)
#             if 'code' in response:
#                 save_output(response['code'], CODE_OUTPUT_DIR, text_file.stem)
#             if 'text' in response:
#                 save_output(response['text'], TEXT_OUTPUT_DIR, text_file.stem)

# if __name__ == '__main__':
#     process_files()
45 changes: 45 additions & 0 deletions src/analysis/output_combine.py
@@ -0,0 +1,45 @@
import os
from pathlib import Path

# Define the directory containing the output text files
output_dir = Path('src/data/output-responses')
combined_dir = Path('src/analysis/pre-analysis_combined')
combined_dir.mkdir(exist_ok=True) # Ensure the combined directory exists

def combine_firm_files():
    # Dictionary to hold combined texts for each firm
    firm_texts = {}

    # Traverse through all text files in the output directory
    for file_path in output_dir.rglob('full-submission_features.txt'):
        # Debugging: Print the path to see what's being captured
        print(f"Current file path: {file_path}")
        print(f"Path parts: {file_path.parts}")

        # Extract the firm name based on the file structure:
        # src/data/output-responses/<FIRM_NAME>/.../full-submission_features.txt
        # Ensure this index corresponds correctly to the firm name part in the path
        if len(file_path.parts) >= 5:  # Check there are enough parts
            firm_name = file_path.parts[3]  # Index where the firm name appears, adjusted if needed
        else:
            continue  # Skip if the path is not deep enough to contain a firm name

        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Append the text to the corresponding firm's entry in the dictionary
        if firm_name in firm_texts:
            firm_texts[firm_name] += "\n" + text
        else:
            firm_texts[firm_name] = text

    # Write combined texts to new files, one per firm
    for firm_name, text in firm_texts.items():
        combined_file_path = combined_dir / f"{firm_name}_combined.txt"
        with open(combined_file_path, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Combined file created for firm {firm_name}: {combined_file_path}")

if __name__ == '__main__':
    combine_firm_files()
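
For illustration only (not part of the commit): how the firm name is read from a path under the layout assumed above; the firm name here is hypothetical.

p = Path('src/data/output-responses/EXAMPLE_FIRM/10-K/full-submission_features.txt')
print(p.parts[3])  # -> 'EXAMPLE_FIRM'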