Inference obtained from LLM
siddharth7113 committed May 5, 2024
1 parent 9b299f0 commit e6a66f5
Showing 205 changed files with 5,966 additions and 88 deletions.
63 changes: 63 additions & 0 deletions API_jsontotxt.py
@@ -0,0 +1,63 @@
import os
import requests
from pathlib import Path
import time
import json

# Constants
API_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')  # Read the key from the environment rather than hardcoding it
CLEANED_TEXT_DIR = 'src/data/feature'
OUTPUT_DIR = 'src/data/output-responses'
REQUEST_INTERVAL = 10 # seconds to wait between requests to manage API rate limit

def make_request(text):
    prompt = """Generate a detailed financial analysis for the provided data. Please structure your response in a clear and organized manner..."""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    data = json.dumps({
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [{"role": "user", "content": text + prompt}],
        "max_tokens": 4000  # Control the maximum output length
    })
    response = requests.post(API_URL, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        return None

def save_response(response, output_path):
    os.makedirs(output_path.parent, exist_ok=True)  # Ensure the output directory exists
    with open(output_path, 'w', encoding='utf-8') as f:
        if 'choices' in response and len(response['choices']) > 0:
            text = response['choices'][0].get('message', {}).get('content', '')
            f.write(text)
    print(f"Response saved to {output_path}")

def process_json_files():
    input_path = Path(CLEANED_TEXT_DIR)
    output_path_root = Path(OUTPUT_DIR)

    for path in input_path.rglob('*.json'):
        relative_path = path.relative_to(input_path)
        output_path = output_path_root / relative_path
        output_path = output_path.with_suffix('.txt')  # Ensure the file extension is .txt for plain text

        # Check if the response file already exists
        if not output_path.exists():
            print(f"Processing file: {path}")
            with open(path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            text_content = ' '.join([str(value) for key, value in data.items() if isinstance(value, str)])
            response = make_request(text_content)
            if response:
                save_response(response, output_path)
            time.sleep(REQUEST_INTERVAL)  # Respect the API rate limit
        else:
            print(f"Skipping {path} as output already exists.")

if __name__ == '__main__':
    process_json_files()
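
For illustration only (not part of the commit): a sketch of a retry wrapper one could place around make_request to ride out transient API failures. The helper name and back-off values are hypothetical.

def make_request_with_retries(text, retries=3, backoff=30):
    # Hypothetical helper: retry the call a few times, waiting longer after each failure.
    for attempt in range(retries):
        result = make_request(text)
        if result is not None:
            return result
        time.sleep(backoff * (attempt + 1))
    return None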
58 changes: 0 additions & 58 deletions API_request.py

This file was deleted.

91 changes: 91 additions & 0 deletions feature_extraction.py
@@ -0,0 +1,91 @@
import os
import re
from pathlib import Path
import json

# Define the input and output directories
input_dir = 'src/data/processed-numeric-contexts'
output_dir = 'src/data/feature'

def extract_features(text):
    """ Extracts financial terms and their contextual data from the text """
    features = {}
    # Define patterns for each feature of interest
    patterns = {
        'revenue': r'\brevenue\b[\s\S]{0,500}',
        'expenses': r'\bexpenses\b[\s\S]{0,500}',
        'net_income': r'\bnet income\b[\s\S]{0,500}',
        'assets': r'\bassets\b[\s\S]{0,500}',
        'liabilities': r'\bliabilities\b[\s\S]{0,500}',
        'equity': r'\bequity\b[\s\S]{0,500}',
        'cash_flow': r'\bcash flow\b[\s\S]{0,500}',
        'operating_margin': r'\boperating margin\b[\s\S]{0,500}',
        'gross_margin': r'\bgross margin\b[\s\S]{0,500}',
        'ebitda': r'\bebitda\b[\s\S]{0,500}',
        'accumulated_depreciation': r'\baccumulated depreciation\b[\s\S]{0,500}',
        'capital_expenditure': r'\bcapital expenditure\b[\s\S]{0,500}',
        'debt': r'\bdebt\b[\s\S]{0,500}',
        'share_repurchase': r'\bshare repurchase\b[\s\S]{0,500}',
        'dividend_payout': r'\bdividend payout\b[\s\S]{0,500}',
        'financial_ratios': r'\b(debt-to-equity ratio|return on equity)\b[\s\S]{0,500}',
        'earnings_per_share': r'\bearnings per share\b[\s\S]{0,500}',
        'tax_rate': r'\btax rate\b[\s\S]{0,500}',
        'segment_revenue': r'\bsegment revenue\b[\s\S]{0,500}',
        'geographic_information': r'\bgeographic\b[\s\S]{0,500}',
        'investment_gains_losses': r'\b(investment gains|investment losses)\b[\s\S]{0,500}',
        'regulatory_changes': r'\bregulatory changes\b[\s\S]{0,500}',
        'legal_issues': r'\blegal issues\b[\s\S]{0,500}',
        'accrued_liabilities': r'\baccrued liabilities\b[\s\S]{0,500}',
        'common_stock': r'\bcommon stock\b[\s\S]{0,500}',
        'capital_stock': r'\bcapital stock\b[\s\S]{0,500}',
        'subsequent_events': r'\bsubsequent events\b[\s\S]{0,500}',
        'noncurrent_assets': r'\bnoncurrent assets\b[\s\S]{0,500}',
        'fair_value_measurements': r'\bfair value measurements\b[\s\S]{0,500}',
        'level_1_assets': r'\blevel 1 assets\b[\s\S]{0,500}',
        'level_2_assets': r'\blevel 2 assets\b[\s\S]{0,500}',
        'level_3_assets': r'\blevel 3 assets\b[\s\S]{0,500}',
        'debt_securities': r'\bdebt securities\b[\s\S]{0,500}',
        'bank_deposits': r'\bbank deposits\b[\s\S]{0,500}',
        'corporate_debt': r'\bcorporate debt\b[\s\S]{0,500}',
        'government_bonds': r'\bgovernment bonds\b[\s\S]{0,500}',
        'mortgage_backed_securities': r'\bmortgage-backed securities\b[\s\S]{0,500}',
        'asset_backed_securities': r'\basset-backed securities\b[\s\S]{0,500}',
        'hedging_activities': r'\bhedging activities\b[\s\S]{0,500}',
        'foreign_exchange_contracts': r'\bforeign exchange contracts\b[\s\S]{0,500}',
        'designated_hedging': r'\bdesignated hedging\b[\s\S]{0,500}',
        'nondesignated_hedging': r'\bnondesignated hedging\b[\s\S]{0,500}',
        'cash_flow_hedges': r'\bcash flow hedges\b[\s\S]{0,500}',
        'derivative_instruments': r'\bderivative instruments\b[\s\S]{0,500}',
        'geographic_concentration_risk': r'\bgeographic concentration risk\b[\s\S]{0,500}'
    }

    for key, pattern in patterns.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            features[key] = ' '.join(matches)
    return features

def process_files():
    """ Process each file in the directory, extract features, and save them in a structured format """
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

                features = extract_features(text)

                # Define the output path
                relative_path = os.path.relpath(root, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                os.makedirs(output_path, exist_ok=True)

                output_file_path = os.path.join(output_path, f'{Path(file).stem}_features.json')
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    json.dump(features, f, indent=4)

                print(f'Features extracted and saved for {file}')

if __name__ == '__main__':
    process_files()
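
For illustration only (not part of the commit): a minimal check of what extract_features returns for a short, hypothetical snippet.

sample = "Total revenue was $394.3 billion, while net income reached $99.8 billion."
feats = extract_features(sample)
# feats contains the keys 'revenue' and 'net_income'; each value holds the matched
# term plus up to 500 following characters of context, joined across all matches.
print(feats['revenue'])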
60 changes: 60 additions & 0 deletions lemmatization.py
@@ -0,0 +1,60 @@
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from pathlib import Path
import re

# Ensure you have the necessary NLTK resources downloaded
nltk.download('punkt')
nltk.download('wordnet')

def process_text(text):
    """Apply lemmatization to the text."""
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

def extract_numeric_contexts(text, window_size=5):
    """Extract context windows around numeric data in the text."""
    tokens = word_tokenize(text)
    numeric_contexts = []
    numeric_regex = r'\d+[\d,]*\.?\d*'

    for i, token in enumerate(tokens):
        if re.match(numeric_regex, token):
            left_context = tokens[max(i - window_size, 0):i]
            right_context = tokens[i + 1:min(i + window_size + 1, len(tokens))]
            context = left_context + [token] + right_context
            numeric_contexts.append(' '.join(context))

    return ' '.join(numeric_contexts)

def process_files(input_dir, output_dir):
    """Process files from input_dir and save processed contexts to output_dir, mirroring the directory structure."""
    input_path = Path(input_dir)
    for path in input_path.rglob('*.txt'):
        relative_path = path.relative_to(input_path)
        output_path = Path(output_dir) / relative_path

        print(f"Processing file: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        # Lemmatize the text, then extract the numeric contexts
        lemmatized_text = process_text(text_content)
        contexts = extract_numeric_contexts(lemmatized_text)

        # Ensure the output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(contexts)
        print(f"Processed contexts saved to {output_path}")

# Define paths
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'
OUTPUT_DIR = 'src/data/processed-numeric-contexts'

if __name__ == '__main__':
    process_files(CLEANED_TEXT_DIR, OUTPUT_DIR)
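
For illustration only (not part of the commit): a hypothetical input showing the context windows that extract_numeric_contexts keeps around each number.

text = "Revenue grew to 1,200 million in 2023 from 1,050 million in 2022."
print(extract_numeric_contexts(text, window_size=3))
# Each numeric token is kept with up to three tokens on either side,
# so the output begins with "Revenue grew to 1,200 million in 2023 ...".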
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ flask
sec_edgar_downloader
matplotlib
plotly
bs4
nltk
59 changes: 59 additions & 0 deletions src/analysis/llm_analysis.py
@@ -0,0 +1,59 @@
# import os
# import requests
# import json
# from pathlib import Path

# # Constants
# API_URL = "https://openrouter.ai/api/v1/chat/completions"
# OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')  # Read the key from the environment rather than hardcoding it

# INPUT_DIR = 'src/analysis/pre-analysis_combined'
# CSV_OUTPUT_DIR = 'src/analysis/csv'
# CODE_OUTPUT_DIR = 'src/analysis/code'
# TEXT_OUTPUT_DIR = 'src/analysis/text'

# def make_api_call(text_content):
#     prompt = f"""
#     Analyze the financial data provided and give a json file containing csv, python and text:
#     1. Generate a CSV file with key financial metrics for plotting.
#     2. Provide Python code for creating plots and animations based on the CSV.
#     3. Provide key financial insights in text format that can be displayed on a web page.
#     """
#     headers = {
#         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
#         "Content-Type": "application/json"
#     }
#     data = json.dumps({
#         "model": "nousresearch/nous-capybara-7b:free",
#         "messages": [
#             {"role": "user", "content": text_content + prompt}
#         ]
#     })
#     response = requests.post(API_URL, headers=headers, data=data)  # 'data' is already a JSON string, so pass it via data= rather than json=
#     return response.json()

# def save_output(data, output_path, file_type):
#     Path(output_path).mkdir(parents=True, exist_ok=True)
#     file_path = Path(output_path) / f"{file_type}.txt"
#     with open(file_path, 'w', encoding='utf-8') as file:
#         file.write(data)
#     print(f"Output saved to {file_path}")

# def process_files():
#     input_path = Path(INPUT_DIR)
#     for text_file in input_path.rglob('*.txt'):
#         print(f"Processing: {text_file}")
#         with open(text_file, 'r', encoding='utf-8') as file:
#             text_content = file.read()

#         response = make_api_call(text_content)
#         if response:
#             # Assuming the API returns a structured JSON with keys for different types of outputs
#             if 'csv' in response:
#                 save_output(response['csv'], CSV_OUTPUT_DIR, text_file.stem)
#             if 'code' in response:
#                 save_output(response['code'], CODE_OUTPUT_DIR, text_file.stem)
#             if 'text' in response:
#                 save_output(response['text'], TEXT_OUTPUT_DIR, text_file.stem)

# if __name__ == '__main__':
#     process_files()
45 changes: 45 additions & 0 deletions src/analysis/output_combine.py
@@ -0,0 +1,45 @@
import os
from pathlib import Path

# Define the directory containing the output text files
output_dir = Path('src/data/output-responses')
combined_dir = Path('src/analysis/pre-analysis_combined')
combined_dir.mkdir(exist_ok=True) # Ensure the combined directory exists

def combine_firm_files():
    # Dictionary to hold combined texts for each firm
    firm_texts = {}

    # Traverse through all text files in the output directory
    for file_path in output_dir.rglob('full-submission_features.txt'):
        # Debugging: Print the path to see what's being captured
        print(f"Current file path: {file_path}")
        print(f"Path parts: {file_path.parts}")

        # Extract the firm name based on the file structure:
        # src/data/output-responses/<FIRM_NAME>/.../full-submission_features.txt
        # Ensure this index corresponds correctly to the firm name part in the path
        if len(file_path.parts) >= 5:  # Check there are enough parts
            firm_name = file_path.parts[3]  # Index where the firm name appears, adjusted if needed
        else:
            continue  # Skip if the path is not deep enough to contain a firm name

        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Append the text to the corresponding firm's entry in the dictionary
        if firm_name in firm_texts:
            firm_texts[firm_name] += "\n" + text
        else:
            firm_texts[firm_name] = text

    # Write combined texts to new files, one per firm
    for firm_name, text in firm_texts.items():
        combined_file_path = combined_dir / f"{firm_name}_combined.txt"
        with open(combined_file_path, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Combined file created for firm {firm_name}: {combined_file_path}")

if __name__ == '__main__':
    combine_firm_files()
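
For illustration only (not part of the commit): how the firm name is read from a path under the layout assumed above; the firm name here is hypothetical.

p = Path('src/data/output-responses/EXAMPLE_FIRM/10-K/full-submission_features.txt')
print(p.parts[3])  # -> 'EXAMPLE_FIRM'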