Commit e6a66f5 (1 parent: 9b299f0)
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 205 changed files with 5,966 additions and 88 deletions.
@@ -0,0 +1,63 @@
import os
import requests
from pathlib import Path
import time
import json

# Constants
API_URL = "https://openrouter.ai/api/v1/chat/completions"
# Read the key from the environment rather than hardcoding a secret in source control
# (the original committed a literal key here, which should be revoked).
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')
CLEANED_TEXT_DIR = 'src/data/feature'
OUTPUT_DIR = 'src/data/output-responses'
REQUEST_INTERVAL = 10  # seconds to wait between requests to manage API rate limit

def make_request(text):
    prompt = """Generate a detailed financial analysis for the provided data. Please structure your response in a clear and organized manner..."""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    data = json.dumps({
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [{"role": "user", "content": text + prompt}],
        "max_tokens": 4000  # Control the maximum output length
    })
    response = requests.post(API_URL, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        return None

def save_response(response, output_path):
    os.makedirs(output_path.parent, exist_ok=True)  # Ensure the output directory exists
    with open(output_path, 'w', encoding='utf-8') as f:
        if 'choices' in response and len(response['choices']) > 0:
            text = response['choices'][0].get('message', {}).get('content', '')
            f.write(text)
    print(f"Response saved to {output_path}")

def process_json_files():
    input_path = Path(CLEANED_TEXT_DIR)
    output_path_root = Path(OUTPUT_DIR)

    for path in input_path.rglob('*.json'):
        relative_path = path.relative_to(input_path)
        output_path = output_path_root / relative_path
        output_path = output_path.with_suffix('.txt')  # Ensure the file extension is .txt for plain text

        # Check if the response file already exists
        if not output_path.exists():
            print(f"Processing file: {path}")
            with open(path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            text_content = ' '.join([str(value) for key, value in data.items() if isinstance(value, str)])
            response = make_request(text_content)
            if response:
                save_response(response, output_path)
            time.sleep(REQUEST_INTERVAL)  # Respect the API rate limit
        else:
            print(f"Skipping {path} as output already exists.")

if __name__ == '__main__':
    process_json_files()
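For reference, a minimal sketch of the response shape this script assumes: OpenRouter serves an OpenAI-style chat-completions payload, and save_response only reads the first choice's message content. The example below is illustrative, not an actual API response.

# Hypothetical successful response, trimmed to the fields save_response actually reads.
example_response = {
    "choices": [
        {"message": {"role": "assistant", "content": "Revenue grew 8% year over year..."}}
    ]
}
# save_response(example_response, Path('demo.txt')) would write the content string to demo.txt.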
This file was deleted.
@@ -0,0 +1,91 @@
import os
import re
from pathlib import Path
import json

# Define the input and output directories
input_dir = 'src/data/processed-numeric-contexts'
output_dir = 'src/data/feature'

def extract_features(text):
    """Extracts financial terms and their contextual data from the text."""
    features = {}
    # Define patterns for each feature of interest
    patterns = {
        'revenue': r'\brevenue\b[\s\S]{0,500}',
        'expenses': r'\bexpenses\b[\s\S]{0,500}',
        'net_income': r'\bnet income\b[\s\S]{0,500}',
        'assets': r'\bassets\b[\s\S]{0,500}',
        'liabilities': r'\bliabilities\b[\s\S]{0,500}',
        'equity': r'\bequity\b[\s\S]{0,500}',
        'cash_flow': r'\bcash flow\b[\s\S]{0,500}',
        'operating_margin': r'\boperating margin\b[\s\S]{0,500}',
        'gross_margin': r'\bgross margin\b[\s\S]{0,500}',
        'ebitda': r'\bebitda\b[\s\S]{0,500}',
        'accumulated_depreciation': r'\baccumulated depreciation\b[\s\S]{0,500}',
        'capital_expenditure': r'\bcapital expenditure\b[\s\S]{0,500}',
        'debt': r'\bdebt\b[\s\S]{0,500}',
        'share_repurchase': r'\bshare repurchase\b[\s\S]{0,500}',
        'dividend_payout': r'\bdividend payout\b[\s\S]{0,500}',
        'financial_ratios': r'\b(debt-to-equity ratio|return on equity)\b[\s\S]{0,500}',
        'earnings_per_share': r'\bearnings per share\b[\s\S]{0,500}',
        'tax_rate': r'\btax rate\b[\s\S]{0,500}',
        'segment_revenue': r'\bsegment revenue\b[\s\S]{0,500}',
        'geographic_information': r'\bgeographic\b[\s\S]{0,500}',
        'investment_gains_losses': r'\b(investment gains|investment losses)\b[\s\S]{0,500}',
        'regulatory_changes': r'\bregulatory changes\b[\s\S]{0,500}',
        'legal_issues': r'\blegal issues\b[\s\S]{0,500}',
        'accrued_liabilities': r'\baccrued liabilities\b[\s\S]{0,500}',
        'common_stock': r'\bcommon stock\b[\s\S]{0,500}',
        'capital_stock': r'\bcapital stock\b[\s\S]{0,500}',
        'subsequent_events': r'\bsubsequent events\b[\s\S]{0,500}',
        'noncurrent_assets': r'\bnoncurrent assets\b[\s\S]{0,500}',
        'fair_value_measurements': r'\bfair value measurements\b[\s\S]{0,500}',
        'level_1_assets': r'\blevel 1 assets\b[\s\S]{0,500}',
        'level_2_assets': r'\blevel 2 assets\b[\s\S]{0,500}',
        'level_3_assets': r'\blevel 3 assets\b[\s\S]{0,500}',
        'debt_securities': r'\bdebt securities\b[\s\S]{0,500}',
        'bank_deposits': r'\bbank deposits\b[\s\S]{0,500}',
        'corporate_debt': r'\bcorporate debt\b[\s\S]{0,500}',
        'government_bonds': r'\bgovernment bonds\b[\s\S]{0,500}',
        'mortgage_backed_securities': r'\bmortgage-backed securities\b[\s\S]{0,500}',
        'asset_backed_securities': r'\basset-backed securities\b[\s\S]{0,500}',
        'hedging_activities': r'\bhedging activities\b[\s\S]{0,500}',
        'foreign_exchange_contracts': r'\bforeign exchange contracts\b[\s\S]{0,500}',
        'designated_hedging': r'\bdesignated hedging\b[\s\S]{0,500}',
        'nondesignated_hedging': r'\bnondesignated hedging\b[\s\S]{0,500}',
        'cash_flow_hedges': r'\bcash flow hedges\b[\s\S]{0,500}',
        'derivative_instruments': r'\bderivative instruments\b[\s\S]{0,500}',
        'geographic_concentration_risk': r'\bgeographic concentration risk\b[\s\S]{0,500}'
    }

    for key, pattern in patterns.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            features[key] = ' '.join(matches)
    return features

def process_files():
    """Process each file in the directory, extract features, and save them in a structured format."""
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

                features = extract_features(text)

                # Define the output path, mirroring the input directory structure
                relative_path = os.path.relpath(root, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                os.makedirs(output_path, exist_ok=True)

                output_file_path = os.path.join(output_path, f'{Path(file).stem}_features.json')
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    json.dump(features, f, indent=4)

                print(f'Features extracted and saved for {file}')

if __name__ == '__main__':
    process_files()
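A quick illustration of what extract_features returns; the input sentence is made up for demonstration:

sample = "Total revenue increased to $394.3 billion, while net income was $99.8 billion."
features = extract_features(sample)
print(sorted(features))        # ['net_income', 'revenue']
print(features['net_income'])  # net income was $99.8 billion.

Because each pattern greedily captures up to 500 characters after its keyword, nearby terms land in each other's captured context; that suits the goal here, which is context windows rather than exact values.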
@@ -0,0 +1,60 @@
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from pathlib import Path
import re

# Ensure the necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('wordnet')

def process_text(text):
    """Apply lemmatization to the text."""
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

def extract_numeric_contexts(text, window_size=5):
    """Extract context windows around numeric data in the text."""
    tokens = word_tokenize(text)
    numeric_contexts = []
    numeric_regex = r'\d+[\d,]*\.?\d*'

    for i, token in enumerate(tokens):
        if re.match(numeric_regex, token):
            left_context = tokens[max(i - window_size, 0):i]
            right_context = tokens[i + 1:min(i + window_size + 1, len(tokens))]
            context = left_context + [token] + right_context
            numeric_contexts.append(' '.join(context))

    return ' '.join(numeric_contexts)

def process_files(input_dir, output_dir):
    """Process files from input_dir and save processed contexts to output_dir, mirroring the directory structure."""
    input_path = Path(input_dir)
    for path in input_path.rglob('*.txt'):
        relative_path = path.relative_to(input_path)
        output_path = Path(output_dir) / relative_path

        print(f"Processing file: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        # Lemmatize the text, then extract the numeric context windows
        lemmatized_text = process_text(text_content)
        contexts = extract_numeric_contexts(lemmatized_text)

        # Ensure the output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(contexts)
        print(f"Processed contexts saved to {output_path}")

# Define paths
CLEANED_TEXT_DIR = 'src/data/cleaned-sec-edgar-filings'
OUTPUT_DIR = 'src/data/processed-numeric-contexts'

if __name__ == '__main__':
    process_files(CLEANED_TEXT_DIR, OUTPUT_DIR)
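To make the windowing concrete, here is what extract_numeric_contexts produces for a short made-up sentence (assuming the NLTK punkt tokenizer downloaded above is available); note that nearby numbers yield overlapping windows:

text = "net income rose to 99803 million in fiscal 2022"
print(extract_numeric_contexts(text, window_size=3))
# income rose to 99803 million in fiscal million in fiscal 2022
# (one 3-token window around '99803', one around '2022', joined with a space)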
@@ -3,4 +3,5 @@ flask
sec_edgar_downloader
matplotlib
plotly
bs4
nltk
@@ -0,0 +1,59 @@
# import os
# import requests
# import json
# from pathlib import Path

# # Constants
# API_URL = "https://openrouter.ai/api/v1/chat/completions"
# # NOTE: the original hardcoded an API key here; read it from the environment instead.
# OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')

# INPUT_DIR = 'src/analysis/pre-analysis_combined'
# CSV_OUTPUT_DIR = 'src/analysis/csv'
# CODE_OUTPUT_DIR = 'src/analysis/code'
# TEXT_OUTPUT_DIR = 'src/analysis/text'

# def make_api_call(text_content):
#     prompt = f"""
#     Analyze the financial data provided and give a json file containing csv, python and text:
#     1. Generate a CSV file with key financial metrics for plotting.
#     2. Provide Python code for creating plots and animations based on the CSV.
#     3. Provide key financial insights in text format that can be displayed on a web page.
#     """
#     headers = {
#         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
#         "Content-Type": "application/json"
#     }
#     data = json.dumps({
#         "model": "nousresearch/nous-capybara-7b:free",
#         "messages": [
#             {"role": "user", "content": text_content + prompt}
#         ]
#     })
#     # 'data' is already a JSON string, so send it via data=, not json= (json= would double-encode it)
#     response = requests.post(API_URL, headers=headers, data=data)
#     return response.json()

# def save_output(data, output_path, file_type):
#     Path(output_path).mkdir(parents=True, exist_ok=True)
#     file_path = Path(output_path) / f"{file_type}.txt"
#     with open(file_path, 'w', encoding='utf-8') as file:
#         file.write(data)
#     print(f"Output saved to {file_path}")

# def process_files():
#     input_path = Path(INPUT_DIR)
#     for text_file in input_path.rglob('*.txt'):
#         print(f"Processing: {text_file}")
#         with open(text_file, 'r', encoding='utf-8') as file:
#             text_content = file.read()

#         response = make_api_call(text_content)
#         if response:
#             # Assuming the API returns a structured JSON with keys for different types of outputs
#             if 'csv' in response:
#                 save_output(response['csv'], CSV_OUTPUT_DIR, text_file.stem)
#             if 'code' in response:
#                 save_output(response['code'], CODE_OUTPUT_DIR, text_file.stem)
#             if 'text' in response:
#                 save_output(response['text'], TEXT_OUTPUT_DIR, text_file.stem)

# if __name__ == '__main__':
#     process_files()
@@ -0,0 +1,45 @@
from pathlib import Path

# Define the directory containing the output text files
output_dir = Path('src/data/output-responses')
combined_dir = Path('src/analysis/pre-analysis_combined')
combined_dir.mkdir(parents=True, exist_ok=True)  # Ensure the combined directory (and its parents) exists

def combine_firm_files():
    # Dictionary to hold combined texts for each firm
    firm_texts = {}

    # Traverse through all text files in the output directory
    for file_path in output_dir.rglob('full-submission_features.txt'):
        # Debugging: print the path to see what's being captured
        print(f"Current file path: {file_path}")
        print(f"Path parts: {file_path.parts}")

        # Extract the firm name based on the file structure:
        # src/data/output-responses/<FIRM_NAME>/.../full-submission_features.txt
        # parts[3] is the firm name for that layout; adjust the index if the layout changes
        if len(file_path.parts) >= 5:  # Check there are enough parts
            firm_name = file_path.parts[3]
        else:
            continue  # Skip if the path is not deep enough to contain a firm name

        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Append the text to the corresponding firm's entry in the dictionary
        if firm_name in firm_texts:
            firm_texts[firm_name] += "\n" + text
        else:
            firm_texts[firm_name] = text

    # Write combined texts to new files, one per firm
    for firm_name, text in firm_texts.items():
        combined_file_path = combined_dir / f"{firm_name}_combined.txt"
        with open(combined_file_path, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Combined file created for firm {firm_name}: {combined_file_path}")

if __name__ == '__main__':
    combine_firm_files()
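As a sanity check on the parts[3] index, a small pathlib illustration with a hypothetical firm directory named ACME:

from pathlib import Path
p = Path('src/data/output-responses/ACME/10-K/full-submission_features.txt')
print(p.parts)     # ('src', 'data', 'output-responses', 'ACME', '10-K', 'full-submission_features.txt')
print(p.parts[3])  # ACME, the firm-name component the script relies on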