-
Notifications
You must be signed in to change notification settings - Fork 3
/
sentimentx.py
78 lines (59 loc) · 2.42 KB
/
sentimentx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import argparse
import csv
import os
from typing import Any, List
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from chain import extract_data_chain
# Load variables from a local .env file into the process environment at import
# time, so os.getenv('OPENAI_API_KEY') in SentimentX.__init__ can see the key.
load_dotenv()
def remove_css_and_outer_html(html):
    """Strip ``<style>`` elements from *html* and return the ``<body>`` markup.

    Args:
        html: Markup to clean, in any form ``BeautifulSoup`` accepts.

    Returns:
        The serialized ``<body>`` element (the ``<body>`` tag itself is kept)
        with every ``<style>`` element removed. If the markup contains no
        ``<body>`` element, e.g. an HTML fragment, the whole style-free
        document is returned instead.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Drop inline stylesheets; decompose() removes the tag and its contents.
    for style in soup.find_all('style'):
        style.decompose()

    # html.parser does not synthesize a <body> for fragments, so soup.body can
    # be None; str(None) would hand the caller the literal text "None".
    body = soup.body
    return str(body if body is not None else soup)
class SentimentX:
    """Download news articles and extract structured data from them with an LLM.

    Instance attributes (set in ``__init__``):
        api_key: Value of the ``OPENAI_API_KEY`` environment variable.
        llm: ``ChatOpenAI`` chat model handed to ``extract_data_chain``.
    """

    # Seconds to wait on the remote server. Without a timeout, requests.get()
    # can block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Read the OpenAI API key from the environment and build the chat model.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        self.api_key = os.getenv('OPENAI_API_KEY')
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY environment variable is not set")
        self.llm = ChatOpenAI(temperature=1)

    def extract_signal(self, data: str) -> Any:
        """Run ``extract_data_chain`` on *data* with this instance's LLM.

        Args:
            data: Plain text of an article.

        Returns:
            Whatever ``extract_data_chain`` returns. ``run`` assigns a
            ``'url'`` key on it, so it is presumably a dict -- confirm
            against ``chain.extract_data_chain``.
        """
        return extract_data_chain(self.llm, data)

    def get_article_content(self, url: str) -> str:
        """Fetch *url* and return the visible text of the page.

        Args:
            url: Address of the article to download.

        Returns:
            The page text with each text fragment stripped and joined by a
            single space.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx response (``requests.HTTPError``).
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of feeding an HTTP error page to the LLM.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # An explicit separator keeps text from adjacent tags from being glued
        # together ("TitleAuthor"), which the default empty separator does.
        return soup.get_text(separator=' ', strip=True)

    def run(self, urls: List[str], csv_fname: str = None):
        """Analyse every URL and optionally save the results as CSV.

        Args:
            urls: Article URLs to process, in order.
            csv_fname: Path of a CSV file to write. When falsy (the default)
                no file is written.

        Returns:
            A list with one extracted record per URL, each carrying an
            additional ``'url'`` key holding the source address.
        """
        article_data = []
        for url in urls:
            content = self.get_article_content(url)
            structured_data = self.extract_signal(content)
            structured_data['url'] = url
            article_data.append(structured_data)

        if csv_fname:
            self._write_csv(article_data, csv_fname)

        return article_data

    @staticmethod
    def _write_csv(rows, csv_fname):
        """Write *rows* (a list of dicts) to *csv_fname* with a header line."""
        # Ordered union of the keys of all rows: the LLM output may not have
        # the same keys for every article, and DictWriter raises ValueError on
        # a key that is missing from fieldnames. Keys absent from a row are
        # written as empty cells (DictWriter's default restval).
        fieldnames = list(dict.fromkeys(key for row in rows for key in row))

        # Explicit UTF-8 so article text with non-ASCII characters does not
        # depend on the platform's default encoding.
        with open(csv_fname, 'w', newline='', encoding='utf-8') as f:
            if not fieldnames:
                # Nothing to write (e.g. no URLs): leave an empty file rather
                # than failing on article_data[0].
                return
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
def main() -> None:
    """Command-line entry point: analyse the given URLs and report the results.

    With ``--csv PATH`` the records are saved to that file; otherwise they are
    printed to stdout.
    """
    parser = argparse.ArgumentParser(description='SentimentX - News Article Sentiment Analysis.')
    parser.add_argument('--url', nargs='+', required=True, help='URL(s) to the news article.')
    parser.add_argument('--csv', type=str, required=False, help='Saves the json to a csv file instead of printing to stdout.')
    args = parser.parse_args()

    sentimentx = SentimentX()
    data = sentimentx.run(args.url, args.csv)

    # The --csv help promises the file replaces stdout output, so only print
    # when no CSV file was requested.
    if not args.csv:
        print(data)


if __name__ == "__main__":
    main()