-
Notifications
You must be signed in to change notification settings - Fork 1
/
MoneyStuff.py
134 lines (110 loc) · 4.29 KB
/
MoneyStuff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
import requests
import datetime
import re
from bs4 import BeautifulSoup as BS
import jonlog
from ChatPodcastGPT import *
logger = jonlog.getLogger()
# -
def clean_text(txt):
txt = txt.replace('\r', '\n').replace('\xa0', ' ')
txt = re.sub('\[\d\]', '', txt)
return txt
class MoneyStuff:
BASE_URL = 'https://newsletterhunt.com/'
LIST_URL = BASE_URL + 'newsletters/money-stuff-by-matt-levine'
def list(self):
html_content = requests.get(self.LIST_URL).content
soup = BS(html_content, 'html.parser')
now = datetime.datetime.now()
# Extracting articles with their titles, publish dates, and links
articles = []
for article in soup.find_all('article'):
title = article.find('h2')
date_text = article.find('time').get_text(strip=True) if article.find('time') else None
link = article.find('a', href=True)
if title and date_text and link and ('hours ago' in date_text or 'hour ago' in date_text):
articles.append({
'title': title.get_text(strip=True),
'link': link['href']
})
return articles
def get(self, article):
html_content = requests.get(self.BASE_URL + article['link']).content
soup = BS(html_content, 'html.parser')
soup = BS(str(soup.find('iframe')['srcdoc']), 'html.parser')
all_text = soup.find(class_='body-component__content').get_text()
# Replace carriage returns and non-breaking spaces with normal spaces
text = clean_text(all_text)
lines = [x for x in text.split('\n') if x]
return lines
# +
# class ZviBlog:
# BASE_URL = "https://thezvi.wordpress.com/"
# LIST_URL = BASE_URL
# def list(self):
# html_content = requests.get(self.LIST_URL).content
# soup = BS(html_content, 'html.parser')
# articles = soup.find_all('div', class_='post')
# # Current time and 24 hours ago
# now = datetime.datetime.now()
# one_day_ago = now - datetime.timedelta(days=2)
# articles_list = []
# for article in articles:
# # Extracting the title and URL - these selectors might need adjustment
# title = clean_text(article.find(class_='entry-title').get_text(strip=True))
# article_url = article.find('a', href=True)['href']
# # Extracting and parsing the publish datetime - this format might need adjustment
# article_date = article_url.split(self.BASE_URL)[1][:10]
# article_time = article.find(title=True)['title']
# datetime_str = f'{article_date} {article_time}'
# article_datetime = datetime.datetime.strptime(datetime_str, '%Y/%m/%d %I:%M %p')
# if article_datetime >= one_day_ago:
# articles_list.append({
# 'datetime': article_datetime, 'title': title, 'link': article_url
# })
# return articles_list
# def get(self, article):
# html_content = requests.get(article['link']).content
# soup = BS(html_content, 'html.parser')
# all_text = soup.find('div', class_='entry-content').get_text()
# # Replace carriage returns and non-breaking spaces with normal spaces
# text = clean_text(all_text)
# lines = [x for x in text.split('\n') if x]
# return lines
# -
def run(ndays=1):
for src, prefix in [(MoneyStuff(), 'MoneyStuff')]:
print(prefix)
# if prefix == 'MoneyStuff': continue
articles = src.list()
print(articles)
now = datetime.datetime.now()
for article in articles:
logger.info(f"Writing {article}")
lines = src.get(article)
ep = Episode(
topic=article['title'],
episode_type='pure_tts',
)
ep.step(msg=lines)
ep.upload(f"[{prefix}] " + article['title'][:200], f'{prefix} tts: {article["title"]}')
# +
# run(1)
# +
# # %debug
# -