Move WP blogs to RSS (#89)
mruwnik authored Jul 12, 2023
1 parent 99f7f27 commit 4d99a1c
Showing 4 changed files with 48 additions and 182 deletions.
12 changes: 2 additions & 10 deletions align_data/blogs/__init__.py
@@ -10,17 +10,9 @@
WordpressBlog(name="aisafety.camp", url="https://aisafety.camp"),
WordpressBlog(name="miri", url="https://intelligence.org"),
WordpressBlog(name="jsteinhardt_blog", url="https://jsteinhardt.wordpress.com"),
WordpressBlog(
name="qualiacomputing",
url="https://qualiacomputing.com",
strip=["^by [^\n].*\n"]
),
WordpressBlog(name="qualiacomputing", url="https://qualiacomputing.com"),
WordpressBlog(name="vkrakovna_blog", url="https://vkrakovna.wordpress.com"),
WordpressBlog(
name="yudkowsky_blog",
url="https://yudkowsky.net",
strip=["^\s*Download as PDF\n"]
),
WordpressBlog(name="yudkowsky_blog", url="https://yudkowsky.net"),
MediumBlog(name="deepmind_blog", url="https://deepmindsafetyresearch.medium.com/", authors=["DeepMind Safety Research"]),
GwernBlog(name="gwern_blog", url='https://www.gwern.net/', authors=["Gwern Branwen"]),
ColdTakes(
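
For context, a minimal sketch of how one of these registry entries is driven after this change, using only the interface visible elsewhere in this diff (items_list yields entry links; process_entry, exercised in the tests below, turns a link into a data entry); any detail beyond that is an assumption:

blog = WordpressBlog(name="yudkowsky_blog", url="https://yudkowsky.net")
for link in blog.items_list:          # pages through https://yudkowsky.net/feed?paged=N
    entry = blog.process_entry(link)  # builds a dict like the one in the tests below
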
84 changes: 19 additions & 65 deletions align_data/blogs/wp_blog.py
@@ -1,95 +1,49 @@
from datetime import datetime, timezone
from calendar import c
from dataclasses import dataclass, field
from dataclasses import dataclass
import logging
import feedparser
from tqdm import tqdm

from markdownify import markdownify
from align_data.common import utils
from align_data.common.alignment_dataset import AlignmentDataset
from align_data.common.html_dataset import RSSDataset

from typing import List

logger = logging.getLogger(__name__)


@dataclass
class WordpressBlog(AlignmentDataset):
url: str
strip: List = field(default_factory=lambda: [])
class WordpressBlog(RSSDataset):
summary_key = 'summary'
done_key = 'paged_url'

def setup(self):
"""
url: URL of the blog
strip: list of regexes to strip from the HTML
"""
super().setup()
self.feed_url = self.url + "/feed"
self.name = utils.url_to_filename(self.url)
@property
def feed_url(self):
return self.url + "/feed"

def get_item_key(self, item):
return item

@property
def items_list(self):
logger.info(f"Fetching entries from {self.feed_url}")

pages = []
page_number = 0
last_title = None
self.items = {}
page_number = 1
prev_title = None

with tqdm(desc=f"Loading {self.name} pages") as pbar:
while True:
paged_url = f"{self.feed_url}?paged={page_number + 1}"
paged_url = f"{self.feed_url}?paged={page_number}"
logging.info(f"Fetching {paged_url}")

feed = feedparser.parse(paged_url)
if (("feed" not in feed) or ("title" not in feed["feed"]) or (feed["feed"]["title"] == last_title)):
title = feed.get('feed', {}).get('title')
if not title or title == prev_title:
break
last_title = feed["feed"]["title"]

pages.extend({**entry, 'paged_url': paged_url} for entry in feed['entries'])
prev_title = feed["feed"]["title"]
page_number += 1

for item in feed['entries']:
self.items[item['link']] = item

# update the tqdm progress bar
pbar.set_postfix_str(f"page {page_number}", refresh=True) # Set postfix to "page X"
pbar.update() # Here we increment the progress bar by 1

logger.info(f'Got {len(pages)} pages')

return pages

def get_item_key(self, item):
"""Get the identifier of the given `item` so it can be checked to see whether it's been output.
The default assumption is that the `item` is a Path to a file.
"""
return item['title']

def _get_published_date(self, item):
date_published = item.get('published')
if not date_published:
return ''
date_published = datetime.strptime(date_published, '%a, %d %b %Y %H:%M:%S %z')
return self._format_datetime(date_published)

def fetch_entries(self):
for entry in self.unprocessed_items():
content_text = markdownify(entry["content"][0]["value"]).strip()
text = entry["title"] + "\n\n" + content_text

new_entry = self.make_data_entry({
"text": text,
"url": entry['link'],
"title": text.split("\n")[0],
"source": self.name,
"source_type": "blog",
"date_published": self._get_published_date(entry),
"paged_url": entry['paged_url'],
"authors": [e['name'] for e in entry.get('authors', [])],
})
new_entry.add_id()

yield new_entry
logger.info(f'Got {len(self.items)} pages')
return list(self.items.keys())
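
The loop above is the heart of the change: instead of accumulating full page dicts, items_list now walks the WordPress paged feed (?paged=N), indexes each entry by its link in self.items, and stops when the feed title disappears or repeats. A self-contained sketch of the same technique, using only feedparser (the wrapper function and its name are mine, not part of the codebase):

import feedparser

def fetch_wordpress_entries(feed_url):
    # Walk a WordPress paged feed until the feed title is missing or
    # identical to the previous page's, mirroring items_list above.
    entries = {}
    prev_title = None
    page_number = 1
    while True:
        feed = feedparser.parse(f"{feed_url}?paged={page_number}")
        title = feed.get("feed", {}).get("title")
        if not title or title == prev_title:
            break
        prev_title = title
        for entry in feed["entries"]:
            entries[entry["link"]] = entry  # keyed by link, like self.items
        page_number += 1
    return entries
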
79 changes: 0 additions & 79 deletions align_data/common/utils.py
@@ -1,79 +0,0 @@
import bs4
import jsonlines
import os
import re
import time
import html2text
from urllib.parse import urlparse
import os
from functools import reduce
import operator
import unicodedata
import re


def random_delay():
import random
time.sleep(random.randint(1, 10))


def url_to_filename(url):
"""
Convert a URL to a suitable filename.
"""
url = urlparse(url)
path = url.path.lstrip(os.sep).rstrip(os.sep).split(os.sep)
return "-".join([url.netloc] + list(filter(None, path)))


class ExitCodeError(Exception):
pass


def sh(x):
if os.system(x):
raise ExitCodeError()


def ls(x):
return [x + "/" + fn for fn in os.listdir(x)]


def lsr(x):
if os.path.isdir(x):
return reduce(operator.add, map(lsr, ls(x)), [])
else:
return [x]


def fwrite(fname, content):
with open(fname, "w") as fh:
fh.write(content)


def fread(fname):
with open(fname) as fh:
return fh.read()


def chdir_up_n(n):
"""Goes up n times in the directory tree."""
for i in range(n):
os.chdir("..")


def slugify(value, allow_unicode=False):
"""
Taken from https://github.com/django/django/blob/master/django/utils/text.py
Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
dashes to single dashes. Remove characters that aren't alphanumerics,
underscores, or hyphens. Convert to lowercase. Also strip leading and
trailing whitespace, dashes, and underscores.
"""
value = str(value)
if allow_unicode:
value = unicodedata.normalize('NFKC', value)
else:
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
value = re.sub(r'[^\w\s-]', '', value.lower())
return re.sub(r'[-\s]+', '-', value).strip('-_')
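
Of the deleted helpers, url_to_filename is the one this commit actually touches: it used to overwrite a blog's name during setup, which is why the old test below expected blog.name == "www.bla.yudkowsky.net"; after this commit the explicit name argument is kept. For reference, its behavior on two inputs (the second URL is hypothetical, and both outputs assume a POSIX os.sep, as the implementation above does):

url_to_filename("https://www.bla.yudkowsky.net")   # -> "www.bla.yudkowsky.net"
url_to_filename("https://example.com/blog/posts")  # -> "example.com-blog-posts"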
55 changes: 27 additions & 28 deletions tests/align_data/test_blogs.py
@@ -2,6 +2,7 @@

import pytest
from bs4 import BeautifulSoup
from requests import request

from align_data.blogs import CaradoMoe, ColdTakes, GenerativeInk, GwernBlog, MediumBlog, SubstackBlog, WordpressBlog

@@ -441,33 +442,28 @@ def test_substack_blog_process_entry():

def test_wordpress_blog_setup():
blog = WordpressBlog(
name='blog',
name='blog_name',
url="https://www.bla.yudkowsky.net",
)
blog.setup()
assert blog.feed_url == 'https://www.bla.yudkowsky.net/feed'
assert blog.name == "www.bla.yudkowsky.net"
assert blog.name == "blog_name"


@patch('feedparser.parse', return_value=WORDPRESS_FEED)
def test_wordpress_blog_items_list(feedparser_parse):
blog = WordpressBlog(
name='blog',
url="https://www.bla.yudkowsky.net",
)
blog.setup()
items = blog.items_list
assert len(items) == 1
assert items[0]['title'] == 'Prospiracy Theory'

blog = WordpressBlog(name='blog', url="https://www.bla.yudkowsky.net")
assert blog.items_list == ['https://www.yudkowsky.net/other/fiction/prospiracy-theory']


def test_wordpress_blog_get_item_key():
blog = WordpressBlog(
name='blog',
url="https://www.bla.yudkowsky.net",
)
item_key = blog.get_item_key({'title': 'Test Entry'})
assert item_key == 'Test Entry'

item = {'title': 'Test Entry'}
assert item == blog.get_item_key(item)


def test_wordpress_blog_get_published_date():
blog = WordpressBlog(
name='blog',
@@ -476,20 +472,23 @@ def test_wordpress_blog_get_published_date():
date_published = blog._get_published_date({'published': "Mon, 26 Jun 2023 13:40:01 +0000"})
assert date_published == '2023-06-26T13:40:01Z'


@patch('feedparser.parse', return_value=WORDPRESS_FEED)
def test_wordpress_blog_fetch_entries(feedparser_parse):
def test_wordpress_blog_process_entry(feedparser_parse):
blog = WordpressBlog(
name='blog',
name='blog_name',
url="https://www.bla.yudkowsky.net",
)
blog.setup()
entries = list(blog.fetch_entries())
assert len(entries) == 1
entry = entries[0].to_dict()
assert entry['url'] == 'https://www.yudkowsky.net/other/fiction/prospiracy-theory'
assert entry['title'] == 'Prospiracy Theory'
assert entry['source'] == 'www.bla.yudkowsky.net'
assert entry['source_type'] == 'blog'
assert entry['date_published'] == '2020-09-04T04:11:23Z'
assert entry['authors'] == ['Eliezer S. Yudkowsky']
assert entry['text'] == 'Prospiracy Theory\n\nbla bla bla [a link](http://ble.com) bla bla'
blog.items = {i['link']: i for i in WORDPRESS_FEED['entries']}
entry = blog.process_entry('https://www.yudkowsky.net/other/fiction/prospiracy-theory')
assert entry == {
'authors': ['Eliezer S. Yudkowsky'],
'date_published': '2020-09-04T04:11:23Z',
'id': None,
'source': 'blog_name',
'source_type': 'blog',
'summary': [],
'text': 'bla bla bla [a link](http://ble.com) bla bla',
'title': 'Prospiracy Theory',
'url': 'https://www.yudkowsky.net/other/fiction/prospiracy-theory',
}
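
The WORDPRESS_FEED fixture itself does not appear in this diff, but the assertions above pin down its shape. A plausible minimal reconstruction, inferred from what items_list and process_entry read (a guess at the fixture, not a copy of it):

WORDPRESS_FEED = {
    'feed': {'title': 'Bla blog'},  # any non-empty title; pagination stops when it repeats
    'entries': [{
        'link': 'https://www.yudkowsky.net/other/fiction/prospiracy-theory',
        'title': 'Prospiracy Theory',
        'published': 'Fri, 04 Sep 2020 04:11:23 +0000',
        'authors': [{'name': 'Eliezer S. Yudkowsky'}],
        'content': [{'value': 'bla bla bla <a href="http://ble.com">a link</a> bla bla'}],
    }],
}

Because feedparser.parse is patched to return this same object for every page, the loop reads page 1, sees an identical title on page 2, and stops, which is why items_list yields exactly one link.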
