-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 4 changed files with 48 additions and 182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,95 +1,49 @@ | ||
from datetime import datetime, timezone | ||
from calendar import c | ||
from dataclasses import dataclass, field | ||
from dataclasses import dataclass | ||
import logging | ||
import feedparser | ||
from tqdm import tqdm | ||
|
||
from markdownify import markdownify | ||
from align_data.common import utils | ||
from align_data.common.alignment_dataset import AlignmentDataset | ||
from align_data.common.html_dataset import RSSDataset | ||
|
||
from typing import List | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@dataclass | ||
class WordpressBlog(AlignmentDataset): | ||
url: str | ||
strip: List = field(default_factory=lambda: []) | ||
class WordpressBlog(RSSDataset): | ||
summary_key = 'summary' | ||
done_key = 'paged_url' | ||
|
||
def setup(self): | ||
""" | ||
url: URL of the blog | ||
strip: list of regexes to strip from the HTML | ||
""" | ||
super().setup() | ||
self.feed_url = self.url + "/feed" | ||
self.name = utils.url_to_filename(self.url) | ||
@property | ||
def feed_url(self): | ||
return self.url + "/feed" | ||
|
||
def get_item_key(self, item): | ||
return item | ||
|
||
@property | ||
def items_list(self): | ||
logger.info(f"Fetching entries from {self.feed_url}") | ||
|
||
pages = [] | ||
page_number = 0 | ||
last_title = None | ||
self.items = {} | ||
page_number = 1 | ||
prev_title = None | ||
|
||
with tqdm(desc=f"Loading {self.name} pages") as pbar: | ||
while True: | ||
paged_url = f"{self.feed_url}?paged={page_number + 1}" | ||
paged_url = f"{self.feed_url}?paged={page_number}" | ||
logging.info(f"Fetching {paged_url}") | ||
|
||
feed = feedparser.parse(paged_url) | ||
if (("feed" not in feed) or ("title" not in feed["feed"]) or (feed["feed"]["title"] == last_title)): | ||
title = feed.get('feed', {}).get('title') | ||
if not title or title == prev_title: | ||
break | ||
last_title = feed["feed"]["title"] | ||
|
||
pages.extend({**entry, 'paged_url': paged_url} for entry in feed['entries']) | ||
prev_title = feed["feed"]["title"] | ||
page_number += 1 | ||
|
||
for item in feed['entries']: | ||
self.items[item['link']] = item | ||
|
||
# update the tqdm progress bar | ||
pbar.set_postfix_str(f"page {page_number}", refresh=True) # Set postfix to "page X" | ||
pbar.update() # Here we increment the progress bar by 1 | ||
|
||
logger.info(f'Got {len(pages)} pages') | ||
|
||
return pages | ||
|
||
def get_item_key(self, item): | ||
"""Get the identifier of the given `item` so it can be checked to see whether it's been output. | ||
The default assumption is that the `item` is a Path to a file. | ||
""" | ||
return item['title'] | ||
|
||
def _get_published_date(self, item): | ||
date_published = item.get('published') | ||
if not date_published: | ||
return '' | ||
date_published = datetime.strptime(date_published, '%a, %d %b %Y %H:%M:%S %z') | ||
return self._format_datetime(date_published) | ||
|
||
def fetch_entries(self): | ||
for entry in self.unprocessed_items(): | ||
content_text = markdownify(entry["content"][0]["value"]).strip() | ||
text = entry["title"] + "\n\n" + content_text | ||
|
||
new_entry = self.make_data_entry({ | ||
"text": text, | ||
"url": entry['link'], | ||
"title": text.split("\n")[0], | ||
"source": self.name, | ||
"source_type": "blog", | ||
"date_published": self._get_published_date(entry), | ||
"paged_url": entry['paged_url'], | ||
"authors": [e['name'] for e in entry.get('authors', [])], | ||
}) | ||
new_entry.add_id() | ||
|
||
yield new_entry | ||
logger.info(f'Got {len(self.items)} pages') | ||
return list(self.items.keys()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,79 +0,0 @@ | ||
import bs4 | ||
import jsonlines | ||
import os | ||
import re | ||
import time | ||
import html2text | ||
from urllib.parse import urlparse | ||
import os | ||
from functools import reduce | ||
import operator | ||
import unicodedata | ||
import re | ||
|
||
|
||
def random_delay(min_seconds=1, max_seconds=10):
    """Block the current thread for a random whole number of seconds.

    Args:
        min_seconds: Smallest delay that may be chosen (inclusive).
            Defaults to 1, the value that used to be hard-coded.
        max_seconds: Largest delay that may be chosen (inclusive).
            Defaults to 10, the value that used to be hard-coded.

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``
            (raised by ``random.randint``).
    """
    # Kept as a function-local import, as in the original implementation.
    import random

    time.sleep(random.randint(min_seconds, max_seconds))
|
||
|
||
def url_to_filename(url):
    """Convert a URL to a suitable filename.

    The result is the network location followed by every non-empty path
    segment, all joined with ``-``.  Scheme, query and fragment are ignored.

    Args:
        url: The URL string to convert.

    Returns:
        A string such as ``"example.com-a-b"`` for
        ``"https://example.com/a/b/"``.
    """
    parsed = urlparse(url)
    # URL paths are always "/"-separated, regardless of the host platform,
    # so split on "/" rather than on os.sep (which is "\\" on Windows).
    segments = [part for part in parsed.path.split("/") if part]
    return "-".join([parsed.netloc, *segments])
|
||
|
||
class ExitCodeError(Exception):
    """Signals that a shell command finished with a non-zero status.

    Raised by ``sh`` when ``os.system`` returns a truthy value.
    """
|
||
|
||
def sh(x):
    """Run a command string through the system shell.

    Args:
        x: The command line, handed verbatim to ``os.system``.  It is
            interpreted by the shell, so it must not be built from
            untrusted input.

    Raises:
        ExitCodeError: If ``os.system`` returns a non-zero value.  The
            message includes the command and the returned status so the
            failure can be diagnosed from the traceback alone.
    """
    status = os.system(x)
    if status:
        # The value is whatever os.system returned; its encoding is
        # platform-specific (see the os.system documentation).
        raise ExitCodeError(f"command {x!r} failed with status {status}")
|
||
|
||
def ls(x):
    """List the entries of directory ``x`` as ``"<x>/<entry>"`` strings.

    Entries come back in the order ``os.listdir`` reports them; the
    separator is always a literal ``/``.
    """
    names = os.listdir(x)
    return ["/".join((x, name)) for name in names]
|
||
|
||
def lsr(x):
    """Recursively list every non-directory path found under ``x``.

    Args:
        x: A path string.  If it is not a directory (this includes paths
            that do not exist), it is returned as the single result.

    Returns:
        A flat list of path strings.  Children are visited depth-first in
        ``os.listdir`` order and joined to their parent with a literal
        ``/``, the same way ``ls`` builds its results.  An empty directory
        contributes nothing.
    """
    if not os.path.isdir(x):
        return [x]

    found = []
    for name in os.listdir(x):
        # extend() grows one list in place; the previous
        # reduce(operator.add, ...) built a new list per child, which is
        # quadratic in the number of results.
        found.extend(lsr(x + "/" + name))
    return found
|
||
|
||
def fwrite(fname, content, encoding=None):
    """Write ``content`` to ``fname`` in text mode, replacing any existing file.

    Args:
        fname: Path of the file to write.
        content: The string to write.
        encoding: Text encoding passed to ``open``.  The default ``None``
            keeps the previous behaviour (``open`` picks the platform
            default); pass e.g. ``"utf-8"`` for a platform-independent
            result.
    """
    with open(fname, "w", encoding=encoding) as fh:
        fh.write(content)
|
||
|
||
def fread(fname, encoding=None):
    """Return the entire contents of ``fname`` read in text mode.

    Args:
        fname: Path of the file to read.
        encoding: Text encoding passed to ``open``.  The default ``None``
            keeps the previous behaviour (``open`` picks the platform
            default); pass e.g. ``"utf-8"`` for a platform-independent
            result.

    Returns:
        The file contents as a string.
    """
    with open(fname, encoding=encoding) as fh:
        return fh.read()
|
||
|
||
def chdir_up_n(n):
    """Move the process working directory ``n`` levels towards the root.

    Performs ``n`` successive ``os.chdir("..")`` calls; nothing happens
    when ``n`` is zero or negative.
    """
    for _ in range(n):
        os.chdir("..")
|
||
|
||
def slugify(value, allow_unicode=False):
    """Turn ``value`` into a lowercase, dash-separated slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    ``value`` is converted with ``str()`` and then:
      * normalized (NFKC when ``allow_unicode`` is true; otherwise NFKD
        followed by dropping every non-ASCII character);
      * lowercased, with anything that is not a word character,
        whitespace or a hyphen removed;
      * runs of whitespace and/or hyphens collapsed to a single ``-``;
      * stripped of leading and trailing ``-`` and ``_``.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    kept = re.sub(r'[^\w\s-]', '', text.lower())
    dashed = re.sub(r'[-\s]+', '-', kept)
    return dashed.strip('-_')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters