implementing the UNIMPLEMENTED_PARSERS #97

Merged
Merged 40 commits on Sep 13, 2023
Changes from 19 commits
Commits
40 commits
c8f77aa
to start the pr to add comments
Thomas-Lemoine Jul 19, 2023
90517c4
removed spaces
Thomas-Lemoine Jul 19, 2023
3617c9e
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Jul 19, 2023
049fb56
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Jul 23, 2023
6848dee
create logger_config and reorder the imports
Thomas-Lemoine Jul 28, 2023
682e96e
main's logger
Thomas-Lemoine Jul 28, 2023
3b38600
ignore the log files
Thomas-Lemoine Jul 28, 2023
367f5df
postprocess notes
Thomas-Lemoine Jul 28, 2023
932561a
fix test with new download order for pdfarticles
Thomas-Lemoine Jul 28, 2023
728b124
Handle special docs
mruwnik Aug 4, 2023
b9999b4
Fetch new items from indices
mruwnik Aug 4, 2023
7ee7f9a
fixed domain getter from network location
Thomas-Lemoine Aug 4, 2023
042fc67
logger and minor fixes
Thomas-Lemoine Aug 4, 2023
f6b0afc
comment: add www2. and www6. handling
Thomas-Lemoine Aug 4, 2023
3381f1b
Merge branch 'special_docs' into special_docs_with_parsers
Thomas-Lemoine Aug 6, 2023
e85b04c
removed logger_config
Thomas-Lemoine Aug 6, 2023
cad2749
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Aug 16, 2023
43905ef
merge with main and minor changes
Thomas-Lemoine Aug 16, 2023
720ec97
Merge remote-tracking branch 'origin/implement_more_parsers' into imp…
Thomas-Lemoine Aug 16, 2023
654d76a
rm logger_config.py
Thomas-Lemoine Aug 16, 2023
e41ad00
minor fixes
Thomas-Lemoine Aug 16, 2023
40cc96c
minor fixes 2
Thomas-Lemoine Aug 16, 2023
d36687d
parsers type signature
Thomas-Lemoine Aug 17, 2023
34ccba9
test_arxiv_process_entry_retracted fixed
Thomas-Lemoine Aug 17, 2023
a5115cd
Refactor of special_indices
Thomas-Lemoine Aug 17, 2023
f2a3b96
1239283019481293043902
Thomas-Lemoine Aug 17, 2023
3cff71b
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Aug 21, 2023
7c5c4ab
Merge branch 'special_indices_refactor' into implement_more_parsers
Thomas-Lemoine Aug 21, 2023
00b70be
alignmentdataset class removed some init fields
Thomas-Lemoine Aug 21, 2023
6ef15f3
removed the wrong arxivpapers file
Thomas-Lemoine Aug 21, 2023
ad89b44
minor changes
Thomas-Lemoine Aug 21, 2023
70a9757
Merge branch 'special_docs_with_parsers' into implement_more_parsers
Thomas-Lemoine Aug 21, 2023
cf0bdf4
Merge branch 'main' into implement_more_parsers
Thomas-Lemoine Aug 27, 2023
8add5de
pdf date_published is a datetime
Thomas-Lemoine Aug 27, 2023
057015b
revert some useless changes
Thomas-Lemoine Aug 31, 2023
789a9c8
revert type annotation change
Thomas-Lemoine Aug 31, 2023
662db51
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Sep 8, 2023
e58292f
nits
henri123lemoine Sep 8, 2023
15efdb8
nits 2
henri123lemoine Sep 9, 2023
f05c4a9
nits 2
henri123lemoine Sep 9, 2023
4 changes: 3 additions & 1 deletion .gitignore
@@ -123,4 +123,6 @@ carado.moe/
*.epub

credentials.json
data/raw/
data/raw/

*.log
3 changes: 2 additions & 1 deletion align_data/analysis/analyse_jsonl_data.py
@@ -1,8 +1,9 @@
from datetime import datetime
from pathlib import Path
from collections import defaultdict

import jsonlines

from collections import defaultdict


def is_valid_date_format(data_dict, format="%Y-%m-%dT%H:%M:%SZ"):
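The hunk above only shows the import reordering; the body of is_valid_date_format stays collapsed. As a rough sketch of what such a validator generally does (an assumption, not code from this repository), a date string can be checked against the "%Y-%m-%dT%H:%M:%SZ" format with a strptime round-trip:

from datetime import datetime

# Hypothetical helper: the real is_valid_date_format takes a data_dict, but the core
# check is presumably a strptime attempt like this one.
def is_valid_date_string(value: str, format: str = "%Y-%m-%dT%H:%M:%SZ") -> bool:
    try:
        datetime.strptime(value, format)
        return True
    except (ValueError, TypeError):
        # ValueError: string does not match the format; TypeError: value is not a string
        return False

# is_valid_date_string("2023-09-13T00:00:00Z")  -> True
# is_valid_date_string("Sep 13, 2023")          -> False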
2 changes: 2 additions & 0 deletions align_data/analysis/count_tokens.py
@@ -1,3 +1,5 @@
from typing import Tuple

from transformers import AutoTokenizer
import jsonlines
import logging
113 changes: 53 additions & 60 deletions align_data/common/alignment_dataset.py
@@ -4,30 +4,20 @@
import time
from dataclasses import dataclass, field, KW_ONLY
from pathlib import Path
from typing import Iterable, List, Optional, Set
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from typing import List, Optional, Dict, Any, Set, Iterable, Tuple
import pytz
from datetime import datetime

from sqlalchemy import select, Select, JSON
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload, Session
import jsonlines
import pytz
from dateutil.parser import parse, ParserError
from tqdm import tqdm

from align_data.db.models import Article, Summary
from align_data.db.session import make_session

INIT_DICT = {
"source": None,
"id": None,
"text": None,
"date_published": None,
"title": None,
"url": None,
"authors": lambda: [],
"source_type": None,
"status": None,
"comments": None,
}
from align_data.settings import ARTICLE_MAIN_KEYS

logger = logging.getLogger(__name__)

@@ -41,7 +31,16 @@ class AlignmentDataset:

_: KW_ONLY

files_path = Path("")
id_fields: List[str] = field(default_factory=lambda: ["url", "title"])
"""A list of fields to use as the id of the entry. If not set, will use ['url', 'title']"""

# Internal housekeeping variables
_outputted_items: Set[str] = field(default_factory=set)
"""A set of the ids of all previously processed items"""

data_path: Path = field(init=False)
raw_data_path: Path = field(init=False)
files_path: Path = field(init=False)
"""The path where data can be found. Usually a folder"""

done_key = "id"
@@ -55,23 +54,12 @@ class AlignmentDataset:
batch_size = 20
"""The number of items to collect before flushing to the database."""

# Internal housekeeping variables
_entry_idx = 0
"""Used internally for writing debugging info - each file write will increment it"""
_outputted_items = set()
"""A set of the ids of all previously processed items"""
_: KW_ONLY
id_fields: List[str] = field(default_factory=lambda: ["url", "title"])
"""A list of fields to use as the id of the entry. If not set, will use ['url', 'title']"""

def __str__(self) -> str:
return self.name

def __post_init__(self, data_path=Path(__file__).parent / "../../data/"):
self.data_path = data_path
def __post_init__(self, data_path: Optional[Path] = None):
self.data_path = data_path or (Path(__file__).parent / "../../data/").resolve()
self.raw_data_path = self.data_path / "raw"

# set the default place to look for data
self.files_path = self.raw_data_path / self.name

def _add_authors(self, article: Article, authors: List[str]) -> Article:
@@ -81,57 +69,57 @@ def _add_authors(self, article: Article, authors: List[str]) -> Article:
article.authors = ",".join(article.authors[:1024].split(",")[:-1])
return article

def make_data_entry(self, data, **kwargs) -> Article:
def make_data_entry(self, data: Dict[str, Any], **kwargs) -> Article:
data = dict(data, **kwargs)
summary = data.pop("summary", None)
authors = data.pop("authors", [])

article = Article(
id_fields=self.id_fields,
meta={k: v for k, v in data.items() if k not in INIT_DICT and v is not None},
**{k: v for k, v in data.items() if k in INIT_DICT},
meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
**{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
)
self._add_authors(article, authors)
if summary:
article.summaries.append(Summary(text=summary, source=self.name))
return article

def to_jsonl(self, out_path=None, filename=None) -> Path:
if not out_path:
out_path = Path(__file__).parent / "../../data/"

if not filename:
filename = f"{self.name}.jsonl"
filename = Path(out_path) / filename
def to_jsonl(self, out_path: Path | None = None, filename: str | None = None) -> Path:
out_path = out_path or self.data_path
filename = filename or f"{self.name}.jsonl"
filepath = out_path / filename

with jsonlines.open(filename, "w") as jsonl_writer:
with jsonlines.open(filepath, "w") as jsonl_writer:
for article in self.read_entries():
jsonl_writer.write(article.to_dict())
return filename.resolve()
return filepath.resolve()

@property
def _query_items(self):
def _query_items(self) -> Select[Tuple[Article]]:
return select(Article).where(Article.source == self.name)

def read_entries(self, sort_by=None):
def read_entries(self, sort_by=None) -> Iterable[Article]:
"""Iterate through all the saved entries."""
with make_session() as session:
query = self._query_items.options(joinedload(Article.summaries))
if sort_by is not None:
query = query.order_by(sort_by)
for item in session.scalars(query).unique():
yield item

result = session.scalars(query)
for article in result.unique(): # removes duplicates
yield article

def _add_batch(self, session, batch):
def _add_batch(self, session: Session, batch):
session.add_all(batch)

def add_entries(self, entries):
def commit():
def commit() -> bool:
try:
session.commit()
return True
except IntegrityError:
session.rollback()
return False

with make_session() as session:
items = iter(entries)
@@ -161,7 +149,11 @@ def get_item_key(self, item):
return item.name

def _load_outputted_items(self) -> Set[str]:
"""Load the output file (if it exists) in order to know which items have already been output."""
"""
Loads the outputted items from the database and returns them as a set.

if the done_key is not an attribute of Article, it will try to load it from the meta field.
"""
with make_session() as session:
if hasattr(Article, self.done_key):
# This doesn't filter by self.name. The good thing about that is that it should handle a lot more
@@ -170,10 +162,10 @@ def _load_outputted_items(self) -> Set[str]:
return set(
session.scalars(select(getattr(Article, self.done_key))).all()
)
# TODO: Properly handle this - it should create a proper SQL JSON select
return {
item.get(self.done_key)
for item in session.scalars(select(Article.meta)).all()
meta[self.done_key]
for meta in session.scalars(select(Article.meta)).all()
if isinstance(meta, JSON) and meta.get(self.done_key)
}

def unprocessed_items(self, items=None) -> Iterable:
@@ -183,6 +175,7 @@ def unprocessed_items(self, items=None) -> Iterable:
based on the contents of the output file.
"""
self.setup()
items = items or self.items_list

def not_processed(item):
# NOTE: `self._outputted_items` reads in all items. Which could potentially be a lot. If this starts to
@@ -191,15 +184,15 @@ def not_processed(item):
# If it get's to that level, consider batching it somehow
return self.get_item_key(item) not in self._outputted_items

filtered = filter(not_processed, items or self.items_list)
items_to_process = filter(not_processed, items)

# greedily fetch all items if not lazy eval. This makes the progress bar look nice
if not self.lazy_eval:
filtered = list(filtered)
items_to_process = list(items_to_process)

return filtered
return items_to_process

def fetch_entries(self):
def fetch_entries(self) -> Article:
"""Get all entries to be written to the file."""
for item in tqdm(self.unprocessed_items(), desc=f"Processing {self.name}"):
entry = self.process_entry(item)
@@ -216,11 +209,11 @@ def process_entry(self, entry) -> Optional[Article]:
raise NotImplementedError

@staticmethod
def _format_datetime(date) -> str:
def _format_datetime(date: datetime) -> str:
return date.strftime("%Y-%m-%dT%H:%M:%SZ")

@staticmethod
def _get_published_date(date) -> Optional[datetime]:
def _get_published_date(date: str) -> Optional[datetime]:
try:
# Totally ignore any timezone info, forcing everything to UTC
return parse(str(date)).replace(tzinfo=pytz.UTC)
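One of the larger changes in alignment_dataset.py is dropping the local INIT_DICT in favour of ARTICLE_MAIN_KEYS imported from align_data.settings, which make_data_entry uses to split incoming data between Article columns and the catch-all JSON meta field. A minimal sketch of that split, with a key list assumed to mirror the removed INIT_DICT:

from typing import Any, Dict, Tuple

# Assumed to mirror the keys of the removed INIT_DICT; the real list lives in
# align_data.settings as ARTICLE_MAIN_KEYS.
ARTICLE_MAIN_KEYS = [
    "source", "id", "text", "date_published", "title",
    "url", "authors", "source_type", "status", "comments",
]

def split_article_fields(data: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    # Known Article columns become constructor kwargs...
    main = {k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS}
    # ...anything else with a non-null value is stashed in the JSON meta field.
    meta = {k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None}
    return main, meta

main, meta = split_article_fields(
    {"title": "Example", "url": "https://example.org", "votes": 12, "empty": None}
)
# main == {"title": "Example", "url": "https://example.org"}
# meta == {"votes": 12}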
34 changes: 16 additions & 18 deletions align_data/common/html_dataset.py
@@ -1,23 +1,22 @@
import pytz
import regex as re
import logging
from datetime import datetime
from dateutil.parser import parse
from dataclasses import dataclass, field, KW_ONLY
from dataclasses import dataclass, field
from urllib.parse import urljoin
from typing import List
import re

import requests
import feedparser
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from markdownify import markdownify

from align_data.common.alignment_dataset import AlignmentDataset

logger = logging.getLogger(__name__)


@dataclass
@dataclass()
class HTMLDataset(AlignmentDataset):
"""
Fetches articles from a different blog by collecting links to articles from an index page.
@@ -27,36 +26,35 @@ class HTMLDataset(AlignmentDataset):
done_key = "url"

authors: List[str] = field(default_factory=list)
_: KW_ONLY
source_key: str = None
summary_key: str = None

item_selector = "article"
title_selector = "article h1"
text_selector = "article"
source_type = "blog"
ignored_selectors = []

def extract_authors(self, article):
def extract_authors(self, article): #TODO: make this work
return self.authors

def get_item_key(self, item):
article_url = item.find_all("a")[0]["href"].split("?")[0]
return urljoin(self.url, article_url)

def get_item_key(self, item: Tag) -> str:
first_href = item.find("a")["href"]
href_base, *_ = first_href.split("?")
return urljoin(self.url, href_base)

@property
def items_list(self):
def items_list(self) -> ResultSet[Tag]:
logger.info(f"Fetching entries from {self.url}")
response = requests.get(self.url, allow_redirects=True)
soup = BeautifulSoup(response.content, "html.parser")
articles = soup.select(self.item_selector)
logger.info(f"Found {len(articles)} articles")
return articles

def _extra_values(self, contents):
def _extra_values(self, contents: BeautifulSoup):
return {}

def process_entry(self, article):
def process_entry(self, article: Tag):
article_url = self.get_item_key(article)
contents = self._get_contents(article_url)

@@ -79,7 +77,7 @@ def process_entry(self, article):
}
)

def _get_contents(self, url):
def _get_contents(self, url: str):
logger.info("Fetching {}".format(url))
resp = requests.get(url, allow_redirects=True)
return BeautifulSoup(resp.content, "html.parser")
Expand All @@ -97,7 +95,7 @@ def _get_text(self, contents):

def _find_date(self, items):
for i in items:
if re.match("\w+ \d{1,2}, \d{4}", i.text):
if re.match(r"\w+ \d{1,2}, \d{4}", i.text):
return datetime.strptime(i.text, "%b %d, %Y").replace(tzinfo=pytz.UTC)

def _extract_markdown(self, element):
@@ -132,7 +130,7 @@ def _get_text(self, item):
text = item.get("content") and item["content"][0].get("value")
return self._extract_markdown(text)

def _get_contents(self, url):
def _get_contents(self, url: str):
item = self.items[url]
if "content" in item:
return item
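HTMLDataset is configured almost entirely through class attributes (url, item_selector, title_selector, text_selector, ignored_selectors), so concrete scrapers are thin subclasses. A hypothetical example of what such a subclass might look like; the class name, URL and selectors are illustrative and not taken from this PR:

from dataclasses import dataclass

from align_data.common.html_dataset import HTMLDataset

@dataclass
class ExampleBlog(HTMLDataset):
    # All selector values below are made up for illustration.
    item_selector = "div.post-preview"         # overrides the default "article"
    title_selector = "h2.post-title"
    text_selector = "div.post-body"
    ignored_selectors = ["div.share-buttons"]  # stripped before markdown conversion

# Typical use (assuming the usual name/url constructor fields):
# dataset = ExampleBlog(name="example_blog", url="https://blog.example.org")
# for article in dataset.fetch_entries():
#     ...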
Empty file removed align_data/common/utils.py
Empty file.