implementing the UNIMPLEMENTED_PARSERS #97

Merged: 40 commits, Sep 13, 2023
Changes from 35 commits

Commits (40)
- c8f77aa: to start the pr to add comments (Thomas-Lemoine, Jul 19, 2023)
- 90517c4: removed spaces (Thomas-Lemoine, Jul 19, 2023)
- 3617c9e: Merge remote-tracking branch 'origin/main' into implement_more_parsers (Thomas-Lemoine, Jul 19, 2023)
- 049fb56: Merge remote-tracking branch 'origin/main' into implement_more_parsers (Thomas-Lemoine, Jul 23, 2023)
- 6848dee: create logger_config and reorder the imports (Thomas-Lemoine, Jul 28, 2023)
- 682e96e: main's logger (Thomas-Lemoine, Jul 28, 2023)
- 3b38600: ignore the log files (Thomas-Lemoine, Jul 28, 2023)
- 367f5df: postprocess notes (Thomas-Lemoine, Jul 28, 2023)
- 932561a: fix test with new download order for pdfarticles (Thomas-Lemoine, Jul 28, 2023)
- 728b124: Handle special docs (mruwnik, Aug 4, 2023)
- b9999b4: Fetch new items from indices (mruwnik, Aug 4, 2023)
- 7ee7f9a: fixed domain getter from network location (Thomas-Lemoine, Aug 4, 2023)
- 042fc67: logger and minor fixes (Thomas-Lemoine, Aug 4, 2023)
- f6b0afc: comment: add www2. and www6. handling (Thomas-Lemoine, Aug 4, 2023)
- 3381f1b: Merge branch 'special_docs' into special_docs_with_parsers (Thomas-Lemoine, Aug 6, 2023)
- e85b04c: removed logger_config (Thomas-Lemoine, Aug 6, 2023)
- cad2749: Merge remote-tracking branch 'origin/main' into implement_more_parsers (Thomas-Lemoine, Aug 16, 2023)
- 43905ef: merge with main and minor changes (Thomas-Lemoine, Aug 16, 2023)
- 720ec97: Merge remote-tracking branch 'origin/implement_more_parsers' into imp… (Thomas-Lemoine, Aug 16, 2023)
- 654d76a: rm logger_config.py (Thomas-Lemoine, Aug 16, 2023)
- e41ad00: minor fixes (Thomas-Lemoine, Aug 16, 2023)
- 40cc96c: minor fixes 2 (Thomas-Lemoine, Aug 16, 2023)
- d36687d: parsers type signature (Thomas-Lemoine, Aug 17, 2023)
- 34ccba9: test_arxiv_process_entry_retracted fixed (Thomas-Lemoine, Aug 17, 2023)
- a5115cd: Refactor of special_indices (Thomas-Lemoine, Aug 17, 2023)
- f2a3b96: 1239283019481293043902 (Thomas-Lemoine, Aug 17, 2023)
- 3cff71b: Merge remote-tracking branch 'origin/main' into implement_more_parsers (Thomas-Lemoine, Aug 21, 2023)
- 7c5c4ab: Merge branch 'special_indices_refactor' into implement_more_parsers (Thomas-Lemoine, Aug 21, 2023)
- 00b70be: alignmentdataset class removed some init fields (Thomas-Lemoine, Aug 21, 2023)
- 6ef15f3: removed the wrong arxivpapers file (Thomas-Lemoine, Aug 21, 2023)
- ad89b44: minor changes (Thomas-Lemoine, Aug 21, 2023)
- 70a9757: Merge branch 'special_docs_with_parsers' into implement_more_parsers (Thomas-Lemoine, Aug 21, 2023)
- cf0bdf4: Merge branch 'main' into implement_more_parsers (Thomas-Lemoine, Aug 27, 2023)
- 8add5de: pdf date_published is a datetime (Thomas-Lemoine, Aug 27, 2023)
- 057015b: revert some useless changes (Thomas-Lemoine, Aug 31, 2023)
- 789a9c8: revert type annotation change (Thomas-Lemoine, Aug 31, 2023)
- 662db51: Merge remote-tracking branch 'origin/main' into implement_more_parsers (Thomas-Lemoine, Sep 8, 2023)
- e58292f: nits (henri123lemoine, Sep 8, 2023)
- 15efdb8: nits 2 (henri123lemoine, Sep 9, 2023)
- f05c4a9: nits 2 (henri123lemoine, Sep 9, 2023)
3 changes: 2 additions & 1 deletion align_data/analysis/analyse_jsonl_data.py
@@ -1,8 +1,9 @@
from datetime import datetime
from pathlib import Path
from collections import defaultdict

import jsonlines

from collections import defaultdict


def is_valid_date_format(data_dict, format="%Y-%m-%dT%H:%M:%SZ"):
5 changes: 3 additions & 2 deletions align_data/analysis/count_tokens.py
@@ -1,7 +1,8 @@
from typing import Tuple
import logging

from transformers import AutoTokenizer
import jsonlines
import logging
from typing import Tuple

logger = logging.getLogger(__name__)

96 changes: 52 additions & 44 deletions align_data/common/alignment_dataset.py
@@ -2,17 +2,18 @@
from itertools import islice
import logging
import time
from dataclasses import dataclass, KW_ONLY
from dataclasses import dataclass, field, KW_ONLY
from pathlib import Path
from typing import Iterable, List, Optional, Set
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from typing import List, Optional, Set, Iterable, Tuple, Generator

import jsonlines
import pytz
from sqlalchemy import select, Select, JSON
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload, Session
import jsonlines
from dateutil.parser import parse, ParserError
from tqdm import tqdm

from align_data.db.models import Article, Summary
from align_data.db.session import make_session
from align_data.settings import ARTICLE_MAIN_KEYS
@@ -27,40 +28,42 @@ class AlignmentDataset:
"""The base dataset class."""

name: str
"""The name of the dataset"""
"""The name of the dataset."""

_: KW_ONLY

files_path = Path("")
"""The path where data can be found. Usually a folder"""
data_path: Path = Path(__file__).parent / "../../data/"
"""The path where data can be found. Usually a folder."""

# Derived paths
raw_data_path: Path = field(init=False)
files_path: Path = field(init=False)

# Internal housekeeping variables
_outputted_items: Set[str] = field(default_factory=set, init=False)
"""A set of the ids of all previously processed items."""

done_key = "id"
"""The key of the entry to use as the id when checking if already processed."""

COOLDOWN = 0
"""An optional cool down between processing entries"""
"""An optional cool down between processing entries."""

lazy_eval = False
"""Whether to lazy fetch items. This is nice in that it will start processing, but messes up the progress bar."""

batch_size = 20
"""The number of items to collect before flushing to the database."""

# Internal housekeeping variables
_entry_idx = 0
"""Used internally for writing debugging info - each file write will increment it"""
_outputted_items = set()
"""A set of the ids of all previously processed items"""
def __post_init__(self):
self.data_path = self.data_path.resolve()

def __str__(self) -> str:
return self.name

def __post_init__(self, data_path=Path(__file__).parent / "../../data/"):
self.data_path = data_path
self.raw_data_path = self.data_path / "raw"

# set the default place to look for data
self.files_path = self.raw_data_path / self.name

def __str__(self) -> str:
return self.name

def _add_authors(self, article: Article, authors: List[str]) -> Article:
# TODO: Don't keep adding the same authors - come up with some way to reuse them
article.authors = ",".join(authors)
@@ -83,42 +86,42 @@ def make_data_entry(self, data, **kwargs) -> Article:
article.summaries.append(Summary(text=summary, source=self.name))
return article

def to_jsonl(self, out_path=None, filename=None) -> Path:
if not out_path:
out_path = Path(__file__).parent / "../../data/"

if not filename:
filename = f"{self.name}.jsonl"
filename = Path(out_path) / filename
def to_jsonl(self, out_path: Path | None = None, filename: str | None = None) -> Path:
out_path = out_path or self.data_path
filename = filename or f"{self.name}.jsonl"
filepath = out_path / filename

with jsonlines.open(filename, "w") as jsonl_writer:
with jsonlines.open(filepath, "w") as jsonl_writer:
for article in self.read_entries():
jsonl_writer.write(article.to_dict())
return filename.resolve()
return filepath.resolve()

@property
def _query_items(self):
def _query_items(self) -> Select[Tuple[Article]]:
return select(Article).where(Article.source == self.name)

def read_entries(self, sort_by=None):
def read_entries(self, sort_by=None) -> Iterable[Article]:
"""Iterate through all the saved entries."""
with make_session() as session:
query = self._query_items.options(joinedload(Article.summaries))
if sort_by is not None:
query = query.order_by(sort_by)
for item in session.scalars(query).unique():
yield item

result = session.scalars(query)
for article in result.unique(): # removes duplicates
yield article

def _add_batch(self, session, batch):
def _add_batch(self, session: Session, batch: tuple):
session.add_all(batch)

def add_entries(self, entries):
def commit():
def commit() -> bool:
try:
session.commit()
return True
except IntegrityError:
session.rollback()
return False

with make_session() as session:
items = iter(entries)
@@ -148,7 +151,11 @@ def get_item_key(self, item):
return item.name

def _load_outputted_items(self) -> Set[str]:
"""Load the output file (if it exists) in order to know which items have already been output."""
"""
Loads the outputted items from the database and returns them as a set.

If the done_key is not an attribute of Article, it will try to load it from the meta field.
"""
with make_session() as session:
if hasattr(Article, self.done_key):
# This doesn't filter by self.name. The good thing about that is that it should handle a lot more
@@ -165,21 +172,22 @@ def not_processed(self, item):
# If it gets to that level, consider batching it somehow
return self.get_item_key(item) not in self._outputted_items

def unprocessed_items(self, items=None) -> Iterable:
def unprocessed_items(self, items=None) -> list | filter:
"""Return a list of all items to be processed.

This will automatically remove any items that have already been processed,
based on the contents of the output file.
"""
self.setup()
items = items or self.items_list

filtered = filter(self.not_processed, items or self.items_list)
items_to_process = filter(self.not_processed, items)

# greedily fetch all items if not lazy eval. This makes the progress bar look nice
if not self.lazy_eval:
filtered = list(filtered)
return list(items_to_process)

return filtered
return items_to_process

def fetch_entries(self):
"""Get all entries to be written to the file."""
@@ -204,7 +212,7 @@ def process_entry(self, entry) -> Article | None:
raise NotImplementedError

@staticmethod
def _format_datetime(date) -> str:
def _format_datetime(date: datetime) -> str:
return date.strftime("%Y-%m-%dT%H:%M:%SZ")

@staticmethod
@@ -242,7 +250,7 @@ def _load_outputted_items(self) -> Set[str]:
)
)

def _add_batch(self, session, batch):
def _add_batch(self, session: Session, batch: tuple):
def merge(item):
if prev := self.articles.get(item.url):
return session.merge(item.update(prev))
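
To make the AlignmentDataset refactor above easier to follow, here is a minimal sketch of how a new parser dataset could plug into it. ExampleBlogDataset, its fake item, and the driver calls at the bottom are illustrative assumptions rather than part of this PR; only the overridden hooks (items_list, get_item_key, process_entry) and the helpers they rely on (make_data_entry, fetch_entries, add_entries, to_jsonl) come from the class shown in the diff.

from dataclasses import dataclass

from align_data.common.alignment_dataset import AlignmentDataset


@dataclass
class ExampleBlogDataset(AlignmentDataset):
    # Hypothetical dataset, defined only to illustrate the hooks above.
    done_key = "url"  # deduplicate on url instead of the default id

    @property
    def items_list(self):
        # A real parser would fetch and parse an index page here.
        return [{"url": "https://example.com/post-1", "title": "Post 1", "text": "..."}]

    def get_item_key(self, item) -> str:
        # Used by unprocessed_items() to skip articles already in the database.
        return item["url"]

    def process_entry(self, item):
        return self.make_data_entry(
            {
                "source": self.name,
                "url": item["url"],
                "title": item["title"],
                "text": item["text"],
                "authors": [],
            }
        )


# Roughly how a runner would drive it (the exact wiring lives outside this diff):
dataset = ExampleBlogDataset(name="example_blog")
dataset.add_entries(dataset.fetch_entries())  # process new items and flush them to the DB in batches
dataset.to_jsonl()  # then dump this source's articles to <data_path>/example_blog.jsonl
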
38 changes: 19 additions & 19 deletions align_data/common/html_dataset.py
@@ -1,18 +1,19 @@
import pytz
import regex as re
import logging
from datetime import datetime
from dataclasses import dataclass, field, KW_ONLY
from dataclasses import dataclass, field
from urllib.parse import urljoin
from typing import List
from typing import List, Dict, Any
import re

import pytz
import requests
import feedparser
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from markdownify import markdownify

from align_data.db.models import Article
from align_data.common.alignment_dataset import AlignmentDataset

logger = logging.getLogger(__name__)


@@ -26,9 +27,6 @@ class HTMLDataset(AlignmentDataset):
done_key = "url"

authors: List[str] = field(default_factory=list)
_: KW_ONLY
source_key: str = None
summary_key: str = None

item_selector = "article"
title_selector = "article h1"
@@ -39,23 +37,25 @@
def extract_authors(self, article):
return self.authors

def get_item_key(self, item):
article_url = item.find_all("a")[0]["href"].split("?")[0]
return urljoin(self.url, article_url)

def get_item_key(self, item: Tag) -> str:
first_href = item.find("a")["href"]
href_base, *_ = first_href.split("?")
return urljoin(self.url, href_base)

@property
def items_list(self):
def items_list(self) -> ResultSet[Tag]:
logger.info(f"Fetching entries from {self.url}")
response = requests.get(self.url, allow_redirects=True)
soup = BeautifulSoup(response.content, "html.parser")
articles = soup.select(self.item_selector)
logger.info(f"Found {len(articles)} articles")
return articles

def _extra_values(self, contents):
def _extra_values(self, contents: BeautifulSoup):
return {}

def get_contents(self, article_url):
def get_contents(self, article_url: str) -> Dict[str, Any]:
contents = self.fetch_contents(article_url)

title = self._get_title(contents)
@@ -72,16 +72,16 @@ def get_contents(self, article_url):
**self._extra_values(contents),
}

def process_entry(self, article):
def process_entry(self, article: Tag) -> Article:
article_url = self.get_item_key(article)
contents = self.get_contents(article_url)
if not contents.get("text"):
return None

return self.make_data_entry(contents)

def fetch_contents(self, url):
logger.info("Fetching {}".format(url))
def fetch_contents(self, url: str):
logger.info(f"Fetching {url}")
resp = requests.get(url, allow_redirects=True)
return BeautifulSoup(resp.content, "html.parser")

@@ -101,7 +101,7 @@ def _get_text(self, contents):

def _find_date(self, items):
for i in items:
if re.match("\w+ \d{1,2}, \d{4}", i.text):
if re.match(r"\w+ \d{1,2}, \d{4}", i.text):
return datetime.strptime(i.text, "%b %d, %Y").replace(tzinfo=pytz.UTC)

def _extract_markdown(self, element):
@@ -136,7 +136,7 @@ def _get_text(self, item):
text = item.get("content") and item["content"][0].get("value")
return self._extract_markdown(text)

def fetch_contents(self, url):
def fetch_contents(self, url: str):
item = self.items[url]
if "content" in item:
return item
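
For the HTMLDataset changes, a hypothetical subclass shows where the selectors and the _extra_values() hook fit in. The class name, URL, authors and CSS selectors below are invented, and it is assumed that url and authors are constructor fields of HTMLDataset, as the diff's use of self.url and the authors field suggests.

from dataclasses import dataclass

from align_data.common.html_dataset import HTMLDataset


@dataclass
class ExampleBlog(HTMLDataset):
    # items_list will GET self.url and run item_selector over the page.
    item_selector = "div.post-list article"
    title_selector = "h2.entry-title"

    def _extra_values(self, contents):
        # Anything returned here is merged into the dict built by get_contents().
        return {"tags": [tag.text for tag in contents.select("a.post-tag")]}


blog = ExampleBlog(
    name="example_blog",
    url="https://blog.example.com",
    authors=["Example Author"],  # returned as-is by extract_authors()
)
# blog.fetch_entries() would then turn each linked post into an Article via
# get_item_key() -> get_contents() -> make_data_entry().
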
12 changes: 8 additions & 4 deletions align_data/db/models.py
@@ -3,7 +3,7 @@
import pytz
import hashlib
from datetime import datetime
from typing import List, Optional
from typing import List, Optional, Dict, Any
from sqlalchemy import (
JSON,
DateTime,
@@ -105,7 +105,10 @@ def verify_id_fields(self):
missing = [field for field in self.__id_fields if not getattr(self, field)]
assert not missing, f"Entry is missing the following fields: {missing}"

def update(self, other):
def update(self, other: "Article"):
"""
Update this article with the values from another article.
"""
for field in self.__table__.columns.keys():
if field not in ["id", "hash_id", "metadata"] and getattr(other, field):
setattr(self, field, getattr(other, field))
Expand All @@ -120,7 +123,7 @@ def _set_id(self):
id_string = self.generate_id_string()
self.id = hashlib.md5(id_string).hexdigest()

def add_meta(self, key, val):
def add_meta(self, key: str, val):
if self.meta is None:
self.meta = {}
self.meta[key] = val
@@ -153,7 +156,7 @@ def is_valid(cls):
)

@classmethod
def before_write(cls, _mapper, _connection, target):
def before_write(cls, _mapper, _connection, target: "Article"):
target.verify_id_fields()

if not target.status and target.missing_fields:
@@ -190,5 +193,6 @@ def to_dict(self):
}



event.listen(Article, "before_insert", Article.before_write)
event.listen(Article, "before_update", Article.before_write)
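
A small, purely illustrative sketch of the Article.update() and add_meta() behaviour documented above; the titles, URL and source are invented, and it assumes Article accepts its mapped columns as keyword arguments, as SQLAlchemy declarative models normally do.

from align_data.db.models import Article

stored = Article(title="Example post", url="https://example.com/post", source="example_blog")
incoming = Article(title="Example post (edited)", url="https://example.com/post", source="example_blog")
incoming.add_meta("tags", ["alignment"])  # extra values go into the JSON meta column

# update() copies every non-empty column from `incoming` onto `stored`, skipping
# id, hash_id and metadata; this is what the merge() helper in _add_batch relies on
# when it reconciles a re-fetched article with one already seen.
stored.update(incoming)
assert stored.title == "Example post (edited)"

# The before_write listeners registered above run verify_id_fields() and set the
# hash-based id whenever an Article is inserted or updated through a session.
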
1 change: 0 additions & 1 deletion align_data/db/session.py
@@ -7,7 +7,6 @@
from align_data.settings import DB_CONNECTION_URI, MIN_CONFIDENCE
from align_data.db.models import Article


logger = logging.getLogger(__name__)

# We create a single engine for the entire application