diff --git a/align_data/sources/articles/google_cloud.py b/align_data/sources/articles/google_cloud.py index 3a9f096..9ff56a2 100644 --- a/align_data/sources/articles/google_cloud.py +++ b/align_data/sources/articles/google_cloud.py @@ -93,8 +93,8 @@ def iterate_rows(sheet: Worksheet) -> Iterator[SheetRow]: SheetRow.set_sheet(sheet) # we start the enumeration at 2 to avoid the header row - for i, row_data in enumerate(sheet.get_all_records(), 2): - yield SheetRow(i, row_data) + for row_id, row_data in enumerate(sheet.get_all_records(), 2): + yield SheetRow(row_id, row_data) def upload_file(filename, bytes_contents, mimetype, parent_id=None): diff --git a/align_data/sources/articles/indices.py b/align_data/sources/articles/indices.py index cccea84..1a61e0e 100644 --- a/align_data/sources/articles/indices.py +++ b/align_data/sources/articles/indices.py @@ -25,8 +25,9 @@ def fetcher(): if contents := fetch_element(url, main_selector): return list(filter(None, map(formatter, contents.select(item_selector)))) return [] - fetcher.__name__ = formatter.__name__.replace("format_", "") + '_fetcher' + fetcher.__name__ = formatter.__name__.replace("format_", "") + '_fetcher' # formatter called "format_anthropic" -> fetcher called "anthropic_fetcher" + #TODO: Make this more explicit return fetcher diff --git a/align_data/sources/arxiv_papers.py b/align_data/sources/arxiv_papers.py index 15ad233..45b3148 100644 --- a/align_data/sources/arxiv_papers.py +++ b/align_data/sources/arxiv_papers.py @@ -52,12 +52,6 @@ def get_version(id: str) -> str | None: return res.group(1) -def is_withdrawn(url: str): - if elem := fetch_element(canonical_url(url), ".extra-services .full-text ul"): - return elem.text.strip().lower() == "withdrawn" - return None - - def is_withdrawn(url: str) -> bool: if elem := fetch_element(canonical_url(url), '.extra-services .full-text ul'): return elem.text.strip().lower() == 'withdrawn' diff --git a/align_data/sources/greaterwrong/greaterwrong.py b/align_data/sources/greaterwrong/greaterwrong.py index d1bf11f..579a468 100644 --- a/align_data/sources/greaterwrong/greaterwrong.py +++ b/align_data/sources/greaterwrong/greaterwrong.py @@ -63,11 +63,11 @@ class GreaterWrong(AlignmentDataset): start_year: int min_karma: int """Posts must have at least this much karma to be returned.""" - af: bool = False + af: bool """Whether alignment forum posts should be returned""" limit = 50 - COOLDOWN_TIME : float = 0.5 + COOLDOWN_TIME: float = 0.5 done_key = "url" lazy_eval = True source_type = 'GreaterWrong'