From 7dfd81c90e2dc883559cbd0696216c088f68781e Mon Sep 17 00:00:00 2001
From: Thomas Lemoine <lemoine123thomas@gmail.com>
Date: Mon, 11 Sep 2023 15:50:10 -0400
Subject: [PATCH 1/2] use debug level for logging that takes up a lot of
 space

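Most of these messages are emitted once per fetched item, so at INFO level
they dominate the log output; demoting them to DEBUG keeps normal runs quiet.
A minimal sketch of how the demoted messages could still be surfaced when
debugging, assuming the standard library logging module and loggers created
with logging.getLogger(__name__) (the module name below is only an example):

    import logging

    # Show everything, including the per-item "Fetching ..." messages.
    logging.basicConfig(level=logging.DEBUG)

    # Or enable them for a single noisy module only.
    logging.getLogger("align_data.common.html_dataset").setLevel(logging.DEBUG)
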
---
 align_data/common/html_dataset.py        | 4 ++--
 align_data/db/models.py                  | 3 ++-
 align_data/sources/arbital/arbital.py    | 2 +-
 align_data/sources/articles/articles.py  | 8 ++++----
 align_data/sources/articles/datasets.py  | 2 +-
 align_data/sources/blogs/blogs.py        | 2 +-
 align_data/sources/blogs/gwern_blog.py   | 2 +-
 align_data/sources/blogs/wp_blog.py      | 2 +-
 align_data/sources/ebooks/agentmodels.py | 2 +-
 align_data/sources/stampy/stampy.py      | 2 +-
 main.py                                  | 4 ++--
 11 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/align_data/common/html_dataset.py b/align_data/common/html_dataset.py
index e5e4d277..950c0e55 100644
--- a/align_data/common/html_dataset.py
+++ b/align_data/common/html_dataset.py
@@ -81,7 +81,7 @@ def process_entry(self, article):
         return self.make_data_entry(contents)
 
     def fetch_contents(self, url):
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         resp = requests.get(url, allow_redirects=True)
         return BeautifulSoup(resp.content, "html.parser")
 
@@ -141,7 +141,7 @@ def fetch_contents(self, url):
         if "content" in item:
             return item
 
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         resp = requests.get(url, allow_redirects=True)
         soup = BeautifulSoup(resp.content, "html.parser")
         return dict(
diff --git a/align_data/db/models.py b/align_data/db/models.py
index e79da232..36bd1ff3 100644
--- a/align_data/db/models.py
+++ b/align_data/db/models.py
@@ -82,7 +82,8 @@ class Article(Base):
     )
 
     def __repr__(self) -> str:
-        return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={self.date_published!r})"
+        formatted_date = self.date_published.strftime('%Y-%m-%d %H:%M:%S%z') if self.date_published else None
+        return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={formatted_date!r})"
 
     def generate_id_string(self) -> bytes:
         return "".join(
diff --git a/align_data/sources/arbital/arbital.py b/align_data/sources/arbital/arbital.py
index b08393c4..47a87ef0 100644
--- a/align_data/sources/arbital/arbital.py
+++ b/align_data/sources/arbital/arbital.py
@@ -184,7 +184,7 @@ def process_entry(self, alias: str):
 
             return self.make_data_entry(
                 {
-                    "title": page.get("title") or "",
+                    "title": page.get("title") or None,
                     "text": text,
                     "date_published": self._get_published_date(page),
                     "url": f'https://arbital.com/p/{page.get("alias") or alias}',
diff --git a/align_data/sources/articles/articles.py b/align_data/sources/articles/articles.py
index 7db94a7b..953a3253 100644
--- a/align_data/sources/articles/articles.py
+++ b/align_data/sources/articles/articles.py
@@ -49,7 +49,7 @@ def save_pdf(filename, link):
 @with_retry(times=3, exceptions=gspread.exceptions.APIError)
 def process_row(row, sheets):
     """Check the given `row` and fetch its metadata + optional extra stuff."""
-    logger.info('Checking "%s"', row["title"])
+    logger.debug('Checking "%s"', row["title"])
 
     missing = [field for field in REQUIRED_FIELDS if not row.get(field)]
     if missing:
@@ -91,7 +91,7 @@ def process_spreadsheets(source_sheet, output_sheets):
     :param Worksheet source_sheet: the worksheet to be processed - each row should be a separate entry
     :param Dict[str, Worksheet] output_sheets: a dict of per data type worksheets to be updated
     """
-    logger.info("fetching seen urls")
+    logger.info("fetching seen urls in {output_sheets}")
     seen = {
         url
         for sheet in output_sheets.values()
@@ -120,8 +120,8 @@ def update_new_items(source_spreadsheet, source_sheet, output_spreadsheet):
     return process_spreadsheets(source_sheet, sheets)
 
 
-def check_new_articles(source_spreadsheet, source_sheet):
-    """Goes through the special indices looking for unseen articles."""
+def check_new_articles(source_spreadsheet, source_sheet) -> int:
+    """Goes through the special indices looking for unseen articles to update. Returns the number of updated rows."""
     source_sheet = get_sheet(source_spreadsheet, source_sheet)
     current = {row.get("title"): row for row in iterate_rows(source_sheet)}
     seen_urls = {
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index cbf7f9d9..b881f364 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -245,6 +245,6 @@ def get_contents(cls, item) -> Dict:
         return contents
 
     def process_entry(self, item):
-        logger.info(f"Processing {item.title}")
+        logger.debug(f"Processing {item.title}")
 
         return self.make_data_entry(self.get_contents(item), source=self.name)
diff --git a/align_data/sources/blogs/blogs.py b/align_data/sources/blogs/blogs.py
index 1245aec6..11267c2b 100644
--- a/align_data/sources/blogs/blogs.py
+++ b/align_data/sources/blogs/blogs.py
@@ -94,7 +94,7 @@ def items_list(self):
         page = 1
         with tqdm(desc=f"Loading {self.name} pages") as pbar:
             while True:
-                logger.info(f"Fetching entries from {self.url}")
+                logger.debug(f"Fetching entries from {self.url}")
                 response = requests.get(
                     self.url, allow_redirects=True, params={"73df3071_page": page}
                 )
diff --git a/align_data/sources/blogs/gwern_blog.py b/align_data/sources/blogs/gwern_blog.py
index 1d573a8e..f3a82882 100644
--- a/align_data/sources/blogs/gwern_blog.py
+++ b/align_data/sources/blogs/gwern_blog.py
@@ -71,7 +71,7 @@ def extract(item):
         return dict(filter(None, map(extract, header.splitlines())))
 
     def _get_article(self, url):
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         return requests.get(url, allow_redirects=True)
 
     @staticmethod
diff --git a/align_data/sources/blogs/wp_blog.py b/align_data/sources/blogs/wp_blog.py
index cd409d98..b7a60ef7 100644
--- a/align_data/sources/blogs/wp_blog.py
+++ b/align_data/sources/blogs/wp_blog.py
@@ -28,7 +28,7 @@ def items_list(self):
         with tqdm(desc=f"Loading {self.name} pages") as pbar:
             while True:
                 paged_url = f"{self.feed_url}?paged={page_number}"
-                logging.info(f"Fetching {paged_url}")
+                logger.debug(f"Fetching {paged_url}")
 
                 feed = feedparser.parse(paged_url)
                 title = feed.get("feed", {}).get("title")
diff --git a/align_data/sources/ebooks/agentmodels.py b/align_data/sources/ebooks/agentmodels.py
index 65b52502..8915fbe3 100644
--- a/align_data/sources/ebooks/agentmodels.py
+++ b/align_data/sources/ebooks/agentmodels.py
@@ -21,7 +21,7 @@ def setup(self):
         super().setup()
         self.base_dir = self.raw_data_path / "agentmodels.org"
         if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
-            logger.info("Cloning repo")
+            logger.info(f"Cloning repo {self.repo}")
             Repo.clone_from(self.repo, self.base_dir)
         self.repository = Repo(self.base_dir)
         self.files_path = self.base_dir / "chapters"
diff --git a/align_data/sources/stampy/stampy.py b/align_data/sources/stampy/stampy.py
index 95319820..5dc24cf0 100644
--- a/align_data/sources/stampy/stampy.py
+++ b/align_data/sources/stampy/stampy.py
@@ -49,7 +49,7 @@ def clean_text(text):
         answer = clean_text(entry["Rich Text"])
         url = "https://aisafety.info?state=" + entry["UI ID"]
 
-        logger.info(f"Processing {question}")
+        logger.debug(f"Processing {question}")
 
         return self.make_data_entry(
             {
diff --git a/main.py b/main.py
index 82c30f07..c047ce9b 100644
--- a/main.py
+++ b/main.py
@@ -60,7 +60,7 @@ def fetch_all(self, *skip) -> None:
         """
         names = [name for name in ALL_DATASETS if name not in skip]
         for name in names:
-            print(name)
+            logger.debug(name)
             self.fetch(name)
 
     def generate_jsonl_files(self, *names):
@@ -74,7 +74,7 @@ def generate_jsonl_files(self, *names):
         assert not missing, f"{missing} are not valid dataset names"
         for name in names:
             dataset = get_dataset(name)
-            print(dataset.to_jsonl())
+            logger.info(dataset.to_jsonl())
 
     def count_tokens(self, merged_dataset_path: str) -> None:
         """

From 990d9524979d35a3944a107933b4965fdb3e186d Mon Sep 17 00:00:00 2001
From: Thomas Lemoine <lemoine123thomas@gmail.com>
Date: Thu, 14 Sep 2023 15:50:27 -0400
Subject: [PATCH 2/2] logger.info expects a format string as its first argument

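The logging docs treat the first argument as a format string, so pass the
value through "%s" instead of handing logger.info() a raw object. A minimal
self-contained sketch of the pattern used here (the path is a hypothetical
stand-in for whatever dataset.to_jsonl() returns):

    import logging
    from pathlib import Path

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    jsonl_path = Path("data/example.jsonl")  # stand-in for dataset.to_jsonl()

    # msg is a format string; the value is interpolated lazily by logging.
    logger.info("%s", jsonl_path)
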
---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index c047ce9b..a9a67db5 100644
--- a/main.py
+++ b/main.py
@@ -74,7 +74,7 @@ def generate_jsonl_files(self, *names):
         assert not missing, f"{missing} are not valid dataset names"
         for name in names:
             dataset = get_dataset(name)
-            logger.info(dataset.to_jsonl())
+            logger.info("%s", dataset.to_jsonl())
 
     def count_tokens(self, merged_dataset_path: str) -> None:
         """