add debug logging for logging that takes up a lot of space #190

Open · wants to merge 2 commits into main
4 changes: 2 additions & 2 deletions align_data/common/html_dataset.py
@@ -81,7 +81,7 @@ def process_entry(self, article):
         return self.make_data_entry(contents)

     def fetch_contents(self, url):
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         resp = requests.get(url, allow_redirects=True)
         return BeautifulSoup(resp.content, "html.parser")

@@ -141,7 +141,7 @@ def fetch_contents(self, url):
         if "content" in item:
             return item

-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         resp = requests.get(url, allow_redirects=True)
         soup = BeautifulSoup(resp.content, "html.parser")
         return dict(
3 changes: 2 additions & 1 deletion align_data/db/models.py
@@ -82,7 +82,8 @@ class Article(Base):
     )

     def __repr__(self) -> str:
-        return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={self.date_published!r})"
+        formatted_date = self.date_published.strftime('%Y-%m-%d %H:%M:%S%z')
+        return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={formatted_date!r})"

     def generate_id_string(self) -> bytes:
         return "".join(
2 changes: 1 addition & 1 deletion align_data/sources/arbital/arbital.py
@@ -184,7 +184,7 @@ def process_entry(self, alias: str):

         return self.make_data_entry(
             {
-                "title": page.get("title") or "",
+                "title": page.get("title") or None,
                 "text": text,
                 "date_published": self._get_published_date(page),
                 "url": f'https://arbital.com/p/{page.get("alias") or alias}',
8 changes: 4 additions & 4 deletions align_data/sources/articles/articles.py
@@ -49,7 +49,7 @@ def save_pdf(filename, link):
 @with_retry(times=3, exceptions=gspread.exceptions.APIError)
 def process_row(row, sheets):
     """Check the given `row` and fetch its metadata + optional extra stuff."""
-    logger.info('Checking "%s"', row["title"])
+    logger.debug('Checking "%s"', row["title"])

     missing = [field for field in REQUIRED_FIELDS if not row.get(field)]
     if missing:
@@ -91,7 +91,7 @@ def process_spreadsheets(source_sheet, output_sheets):
     :param Worksheet source_sheet: the worksheet to be processed - each row should be a separate entry
     :param Dict[str, Worksheet] output_sheets: a dict of per data type worksheets to be updated
     """
-    logger.info("fetching seen urls")
+    logger.info(f"fetching seen urls in {output_sheets}")
     seen = {
         url
         for sheet in output_sheets.values()
@@ -120,8 +120,8 @@ def update_new_items(source_spreadsheet, source_sheet, output_spreadsheet):
     return process_spreadsheets(source_sheet, sheets)


-def check_new_articles(source_spreadsheet, source_sheet):
-    """Goes through the special indices looking for unseen articles."""
+def check_new_articles(source_spreadsheet, source_sheet) -> int:
+    """Goes through the special indices looking for unseen articles to update. Returns the number of updated rows."""
     source_sheet = get_sheet(source_spreadsheet, source_sheet)
     current = {row.get("title"): row for row in iterate_rows(source_sheet)}
     seen_urls = {
2 changes: 1 addition & 1 deletion align_data/sources/articles/datasets.py
@@ -245,6 +245,6 @@ def get_contents(cls, item) -> Dict:
         return contents

     def process_entry(self, item):
-        logger.info(f"Processing {item.title}")
+        logger.debug(f"Processing {item.title}")

         return self.make_data_entry(self.get_contents(item), source=self.name)
2 changes: 1 addition & 1 deletion align_data/sources/blogs/blogs.py
@@ -94,7 +94,7 @@ def items_list(self):
         page = 1
         with tqdm(desc=f"Loading {self.name} pages") as pbar:
             while True:
-                logger.info(f"Fetching entries from {self.url}")
+                logger.debug(f"Fetching entries from {self.url}")
                 response = requests.get(
                     self.url, allow_redirects=True, params={"73df3071_page": page}
                 )
2 changes: 1 addition & 1 deletion align_data/sources/blogs/gwern_blog.py
@@ -71,7 +71,7 @@ def extract(item):
         return dict(filter(None, map(extract, header.splitlines())))

     def _get_article(self, url):
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
Collaborator:
Strictly speaking, when logging you should pass params, like logger.debug("Fetching %s", url). It's fine here, but it can potentially be a problem if casting url to a string is a slow or costly operation - with f-strings it's always done, but when provided as an argument it only happens when needed (i.e. when the logger is enabled for DEBUG). Again - here it's fine, but it's worth bearing in mind.
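
A minimal sketch of the difference described above, using only the standard-library `logging` module; `SlowUrl` is a hypothetical stand-in (not from this repo) for any object whose string conversion is costly:

```python
import logging

logging.basicConfig(level=logging.INFO)  # DEBUG records will be filtered out
logger = logging.getLogger(__name__)


class SlowUrl:
    """Hypothetical object whose string conversion is expensive."""

    def __init__(self, url: str):
        self.url = url

    def __str__(self) -> str:
        print("expensive __str__ ran")  # visible marker for the conversion cost
        return self.url


url = SlowUrl("https://example.com")

# f-string: the message is built (and __str__ runs) before logger.debug is even
# called, so the work happens even though the record is then discarded.
logger.debug(f"Fetching {url}")

# %-style params: logging checks the level first and only formats the message
# when the logger is enabled for DEBUG, so __str__ never runs here.
logger.debug("Fetching %s", url)
```

Running this prints the marker exactly once, from the f-string call; at DEBUG level both calls would emit the same message, so the difference only matters when the conversion is expensive or the message is usually filtered out.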

Collaborator (author):
Oh, I see, I hadn't thought of that

         return requests.get(url, allow_redirects=True)

     @staticmethod
2 changes: 1 addition & 1 deletion align_data/sources/blogs/wp_blog.py
@@ -28,7 +28,7 @@ def items_list(self):
         with tqdm(desc=f"Loading {self.name} pages") as pbar:
             while True:
                 paged_url = f"{self.feed_url}?paged={page_number}"
-                logging.info(f"Fetching {paged_url}")
+                logger.debug(f"Fetching {paged_url}")

                 feed = feedparser.parse(paged_url)
                 title = feed.get("feed", {}).get("title")
2 changes: 1 addition & 1 deletion align_data/sources/ebooks/agentmodels.py
@@ -21,7 +21,7 @@ def setup(self):
         super().setup()
         self.base_dir = self.raw_data_path / "agentmodels.org"
         if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
-            logger.info("Cloning repo")
+            logger.info(f"Cloning repo {self.repo}")
             Repo.clone_from(self.repo, self.base_dir)
         self.repository = Repo(self.base_dir)
         self.files_path = self.base_dir / "chapters"
2 changes: 1 addition & 1 deletion align_data/sources/stampy/stampy.py
@@ -49,7 +49,7 @@ def clean_text(text):
         answer = clean_text(entry["Rich Text"])
         url = "https://aisafety.info?state=" + entry["UI ID"]

-        logger.info(f"Processing {question}")
+        logger.debug(f"Processing {question}")

         return self.make_data_entry(
             {
4 changes: 2 additions & 2 deletions main.py
@@ -60,7 +60,7 @@ def fetch_all(self, *skip) -> None:
         """
         names = [name for name in ALL_DATASETS if name not in skip]
         for name in names:
-            print(name)
+            logger.debug(name)
             self.fetch(name)

     def generate_jsonl_files(self, *names):
@@ -74,7 +74,7 @@ def generate_jsonl_files(self, *names):
         assert not missing, f"{missing} are not valid dataset names"
         for name in names:
             dataset = get_dataset(name)
-            print(dataset.to_jsonl())
+            logger.info("%s", dataset.to_jsonl())

     def count_tokens(self, merged_dataset_path: str) -> None:
         """