Skip to content

Commit

Permalink
Copy Parse Markdown and Generate JSON from Source Repo
Browse files: browse the repository at this point in the history
  • Loading branch information
DmitryRyumin authored and github-actions[bot] committed Jan 21, 2024
1 parent 9c13be4 commit be0fe92
Showing 1 changed file with 24 additions and 1 deletion.
25 changes: 24 additions & 1 deletion code/markdown_to_json_parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -346,7 +346,30 @@ def parse_paper_links(html):
def extract_paper_data(paper_section, columns):
title_column = columns[0]
# title = title_column.get_text(strip=True)
title = title_column.a.encode_contents().decode("utf-8")
title = (
title_column.a.encode_contents().decode("utf-8")
if title_column.a is not None
else (
title_column.encode_contents().decode("utf-8")
if title_column.get_text(strip=True) is not None
else None
)
)

title = re.sub(r"<(?:br\s*/?>|img[^>]*>)", "", title)
title = title.strip()

html_entities = {
"&amp;": "&",
"&lt;": "<",
"&gt;": ">",
"&quot;": '"',
"&apos;": "'",
}
title = re.sub(
r"(&\w+;)", lambda x: html_entities.get(x.group(0), x.group(0)), title
)

title_link = title_column.find("a")
title_page = title_link["href"] if title_link else None

Expand Down

0 comments on commit be0fe92

Please sign in to comment.