-
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Retrieve and rewrite Detailed licensing pages content
- Loading branch information
Showing
5 changed files
with
223 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ on: | |
push: | ||
branches: | ||
- main | ||
workflow_dispatch: | ||
|
||
jobs: | ||
publish: | ||
|
98 changes: 98 additions & 0 deletions
98
scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from typing import Any | ||
|
||
from jinja2 import Template | ||
from pydantic import BaseModel | ||
from zimscraperlib.rewriting.html import HtmlRewriter | ||
|
||
from mindtouch2zim.client import LibraryPage, MindtouchClient | ||
from mindtouch2zim.constants import logger | ||
from mindtouch2zim.context import CONTEXT | ||
|
||
|
||
class LicenseStatistic(BaseModel):
    """Usage statistics of a single license across a whole text.

    The fields appear to mirror the keys of one entry of the `meta.licenses`
    list of the licensing report (label, version, percent, count, link).
    """

    # display name of the license
    label: str
    # license version; nullable since some licenses carry none
    version: str | None
    # share of pages under this license
    percent: float
    # number of pages under this license
    count: int
    # URL of the license text
    link: str
|
||
|
||
class LicenseInfo(BaseModel):
    """Licensing information of a text: per-license statistics plus details."""

    # one entry per license found in the text
    statistics: list[LicenseStatistic]
    # untyped list of detail entries -- presumably per-page data, TODO confirm
    details: list
|
||
|
||
class PageInfo(BaseModel):
    """Licensing details of one page and, recursively, of its sub-pages."""

    # display name of the license applied to the page
    license_label: str
    # The licensing report does not provide a version for every license (it is
    # known to be missing at least for "Undeclared" ones), so the version is
    # optional here, consistently with `LicenseStatistic.version`.
    license_version: str | None = None
    # URL of the page
    url: str
    # title of the page
    title: str
    # sub-pages, each described with the same structure
    children: list["PageInfo"]
|
||
|
||
def _get_licensing_report_data(cover_url: str) -> Any:
    """Fetch the licensing report of a text from the libretexts.org API.

    The retrieval logic has been adapted from the `buildLicensingReport`
    function at
    https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js
    which is probably coming from
    https://github.com/LibreTexts/Libretext/blob/master/public/DynamicLicensing/dynamicLicensing.js

    Args:
        cover_url: URL of the cover page of the text, appended as-is to the
            API endpoint.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        Whatever `raise_for_status()` raises when the API answers with an
        error status.
    """
    report_url = f"https://api.libretexts.org/endpoint/licensereport/{cover_url}"
    logger.debug(f"Calling API at {report_url}")
    # the Origin header acts as a kind of authorization header for this API
    request_headers = {"Origin": "https://www.libretexts.org"}
    response = CONTEXT.web_session.get(
        url=report_url,
        headers=request_headers,
        timeout=CONTEXT.http_timeout_long_seconds,
    )
    response.raise_for_status()
    return response.json()
|
||
|
||
def _render_html_from_data(jinja2_template: Template, licensing_data: Any) -> str: | ||
if not licensing_data.get("meta", {}).get("specialRestrictions", None): | ||
special_restrictions = None | ||
else: | ||
|
||
def get_restriction_label(restriction_key: str): | ||
if restriction_key == "noncommercial": | ||
return "Noncommercial" | ||
elif restriction_key == "noderivatives": | ||
return "No Derivatives" | ||
elif restriction_key == "fairuse": | ||
return "Fair Use" | ||
else: | ||
return restriction_key | ||
|
||
special_restrictions = ", ".join( | ||
[ | ||
get_restriction_label(restriction) | ||
for restriction in licensing_data["meta"]["specialRestrictions"] | ||
] | ||
) | ||
return jinja2_template.render( | ||
data=licensing_data, special_restrictions=special_restrictions | ||
) | ||
|
||
|
||
def rewrite_detailed_licensing(
    rewriter: HtmlRewriter,
    jinja2_template: Template,
    mindtouch_client: MindtouchClient,
    page: LibraryPage,
) -> str:
    """Build the static detailed licensing content of a libretexts.org page.

    The licensing report of the cover page of `page` is fetched, rendered with
    `jinja2_template` and then passed through `rewriter`.

    Returns:
        The `content` of the rewriting result.
    """
    cover_url = mindtouch_client.get_cover_page_encoded_url(page)
    report = _get_licensing_report_data(cover_url)
    rendered_html = _render_html_from_data(
        jinja2_template=jinja2_template,
        licensing_data=report,
    )
    return rewriter.rewrite(rendered_html).content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
scraper/src/mindtouch2zim/templates/libretexts.detailed-licensing.html.jinja2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
{# Detailed licensing page: overview of the text, then the license of every page. #}
{# Expects `data` (the licensing report) and `special_restrictions` (string or none). #}
<h2>Overview</h2>
<p>
  <strong>Title:</strong>
  <a href="{{ data.text.url }}" target="_blank" rel="noreferrer">{{ data.text.title }}</a>
</p>
<p><strong>Webpages:</strong> {{ data.text.totalPages }}</p>
{% if special_restrictions %}
<p>
  <strong>Applicable Restrictions:</strong>
  {{ special_restrictions }}
</p>
{% endif %}
<p><strong>All licenses found:</strong></p>
<ul>
  {% for license in data.meta.licenses %}
  <li>
    <a href="{{ license.link }}" target="_blank" rel="noreferrer">{{ license.label }}{% if license.version %} {{ license.version }}{% endif %}</a>:
    {{ license.percent }}% ({{ license.count }} {% if license.count > 1 %}pages{% else %}page{% endif %})
  </li>
  {% endfor %}
</ul>
<h2>By Page</h2>
{# Render one page as a list item, then recurse into its children as a nested list. #}
{% macro render_detail(detail) -%}
<li><a href="{{ detail.url }}" target="_blank" rel="noreferrer">{{ detail.title }}</a>
  {% if detail.license %}
  - <a href="{{ detail.license.link }}" target="_blank" rel="noreferrer"> <em>{{ detail.license.label }} {{ detail.license.version or "" }}</em></a>
  {% endif %}
  {% if detail.children %}
  <ul>
    {% for child in detail.children %}
    {{ render_detail(child) }}
    {% endfor %}
  </ul>
  {% endif %}
</li>
{% endmacro %}
<div style="column-count: 2; margin-top: 1em;">
  <ul style="margin: 0;">
    {{ render_detail(data.text) }}
  </ul>
</div>
65 changes: 65 additions & 0 deletions
65
scraper/tests-integration/libretexts/test_detailed_licensing.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from typing import Any | ||
|
||
import pytest | ||
from jinja2 import Environment, FileSystemLoader, select_autoescape | ||
|
||
from mindtouch2zim.constants import ROOT_DIR | ||
from mindtouch2zim.libretexts.detailed_licensing import ( | ||
_get_licensing_report_data, | ||
_render_html_from_data, | ||
) | ||
|
||
|
||
@pytest.fixture(scope="module")
def licensing_report_data() -> Any:
    """Licensing report of a sample geology book, fetched once per module."""
    cover_url = (
        "https://geo.libretexts.org/Courses/California_State_University_Los_Angeles/"
        "Book%3A_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)"
    )
    return _get_licensing_report_data(cover_url)
|
||
|
||
def test_get_licensing_report_data(licensing_report_data: Any):
    """Check we can still get licensing report data"""
    report = licensing_report_data
    assert report

    # statistics properties
    assert "meta" in report
    meta = report["meta"]
    assert "specialRestrictions" in meta
    assert "licenses" in meta
    assert isinstance(meta["licenses"], list)
    first_license = meta["licenses"][0]
    for key in ("label", "link", "version", "count"):
        assert key in first_license
    assert int(first_license["count"])
    assert "percent" in first_license
    assert float(first_license["percent"])
    assert "text" in report
    assert "totalPages" in report["text"]

    # details properties: walk the pages tree depth-first, parents first
    pending = [report["text"]]
    while pending:
        item = pending.pop()
        assert "license" in item
        page_license = item["license"]
        assert "label" in page_license
        assert "link" in page_license
        # optional property, not set at least for "Undeclared" license
        if page_license["label"] != "Undeclared":
            assert "version" in page_license
        for key in ("url", "title", "children"):
            assert key in item
        assert isinstance(item["children"], list)
        # reversed so that children are popped in their original order
        pending.extend(reversed(item["children"]))
|
||
|
||
def test_render_licensing_template(licensing_report_data: Any):
    """Check the detailed licensing template renders to a non-empty result."""
    templates_dir = ROOT_DIR.joinpath("templates")
    env = Environment(
        loader=FileSystemLoader(templates_dir),
        autoescape=select_autoescape(),
    )
    rendered = _render_html_from_data(
        env.get_template("libretexts.detailed-licensing.html.jinja2"),
        licensing_report_data,
    )
    assert rendered