Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retrieve and rewrite Detailed licensing pages content #98

Open
wants to merge 1 commit into
base: detect_special_pages
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/PublishDockerDevImage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
push:
branches:
- main
workflow_dispatch:

jobs:
publish:
Expand Down
98 changes: 98 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from typing import Any

from jinja2 import Template
from pydantic import BaseModel
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.constants import logger
from mindtouch2zim.context import CONTEXT


class LicenseStatistic(BaseModel):
    """Usage figures of one license, as listed in a licensing report.

    NOTE(review): the meaning of each field is inferred from its name and from
    the way the detailed licensing template displays license entries (label,
    optional version, percent, page count, link) -- confirm against the API.
    """

    label: str
    # Nullable *and* defaulted: with only the `str | None` annotation the key
    # would still be mandatory in the input, even though its value may be None.
    version: str | None = None
    percent: float
    count: int
    link: str


class LicenseInfo(BaseModel):
    """Licensing report container: license statistics plus a list of details.

    NOTE(review): the item type of `details` is not constrained here; it
    presumably holds per-page entries -- confirm before tightening the type.
    """

    # one entry per license
    statistics: list[LicenseStatistic]
    # untyped list, items are accepted as-is
    details: list


class PageInfo(BaseModel):
    """Licensing details of one page, with its sub-pages nested recursively."""

    license_label: str
    # Optional: the licensing API does not always provide a version for a
    # license (presumably e.g. for "Undeclared" ones -- verify against the API),
    # so a missing version must not make the whole page entry invalid.
    license_version: str | None = None
    url: str
    title: str
    # forward reference to this very model, resolved once the class exists
    children: list["PageInfo"]


def _get_licensing_report_data(cover_url: str) -> Any:
    """
    Fetch the licensing report of a book from the libretexts.org API

    The request mirrors what the `buildLicensingReport` function does in
    https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js

    which has probably been built from
    https://github.com/LibreTexts/Libretext/blob/master/public/DynamicLicensing/dynamicLicensing.js

    Parameters:
        cover_url: value appended as-is after the `licensereport/` endpoint path

    Returns the decoded JSON body of the response. An HTTP error status is
    propagated through `raise_for_status()`.
    """
    endpoint = f"https://api.libretexts.org/endpoint/licensereport/{cover_url}"
    logger.debug(f"Calling API at {endpoint}")
    # the Origin header seems to be used by the API as a kind of authorization
    request_headers = {"Origin": "https://www.libretexts.org"}
    response = CONTEXT.web_session.get(
        url=endpoint,
        headers=request_headers,
        timeout=CONTEXT.http_timeout_long_seconds,
    )
    response.raise_for_status()
    return response.json()

Check warning on line 51 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L50-L51

Added lines #L50 - L51 were not covered by tests


def _render_html_from_data(jinja2_template: Template, licensing_data: Any) -> str:
if not licensing_data.get("meta", {}).get("specialRestrictions", None):
special_restrictions = None

Check warning on line 56 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L56

Added line #L56 was not covered by tests
else:

def get_restriction_label(restriction_key: str):

Check warning on line 59 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L59

Added line #L59 was not covered by tests
if restriction_key == "noncommercial":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use a simple map

return {"noderivatives": "No Derivatives", ...}.get(restriction_key, restriction_key)

return "Noncommercial"

Check warning on line 61 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L61

Added line #L61 was not covered by tests
elif restriction_key == "noderivatives":
return "No Derivatives"

Check warning on line 63 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L63

Added line #L63 was not covered by tests
elif restriction_key == "fairuse":
return "Fair Use"

Check warning on line 65 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L65

Added line #L65 was not covered by tests
else:
return restriction_key

Check warning on line 67 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L67

Added line #L67 was not covered by tests

special_restrictions = ", ".join(

Check warning on line 69 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L69

Added line #L69 was not covered by tests
[
get_restriction_label(restriction)
for restriction in licensing_data["meta"]["specialRestrictions"]
]
)
return jinja2_template.render(

Check warning on line 75 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L75

Added line #L75 was not covered by tests
data=licensing_data, special_restrictions=special_restrictions
)


def rewrite_detailed_licensing(
    rewriter: HtmlRewriter,
    jinja2_template: Template,
    mindtouch_client: MindtouchClient,
    page: LibraryPage,
) -> str:
    """
    Build a static version of the libretexts.org detailed licensing page

    The licensing report is retrieved for the encoded cover page URL that
    `mindtouch_client` gives for `page`, rendered with `jinja2_template`, and
    the resulting HTML is passed through `rewriter`; the rewritten content is
    returned.
    """
    cover_url = mindtouch_client.get_cover_page_encoded_url(page)
    report = _get_licensing_report_data(cover_url)
    html = _render_html_from_data(
        jinja2_template=jinja2_template,
        licensing_data=report,
    )
    return rewriter.rewrite(html).content
19 changes: 18 additions & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from mindtouch2zim.errors import NoIllustrationFoundError
from mindtouch2zim.html import get_text
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
from mindtouch2zim.libretexts.detailed_licensing import rewrite_detailed_licensing
from mindtouch2zim.libretexts.glossary import rewrite_glossary
from mindtouch2zim.libretexts.index import rewrite_index
from mindtouch2zim.ui import (
Expand Down Expand Up @@ -224,6 +225,9 @@
self.libretexts_index_template = self.jinja2_env.get_template(
"libretexts.index.html"
)
self.libretexts_detailed_licensing_template = self.jinja2_env.get_template(

Check warning on line 228 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L228

Added line #L228 was not covered by tests
"libretexts.detailed-licensing.html.jinja2"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't we agree on using .html extensions?

)

# Start creator early to detect problems early.
with creator as creator:
Expand Down Expand Up @@ -514,6 +518,20 @@
jinja2_template=self.libretexts_glossary_template,
original_content=page_content.html_body,
)
elif (
"https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js"
in page_content.html_body
):
logger.debug(

Check warning on line 525 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L525

Added line #L525 was not covered by tests
f"Rewriting {CONTEXT.processing_step} as libretexts.org "
"detailed licensing"
)
rewriten = rewrite_detailed_licensing(

Check warning on line 529 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L529

Added line #L529 was not covered by tests
rewriter=rewriter,
jinja2_template=self.libretexts_detailed_licensing_template,
mindtouch_client=self.mindtouch_client,
page=page,
)
except Exception as exc:
# code has been tested to work "in general", but many edge cases occur
# and since these pages are absolutely not essential, we just display a
Expand All @@ -522,7 +540,6 @@
f"Problem processing special {CONTEXT.processing_step}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{# Overview: book title, number of pages, optional restrictions and the usage of each license #}
<h2>Overview</h2>
<p>
  <strong>Title:</strong>
  <a href="{{ data.text.url }}" target="_blank" rel="noreferrer">{{ data.text.title }}</a>
</p>
{# a space separates the label from its value, like on the other lines #}
<p><strong>Webpages:</strong> {{ data.text.totalPages }}</p>
{# special_restrictions is a ready-to-display string, or empty when there is none #}
{% if special_restrictions %}
<p>
  <strong>Applicable Restrictions:</strong>
  {{ special_restrictions }}
</p>
{% endif %}
<p><strong>All licenses found:</strong></p>
<ul>
  {% for license in data.meta.licenses %}
  <li>
    <a href="{{ license.link }}" target="_blank" rel="noreferrer">{{ license.label }}{% if license.version %}&nbsp;{{ license.version }}{% endif %}</a>:
    {{ license.percent }}% ({{ license.count }} {% if license.count > 1 %}pages{% else %}page{% endif %})
  </li>
  {% endfor %}
</ul>
<h2>By Page</h2>
{# Render one page as a list item (title, license when known), then recurse into its children #}
{% macro render_detail(detail) -%}
<li><a href="{{ detail.url }}" target="_blank" rel="noreferrer">{{ detail.title }}</a>
  {% if detail.license %}
  - <a href="{{ detail.license.link }}" target="_blank" rel="noreferrer"> <em>{{ detail.license.label }} {{ detail.license.version or "" }}</em></a>
  {% endif %}
  {% if detail.children %}
  <ul>
    {% for child in detail.children %}
    {{ render_detail(child) }}
    {% endfor %}
  </ul>
  {% endif %}
</li>
{% endmacro %}
{# The whole tree starts from the book itself and is laid out on two columns #}
<div style="column-count: 2; margin-top: 1em;">
  <ul style="margin: 0;">
    {{ render_detail(data.text) }}
  </ul>
</div>
65 changes: 65 additions & 0 deletions scraper/tests-integration/libretexts/test_detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import Any

import pytest
from jinja2 import Environment, FileSystemLoader, select_autoescape

from mindtouch2zim.constants import ROOT_DIR
from mindtouch2zim.libretexts.detailed_licensing import (
_get_licensing_report_data,
_render_html_from_data,
)


@pytest.fixture(scope="module")
def licensing_report_data() -> Any:
    """Licensing report of a sample geology book, retrieved once per test module"""
    book_url = (
        "https://geo.libretexts.org/Courses/California_State_University_Los_Angeles/"
        "Book%3A_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)"
    )
    return _get_licensing_report_data(book_url)


def test_get_licensing_report_data(licensing_report_data: Any):
    """Check we can still get licensing report data"""

    assert licensing_report_data

    # statistics properties
    assert "meta" in licensing_report_data
    meta = licensing_report_data["meta"]
    assert "specialRestrictions" in meta
    assert "licenses" in meta
    licenses = meta["licenses"]
    assert isinstance(licenses, list)
    # fail with an explicit message instead of an IndexError on the next line
    assert licenses, "licensing report has no license statistics"
    first_license = licenses[0]
    for key in ("label", "link", "version", "count", "percent"):
        assert key in first_license
    # values must be numeric; compare explicitly so that a legitimate zero
    # (falsy once converted) does not make the test fail
    assert int(first_license["count"]) >= 0
    assert float(first_license["percent"]) >= 0
    assert "text" in licensing_report_data
    assert "totalPages" in licensing_report_data["text"]

    # details properties
    def check_item(data: Any):
        """Recursively check the properties of a page entry and its children"""
        assert "license" in data
        assert "label" in data["license"]
        assert "link" in data["license"]
        # optional property, not set at least for "Undeclared" license
        if data["license"]["label"] != "Undeclared":
            assert "version" in data["license"]
        assert "url" in data
        assert "title" in data
        assert "children" in data
        assert isinstance(data["children"], list)
        for child in data["children"]:
            check_item(child)

    check_item(licensing_report_data["text"])


def test_render_licensing_template(licensing_report_data: Any):
    """Check the detailed licensing template renders to something not empty"""
    # NOTE(review): with its default arguments, select_autoescape() seems to
    # only enable escaping for template names ending in .html/.htm/.xml, so a
    # name ending in `.jinja2` is probably rendered unescaped here -- confirm
    # and align with the environment used by the scraper itself.
    templates_dir = ROOT_DIR.joinpath("templates")
    environment = Environment(
        loader=FileSystemLoader(templates_dir),
        autoescape=select_autoescape(),
    )
    licensing_template = environment.get_template(
        "libretexts.detailed-licensing.html.jinja2"
    )
    rendered = _render_html_from_data(licensing_template, licensing_report_data)
    assert rendered