Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce forceRender param #1901

Merged
merged 3 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 34 additions & 14 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,19 +166,39 @@
}
}

public async setBaseUrls() {
//* Objects order in array matters!
this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])

//* Objects order in array matters!
this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])

public async setBaseUrls(forceRender = null) {
if (!forceRender) {
//* Objects order in array matters!
this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])

//* Objects order in array matters!
this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])
} else {
switch (forceRender) {
case 'WikimediaDesktop':
if (MediaWiki.hasWikimediaDesktopRestApi()) {
this.baseUrl = MediaWiki.desktopRestApiUrl.href
this.baseUrlForMainPage = MediaWiki.desktopRestApiUrl.href
break
}
break
case 'VisualEditor':

Check warning on line 191 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L190-L191

Added lines #L190 - L191 were not covered by tests
if (MediaWiki.hasVisualEditorApi()) {
this.baseUrl = MediaWiki.visualEditorApiUrl.href
this.baseUrlForMainPage = MediaWiki.visualEditorApiUrl.href
break

Check warning on line 195 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L193-L195

Added lines #L193 - L195 were not covered by tests
}
break
default:
throw new Error('Unable to find specific API end-point to retrieve article HTML')

Check warning on line 199 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L197-L199

Added lines #L197 - L199 were not covered by tests
}
}
logger.log('Base Url: ', this.baseUrl)
logger.log('Base Url for Main Page: ', this.baseUrlForMainPage)

Expand Down Expand Up @@ -625,7 +645,7 @@
const articleData = await this.getJSON<any>(articleApiUrl)

if (articleData.error) {
const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`
const errorMessage = `Unable to retrieve js/css dependencies for article '${this}': ${articleData.error.code}`

Check warning on line 648 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L648

Added line #L648 was not covered by tests
logger.error(errorMessage)

/* If article is missing (for example because it just has been deleted) */
Expand Down
43 changes: 26 additions & 17 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
customZimFavicon,
optimisationCacheUrl,
customFlavour,
forceRender,
} = argv

let { articleList, articleListToIgnore } = argv
Expand Down Expand Up @@ -212,8 +213,7 @@
await MediaWiki.hasCoordinates(downloader)
await MediaWiki.hasWikimediaDesktopRestApi()
await MediaWiki.hasVisualEditorApi()

await downloader.setBaseUrls()
await downloader.setBaseUrls(forceRender)

RedisStore.setOptions(argv.redis || config.defaults.redisPath)
await RedisStore.connect()
Expand Down Expand Up @@ -420,7 +420,7 @@

logger.log('Getting articles')
stime = Date.now()
const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump)
const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump, forceRender)
logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`)

logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`)
Expand Down Expand Up @@ -607,32 +607,41 @@
return mainPage ? createMainPageRedirect() : createMainPage()
}

async function fetchArticleDetail(articleId: string) {
return await articleDetailXId.get(articleId)

Check notice on line 611 in src/mwoffliner.lib.ts

View check run for this annotation

codefactor.io / CodeFactor

src/mwoffliner.lib.ts#L611

Unnecessary 'await'. (no-return-await)

Check warning on line 611 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L610-L611

Added lines #L610 - L611 were not covered by tests
}

async function updateArticleThumbnail(articleDetail: any, articleId: string) {
const imageUrl = articleDetail.thumbnail

Check warning on line 615 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L614-L615

Added lines #L614 - L615 were not covered by tests
if (!imageUrl) return

const { width: oldWidth } = getSizeFromUrl(imageUrl.source)
const suitableResUrl = imageUrl.source.replace(`/${oldWidth}px-`, '/500px-').replace(`-${oldWidth}px-`, '-500px-')
const { mult, width } = getSizeFromUrl(suitableResUrl)
const path = getMediaBase(suitableResUrl, false)

Check warning on line 621 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L618-L621

Added lines #L618 - L621 were not covered by tests

articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I')

Check warning on line 623 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L623

Added line #L623 was not covered by tests

await Promise.all([filesToDownloadXPath.set(path, { url: urlHelper.serializeUrl(suitableResUrl), mult, width } as FileDetail), articleDetailXId.set(articleId, articleDetail)])

Check warning on line 625 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L625

Added line #L625 was not covered by tests
}

async function getThumbnailsData(): Promise<void> {
if (customMainPage || !articleList || articleListLines.length <= MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE) return

logger.log('Updating article thumbnails for articles')

let articleIndex = 0
let articlesWithImages = 0

while (articleIndex < articleListLines.length && articlesWithImages <= 100) {
const articleId = articleListLines[articleIndex]
articleIndex++

try {
const articleDetail = await articleDetailXId.get(articleId)
const articleDetail = await fetchArticleDetail(articleId)

Check warning on line 641 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L641

Added line #L641 was not covered by tests
if (!articleDetail) continue

const imageUrl = articleDetail.thumbnail
if (!imageUrl) continue

const { width: oldWidth } = getSizeFromUrl(imageUrl.source)
const suitableResUrl = imageUrl.source.replace(`/${oldWidth}px-`, '/500px-').replace(`-${oldWidth}px-`, '-500px-')
const { mult, width } = getSizeFromUrl(suitableResUrl)
const path = getMediaBase(suitableResUrl, false)
articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I')

await Promise.all([
filesToDownloadXPath.set(path, { url: urlHelper.serializeUrl(suitableResUrl), mult, width } as FileDetail),
articleDetailXId.set(articleId, articleDetail),
])
await updateArticleThumbnail(articleDetail, articleId)

Check warning on line 644 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L644

Added line #L644 was not covered by tests
articlesWithImages++
} catch (err) {
logger.warn(`Failed to parse thumbnail for [${articleId}], skipping...`)
Expand Down
2 changes: 2 additions & 0 deletions src/parameterList.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ export const parameterDescriptions = {
osTmpDir: 'Override default operating system temporary directory path environment variable',
customFlavour: 'A custom processor that can filter and process articles (see extensions/*.js)',
optimisationCacheUrl: 'S3 url, including credentials and bucket name',
forceRender:
'Force the usage of a specific API end-point/render, automatically chosen otherwise. Accepted values: [ VisualEditor, WikimediaDesktop, WikimediaMobile ]. More details at https://github.com/openzim/mwoffliner/wiki/API-end-points',
}

// TODO: Add an interface based on the object above
95 changes: 49 additions & 46 deletions src/renderers/abstract.renderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -576,9 +576,7 @@
return false
}

private applyOtherTreatments(parsoidDoc: DominoElement, dump: Dump) {
const filtersConfig = config.filters

private clearLinkAndInputTags(parsoidDoc: DominoElement, filtersConfig: any, dump: Dump) {
/* Don't need <link> and <input> tags */
const nodesToDelete: Array<{ class?: string; tag?: string; filter?: (n: any) => boolean }> = [{ tag: 'link' }, { tag: 'input' }]

Expand Down Expand Up @@ -646,6 +644,42 @@
}
}
}
}

private clearNodes(parsoidDoc: DominoElement, filtersConfig: any) {
const allNodes: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('*'))
for (const node of allNodes) {
node.removeAttribute('data-parsoid')
node.removeAttribute('typeof')
node.removeAttribute('about')
node.removeAttribute('data-mw')

if (node.getAttribute('rel') && node.getAttribute('rel').substr(0, 3) === 'mw:') {
node.removeAttribute('rel')
} else if (node.getAttribute('img')) {
/* Remove a few images Parsoid attributes */
node.removeAttribute('data-file-width')
node.removeAttribute('data-file-height')
node.removeAttribute('data-file-type')

Check warning on line 663 in src/renderers/abstract.renderer.ts

View check run for this annotation

Codecov / codecov/patch

src/renderers/abstract.renderer.ts#L661-L663

Added lines #L661 - L663 were not covered by tests
}

/* Remove a few css calls */
filtersConfig.cssClassCallsBlackList.map((classname: string) => {
if (node.getAttribute('class')) {
node.setAttribute('class', node.getAttribute('class').replace(classname, ''))
}
})
}

const kartographerMaplinkNodes = Array.from<DominoElement>(parsoidDoc.querySelectorAll('.mw-kartographer-maplink')).filter((n) => !!n.textContent)
for (const node of kartographerMaplinkNodes) {
node.textContent = '🌍'

Check warning on line 676 in src/renderers/abstract.renderer.ts

View check run for this annotation

Codecov / codecov/patch

src/renderers/abstract.renderer.ts#L676

Added line #L676 was not covered by tests
}
}

private applyOtherTreatments(parsoidDoc: DominoElement, dump: Dump) {
const filtersConfig = config.filters
this.clearLinkAndInputTags(parsoidDoc, filtersConfig, dump)

/* Go through all reference calls */
const spans: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('span'))
Expand Down Expand Up @@ -682,53 +716,22 @@
/* Remove empty paragraphs */
// TODO: Refactor this option to work with page/html and page/mobile-html output. See issues/1866
if (!dump.opts.keepEmptyParagraphs) {
if (!dump.opts.keepEmptyParagraphs) {
// Mobile view === details
// Desktop view === section
const sections: DominoElement[] = Array.from(parsoidDoc.querySelectorAll('details, section'))
for (const section of sections) {
if (
section.children.length ===
Array.from(section.children).filter((child: DominoElement) => {
return child.matches('summary')
}).length
) {
DU.deleteNode(section)
}
// Mobile view === details
// Desktop view === section
const sections: DominoElement[] = Array.from(parsoidDoc.querySelectorAll('details, section'))
for (const section of sections) {
if (
section.children.length ===
Array.from(section.children).filter((child: DominoElement) => {
return child.matches('summary')
}).length
) {
DU.deleteNode(section)
}
}
}

/* Clean the DOM of all unnecessary code */
const allNodes: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('*'))
for (const node of allNodes) {
node.removeAttribute('data-parsoid')
node.removeAttribute('typeof')
node.removeAttribute('about')
node.removeAttribute('data-mw')

if (node.getAttribute('rel') && node.getAttribute('rel').substr(0, 3) === 'mw:') {
node.removeAttribute('rel')
} else if (node.getAttribute('img')) {
/* Remove a few images Parsoid attributes */
node.removeAttribute('data-file-width')
node.removeAttribute('data-file-height')
node.removeAttribute('data-file-type')
}

/* Remove a few css calls */
filtersConfig.cssClassCallsBlackList.map((classname: string) => {
if (node.getAttribute('class')) {
node.setAttribute('class', node.getAttribute('class').replace(classname, ''))
}
})
}

const kartographerMaplinkNodes = Array.from<DominoElement>(parsoidDoc.querySelectorAll('.mw-kartographer-maplink')).filter((n) => !!n.textContent)
for (const node of kartographerMaplinkNodes) {
node.textContent = '🌍'
}

this.clearNodes(parsoidDoc, filtersConfig)
return parsoidDoc
}

Expand Down
30 changes: 29 additions & 1 deletion src/sanitize-argument.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,19 @@ const parametersWithArrayType = ['format']

export async function sanitize_all(argv: any) {
// extracting all arguments
const { articleList, addNamespaces, speed: _speed, adminEmail, mwUrl, customZimFavicon, optimisationCacheUrl, verbose, customZimLongDescription, customZimDescription } = argv
const {
articleList,
addNamespaces,
speed: _speed,
adminEmail,
mwUrl,
customZimFavicon,
optimisationCacheUrl,
verbose,
customZimLongDescription,
customZimDescription,
forceRender,
} = argv

sanitizeDoubleUsedParameters(argv)

Expand Down Expand Up @@ -73,6 +85,11 @@ export async function sanitize_all(argv: any) {
// sanitizing adminEmail
sanitize_adminEmail(adminEmail)

// sanitizing renderer
if (forceRender) {
sanitize_forceRender(forceRender)
}

// Redis client sanitization
// created a redis client and then closed it.
sanitize_redis(argv)
Expand Down Expand Up @@ -173,3 +190,14 @@ export function sanitize_customFlavour(customFlavour: string): string {
}) || null
)
}

/**
 * Validates the value supplied for the `forceRender` parameter.
 *
 * @param renderName - Render name to check (compared case-sensitively).
 * @returns The unchanged `renderName` when it is one of the accepted names.
 * @throws Error when `renderName` is not one of the accepted render names.
 */
export function sanitize_forceRender(renderName: string): string {
  const acceptedRenderNames: string[] = ['VisualEditor', 'WikimediaDesktop', 'WikimediaMobile']

  // Reject anything outside the accepted list up front.
  if (!acceptedRenderNames.includes(renderName)) {
    throw new Error(`Invalid render name: ${renderName}`)
  }

  return renderName
}
16 changes: 13 additions & 3 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@
/*
* Fetch Articles
*/
export async function saveArticles(zimCreator: ZimCreator, downloader: Downloader, dump: Dump) {
export async function saveArticles(zimCreator: ZimCreator, downloader: Downloader, dump: Dump, forceRender = null) {
const jsModuleDependencies = new Set<string>()
const cssModuleDependencies = new Set<string>()
let jsConfigVars = ''
Expand All @@ -258,9 +258,19 @@
const articlesTotal = await articleDetailXId.len()

const rendererBuilder = new RendererBuilder()
const rendererBuilderOptions: RendererBuilderOptions = {
renderType: 'auto',

let rendererBuilderOptions: RendererBuilderOptions
if (forceRender) {
rendererBuilderOptions = {
renderType: 'specific',
renderName: forceRender,
}
} else {
rendererBuilderOptions = {
renderType: 'auto',
}
}

const mainPageRenderer = await rendererBuilder.createRenderer(rendererBuilderOptions)
// TODO: article renderer will be switched to the mobile mode later
const articlesRenderer = await rendererBuilder.createRenderer(rendererBuilderOptions)
Expand All @@ -273,142 +283,142 @@
const timeout = Math.max(downloader.requestTimeout * 2, 10 * 60 * 1000)

await articleDetailXId.iterateItems(downloader.speed, (articleKeyValuePairs, workerId) => {
return new Promise(async (resolve, reject) => {
/*
* timer to detect freezes
*/
let curStage = 0
let curArticle = ''
const timer = new Timer(() => {
const errorMessage = `Worker timed out at ${stages[curStage]} ${curArticle}`
logger.error(errorMessage)
reject(new Error(errorMessage))
}, timeout)

logger.info(`Worker [${workerId}] processing batch of article ids [${logger.logifyArray(Object.keys(articleKeyValuePairs))}]`)

const parsePromiseQueue: [string, Promise<Error>][] = []

for (const [articleId, articleDetail] of Object.entries(articleKeyValuePairs)) {
timer.reset()
curStage = 0
curArticle = articleId
const promises: [string, Promise<Error>][] = []

const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

let rets: any
try {
const articleUrl = getArticleUrl(downloader, dump, articleId)
if (dump.isMainPage) {
rets = await downloader.getArticle(
downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
mainPageRenderer,
articleUrl,
dump,
articleDetail,
dump.isMainPage(articleId),
)
}
rets = await downloader.getArticle(
downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articlesRenderer,
articleUrl,
dump,
articleDetail,
dump.isMainPage(articleId),
)

for (const { articleId, displayTitle: articleTitle, html: finalHTML, mediaDependencies, subtitles } of rets) {
if (!finalHTML) {
logger.warn(`No HTML returned for article [${articleId}], skipping`)
continue
}

curStage += 1
for (const dep of _moduleDependencies.jsDependenciesList) {
jsModuleDependencies.add(dep)
}
for (const dep of _moduleDependencies.styleDependenciesList) {
cssModuleDependencies.add(dep)
}
jsConfigVars = jsConfigVars || _moduleDependencies.jsConfigVars

/*
* getModuleDependencies and downloader.getArticle are
* network heavy while parsing and saving is I/O.
* To parse and download simultaneously, we don't await on save,
* but instead cache the promise in a queue and check it later
*/
promises.push([articleId, saveArticle(zimCreator, finalHTML, mediaDependencies, subtitles, articleId, articleTitle, articleDetail)])
}
} catch (err) {
dump.status.articles.fail += 1
logger.error(`Error downloading article ${articleId}`)
if ((!err.response || err.response.status !== 404) && err.message !== DELETED_ARTICLE_ERROR) {
reject(cleanupAxiosError(err))
return
}
}

if (parsePromiseQueue.length) {
curStage += 1
const [articleId, parsePromise] = parsePromiseQueue.shift()
curArticle = articleId
/*
* in normal circumstances, where downloading is slower than
* saving, this promise will always be resolved here already
*/
const err = await parsePromise
if (err) {
console.log(err)

logger.error(`Error parsing article ${articleId}`)
timer.clear()
reject(err)
return
}
dump.status.articles.success += 1
}

if (promises.length) {
parsePromiseQueue.push(flattenPromises(promises))
}

if ((dump.status.articles.success + dump.status.articles.fail) % 10 === 0) {
const percentProgress = (((dump.status.articles.success + dump.status.articles.fail) / articlesTotal) * 100).toFixed(1)
if (percentProgress !== prevPercentProgress) {
prevPercentProgress = percentProgress
logger.log(`Progress downloading articles [${dump.status.articles.success + dump.status.articles.fail}/${articlesTotal}] [${percentProgress}%]`)
}
}
}

/*
* clear up potentially still pending promises
*/
curStage += 1
if (parsePromiseQueue.length) {
const [articleId, parsePromise] = flattenPromises(parsePromiseQueue)
curArticle = articleId
const err = await parsePromise
if (err) {
timer.clear()
reject(err)
return
}
dump.status.articles.success += parsePromiseQueue.length
}

timer.clear()
resolve()
})

Check notice on line 421 in src/util/saveArticles.ts

View check run for this annotation

codefactor.io / CodeFactor

src/util/saveArticles.ts#L286-L421

Complex Method
})

logger.log(`Done with downloading a total of [${articlesTotal}] articles`)
Expand Down
Loading
Loading