From 01ab15e8518fa89cdcaf835dad731901d083c29b Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Thu, 12 Oct 2023 17:27:56 +0300 Subject: [PATCH] Switchover to WikimediaDesktop render while using MediawikiRESTApi, add temp workaround with TITLE_PLACEHOLDER for URL builders --- src/MediaWiki.ts | 10 +-- src/renderers/renderer.builder.ts | 7 +-- src/renderers/wikimedia-rest-api.renderer.ts | 62 ------------------- src/util/builders/url/api.director.ts | 3 +- src/util/builders/url/base.director.ts | 5 +- .../url/mediawiki-rest-api.director.ts | 2 +- src/util/builders/url/url.builder.ts | 5 ++ src/util/const.ts | 1 + src/util/saveArticles.ts | 3 +- test/unit/builders/url/api.director.test.ts | 2 +- test/unit/builders/url/base.director.test.ts | 2 +- test/unit/saveArticles.test.ts | 3 + 12 files changed, 27 insertions(+), 78 deletions(-) delete mode 100644 src/renderers/wikimedia-rest-api.renderer.ts diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index 28d158dd..1f5895c3 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -13,7 +13,7 @@ import DesktopURLDirector from './util/builders/url/desktop.director.js' import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js' import MediaWikiRESTApiDirector from './util/builders/url/mediawiki-rest-api.director.js' import { checkApiAvailability } from './util/mw-api.js' -import { BLACKLISTED_NS } from './util/const.js' +import { BLACKLISTED_NS, TITLE_PLACEHOLDER } from './util/const.js' export interface QueryOpts { action: string @@ -114,7 +114,7 @@ class MediaWiki { this.namespaces = {} this.namespacesToMirror = [] - this.#mediawikiRESTApiPath = 'w/rest.php/v1/page' + this.#mediawikiRESTApiPath = `w/rest.php/v1/page/${TITLE_PLACEHOLDER}/html` this.#apiPath = 'w/api.php' this.#wikiPath = 'wiki/' this.apiCheckArticleId = 'MediaWiki:Sidebar' @@ -141,7 +141,7 @@ class MediaWiki { public async hasWikimediaDesktopRestApi(): Promise { if (this.#hasWikimediaDesktopRestApi === null) { - this.#hasWikimediaDesktopRestApi = await checkApiAvailability(this.wikimediaDesktopUrlDirector.buildArticleURL(this.apiCheckArticleId)) + this.#hasWikimediaDesktopRestApi = await checkApiAvailability(this.wikimediaDesktopUrlDirector.buildArticleURL(this.apiCheckArticleId).replace(`/${TITLE_PLACEHOLDER}`, '')) return this.#hasWikimediaDesktopRestApi } return this.#hasWikimediaDesktopRestApi @@ -149,7 +149,7 @@ class MediaWiki { public async hasVisualEditorApi(): Promise { if (this.#hasVisualEditorApi === null) { - this.#hasVisualEditorApi = await checkApiAvailability(this.visualEditorURLDirector.buildArticleURL(this.apiCheckArticleId)) + this.#hasVisualEditorApi = await checkApiAvailability(this.visualEditorURLDirector.buildArticleURL(this.apiCheckArticleId).replace(`&page=${TITLE_PLACEHOLDER}`, '')) return this.#hasVisualEditorApi } return this.#hasVisualEditorApi @@ -157,7 +157,7 @@ class MediaWiki { public async hasMediaWikiRESTApi(): Promise { if (this.#hasMediaWikiRESTApi === null) { - this.#hasMediaWikiRESTApi = await checkApiAvailability(this.mediaWikiRESTApiDirector.buildArticleURL(this.apiCheckArticleId)) + this.#hasMediaWikiRESTApi = await checkApiAvailability(this.mediaWikiRESTApiDirector.buildArticleURL(this.apiCheckArticleId).replace(`${TITLE_PLACEHOLDER}/html/`, '')) return this.#hasMediaWikiRESTApi } return this.#hasMediaWikiRESTApi diff --git a/src/renderers/renderer.builder.ts b/src/renderers/renderer.builder.ts index 140fa86f..17aab800 100644 --- a/src/renderers/renderer.builder.ts +++ b/src/renderers/renderer.builder.ts @@ -3,7 +3,6 @@ import { Renderer } from './abstract.renderer.js' import { VisualEditorRenderer } from './visual-editor.renderer.js' import { WikimediaDesktopRenderer } from './wikimedia-desktop.renderer.js' import { RendererBuilderOptions } from './abstract.renderer.js' -import { MediawikiRESTApiRenderer } from './wikimedia-rest-api.renderer.js' import * as logger from './../Logger.js' export class RendererBuilder { @@ -24,7 +23,7 @@ export class RendererBuilder { } else if (hasVisualEditorApi) { return new VisualEditorRenderer() } else if (hasMediaWikiRESTApi) { - return new MediawikiRESTApiRenderer() + return new WikimediaDesktopRenderer() } else { logger.error('No available desktop renderer.') process.exit(1) @@ -39,7 +38,7 @@ export class RendererBuilder { } else if (hasVisualEditorApi) { return new VisualEditorRenderer() } else if (hasMediaWikiRESTApi) { - return new MediawikiRESTApiRenderer() + return new WikimediaDesktopRenderer() } else { logger.error('No render available at all.') process.exit(1) @@ -61,7 +60,7 @@ export class RendererBuilder { process.exit(1) case 'MediawikiRESTApi': if (hasMediaWikiRESTApi) { - return new MediawikiRESTApiRenderer() + return new WikimediaDesktopRenderer() } logger.error('Cannot create an instance of MediawikiRESTApi renderer.') process.exit(1) diff --git a/src/renderers/wikimedia-rest-api.renderer.ts b/src/renderers/wikimedia-rest-api.renderer.ts deleted file mode 100644 index 3bacc1bf..00000000 --- a/src/renderers/wikimedia-rest-api.renderer.ts +++ /dev/null @@ -1,62 +0,0 @@ -import * as logger from '../Logger.js' -import { Renderer } from './abstract.renderer.js' -import { getStrippedTitleFromHtml } from '../util/misc.js' -import { RenderOpts, RenderOutput } from './abstract.renderer.js' - -/* -Represent 'https://{wikimedia-wiki}/w/rest.php/v1/page/{title}/with_html' -or -'https://{3rd-part-wikimedia-wiki}/w/rest.php/v1/page/{title}/with_html' -*/ -export class MediawikiRESTApiRenderer extends Renderer { - constructor() { - super() - } - - private async retrieveHtml(renderOpts: RenderOpts): Promise { - const { data, articleId, articleDetail, isMainPage } = renderOpts - - if (!data) { - throw new Error(`Cannot render [${data}] into an article`) - } - - let html: string - let displayTitle: string - let strippedTitle: string - - if (data.html) { - html = isMainPage ? data.html : super.injectH1TitleToHtml(data.html, articleDetail) - strippedTitle = getStrippedTitleFromHtml(html) - displayTitle = strippedTitle || articleId.replace('_', ' ') - return { html, displayTitle } - } else if (data.errorKey) { - logger.error(`Error in retrieved article [${articleId}]:`, data.errorKey) - return '' - } - logger.error('Unable to parse data from visual editor') - return '' - } - - public async render(renderOpts: RenderOpts): Promise { - try { - const result: RenderOutput = [] - const { articleId, articleDetail, webp, _moduleDependencies, dump } = renderOpts - const { html, displayTitle } = await this.retrieveHtml(renderOpts) - if (html) { - const { finalHTML, mediaDependencies, subtitles } = await super.processHtml(html, dump, articleId, articleDetail, _moduleDependencies, webp) - result.push({ - articleId, - displayTitle, - html: finalHTML, - mediaDependencies, - subtitles, - }) - return result - } - return '' - } catch (err) { - logger.error(err.message) - throw new Error(err.message) - } - } -} diff --git a/src/util/builders/url/api.director.ts b/src/util/builders/url/api.director.ts index 6651d7c6..fe5a0f48 100644 --- a/src/util/builders/url/api.director.ts +++ b/src/util/builders/url/api.director.ts @@ -1,4 +1,5 @@ import urlBuilder from './url.builder.js' +import { TITLE_PLACEHOLDER } from '../../const.js' /** * Interface to build URLs based on MediaWiki API URL @@ -51,7 +52,7 @@ export default class ApiURLDirector { buildVisualEditorURL() { return urlBuilder .setDomain(this.baseDomain) - .setQueryParams({ action: 'visualeditor', mobileformat: 'html', format: 'json', paction: 'parse', formatversion: '2', page: '' }) + .setQueryParams({ action: 'visualeditor', mobileformat: 'html', format: 'json', paction: 'parse', formatversion: '2', page: TITLE_PLACEHOLDER }) .build(true) } diff --git a/src/util/builders/url/base.director.ts b/src/util/builders/url/base.director.ts index dd2d15ac..d14f8631 100644 --- a/src/util/builders/url/base.director.ts +++ b/src/util/builders/url/base.director.ts @@ -1,4 +1,5 @@ import urlBuilder from './url.builder.js' +import { TITLE_PLACEHOLDER } from '../../const.js' /** * Interface to build URLs based on base URL @@ -24,7 +25,7 @@ export default class BaseURLDirector { buildDesktopRestApiURL(path?: string) { return urlBuilder .setDomain(this.baseDomain) - .setPath(path ?? 'api/rest_v1/page/html') + .setPath(path ?? `api/rest_v1/page/html/${TITLE_PLACEHOLDER}`) .build(true, '/') } @@ -38,7 +39,7 @@ export default class BaseURLDirector { buildMediaWikiREST(path?: string) { return urlBuilder .setDomain(this.baseDomain) - .setPath(path ?? 'w/rest.php/v1/page') + .setPath(path ?? `w/rest.php/v1/page/${TITLE_PLACEHOLDER}/html`) .build(true, '/') } } diff --git a/src/util/builders/url/mediawiki-rest-api.director.ts b/src/util/builders/url/mediawiki-rest-api.director.ts index 68449165..d839c9f0 100644 --- a/src/util/builders/url/mediawiki-rest-api.director.ts +++ b/src/util/builders/url/mediawiki-rest-api.director.ts @@ -12,6 +12,6 @@ export default class MediaWikiRESTApiURL { buildArticleURL(articleId: string) { const base = urlBuilder.setDomain(this.baseDomain).build() - return `${base}${articleId}/with_html` + return `${base}${articleId}/html` } } diff --git a/src/util/builders/url/url.builder.ts b/src/util/builders/url/url.builder.ts index 08742e89..e0187431 100644 --- a/src/util/builders/url/url.builder.ts +++ b/src/util/builders/url/url.builder.ts @@ -1,4 +1,5 @@ import { ensureTrailingChar } from '../../misc.js' +import { TITLE_PLACEHOLDER } from '../../const.js' /** * Concat the path to the domain and setting query params @@ -99,6 +100,10 @@ class URLBuilder { return link } + + buildArticleUrl(renderApiUrl: string, articleId: string) { + return `${renderApiUrl.replace(TITLE_PLACEHOLDER, articleId)}`.slice(0, -1) + } } const urlBuilder = new URLBuilder() diff --git a/src/util/const.ts b/src/util/const.ts index 0ea72f3e..af168831 100644 --- a/src/util/const.ts +++ b/src/util/const.ts @@ -21,3 +21,4 @@ export const WEBP_HANDLER_URL = 'https://gist.githubusercontent.com/rgaudin/60bb export const MAX_FILE_DOWNLOAD_RETRIES = 5 export const BLACKLISTED_NS = ['Story'] // 'Story' Wikipedia namespace is content, but not indgestable by Parsoid https://github.com/openzim/mwoffliner/issues/1853 export const RENDERERS_LIST = ['WikimediaDesktop', 'VisualEditor', 'MediawikiRESTApi'] +export const TITLE_PLACEHOLDER = 'TITLE_PLACEHOLDER' diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index c777a32d..ee4b727f 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -12,6 +12,7 @@ import { config } from '../config.js' import { getSizeFromUrl, cleanupAxiosError } from './misc.js' import { CONCURRENCY_LIMIT, DELETED_ARTICLE_ERROR, MAX_FILE_DOWNLOAD_RETRIES } from './const.js' import urlHelper from './url.helper.js' +import urlBuilder from './builders/url/url.builder.js' import { Renderer } from '../renderers/abstract.renderer.js' import { RendererBuilder } from '../renderers/renderer.builder.js' @@ -225,7 +226,7 @@ async function saveArticle( } export function getArticleUrl(downloader: Downloader, dump: Dump, articleId: string): string { - return `${dump.isMainPage(articleId) ? downloader.baseUrlForMainPage : downloader.baseUrl}${encodeURIComponent(articleId)}` + return urlBuilder.buildArticleUrl(dump.isMainPage(articleId) ? downloader.baseUrlForMainPage : downloader.baseUrl, articleId) } /* diff --git a/test/unit/builders/url/api.director.test.ts b/test/unit/builders/url/api.director.test.ts index 993b9dfa..9e468fc9 100644 --- a/test/unit/builders/url/api.director.test.ts +++ b/test/unit/builders/url/api.director.test.ts @@ -57,7 +57,7 @@ describe('ApiURLDirector', () => { it('should return base visual editor URL object with default query params', () => { const url = apiUrlDirector.buildVisualEditorURL() - expect(url.href).toBe('https://en.wikipedia.org/w/api.php?action=visualeditor&mobileformat=html&format=json&paction=parse&formatversion=2&page=') + expect(url.href).toBe('https://en.wikipedia.org/w/api.php?action=visualeditor&mobileformat=html&format=json&paction=parse&formatversion=2&page=TITLE_PLACEHOLDER') }) }) }) diff --git a/test/unit/builders/url/base.director.test.ts b/test/unit/builders/url/base.director.test.ts index 9282ff8c..69b0edc5 100644 --- a/test/unit/builders/url/base.director.test.ts +++ b/test/unit/builders/url/base.director.test.ts @@ -35,7 +35,7 @@ describe('BaseURLDirector', () => { it('should return a desktop URL with default path and trailing char', () => { const url = baseUrlDirector.buildDesktopRestApiURL() - expect(url.href).toBe('https://en.m.wikipedia.com/api/rest_v1/page/html/') + expect(url.href).toBe('https://en.m.wikipedia.com/api/rest_v1/page/html/TITLE_PLACEHOLDER/') }) }) diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 22efef87..ecdc2c3d 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -91,6 +91,9 @@ describe('saveArticles', () => { case 'WikimediaDesktop': rendererInstance = new WikimediaDesktopRenderer() break + case 'MediawikiRESTApi': + rendererInstance = new WikimediaDesktopRenderer() + break default: throw new Error(`Unknown renderer: ${renderer}`) }