From b7f0974fbb3be01355ff61a6b1b8e32228b7d3e8 Mon Sep 17 00:00:00 2001 From: mytlogos Date: Tue, 6 Sep 2022 09:56:35 +0200 Subject: [PATCH] feat(scraper): expand crawler metrics --- packages/core/src/asyncStorage.ts | 9 +-- .../core/src/database/contexts/jobContext.ts | 13 ++--- packages/core/src/tools.ts | 14 +++++ packages/core/src/types.ts | 4 ++ .../scraper/src/externals/custom/download.ts | 3 + packages/scraper/src/externals/custom/news.ts | 2 + .../scraper/src/externals/custom/search.ts | 2 + packages/scraper/src/externals/custom/toc.ts | 2 + .../scraper/src/externals/customv2/index.ts | 5 ++ .../src/externals/direct/boxNovelScraper.ts | 7 ++- .../src/externals/direct/gogoAnimeScraper.ts | 7 ++- .../src/externals/direct/mangaHasuScraper.ts | 7 ++- .../src/externals/direct/mangadexScraper.ts | 5 +- .../src/externals/direct/novelFullScraper.ts | 6 ++ .../externals/direct/openLibraryScraper.ts | 3 + .../externals/direct/undergroundScraper.ts | 3 + .../src/externals/direct/webnovelScraper.ts | 8 ++- .../src/externals/direct/wuxiaworldScraper.ts | 7 ++- packages/scraper/src/externals/listManager.ts | 2 + .../scraper/src/externals/queueRequest.ts | 6 +- .../src/externals/request/cloudflare.ts | 6 ++ .../scraper/src/externals/request/request.ts | 12 ++++ .../scraper/src/externals/scraperTools.ts | 19 +++++- packages/scraper/src/externals/types.ts | 1 + packages/scraper/src/metrics.ts | 58 +++++++++++++++---- packages/scraper/src/scheduler/job.ts | 10 +--- 26 files changed, 174 insertions(+), 47 deletions(-) diff --git a/packages/core/src/asyncStorage.ts b/packages/core/src/asyncStorage.ts index d71ba74b..635733c9 100644 --- a/packages/core/src/asyncStorage.ts +++ b/packages/core/src/asyncStorage.ts @@ -8,7 +8,7 @@ import { } from "async_hooks"; import { writeSync } from "fs"; import { AsyncContextError } from "./error"; -import { Modification, Optional } from "./types"; +import { Modification, NetworkTrack, Optional } from "./types"; const localStorage = new AsyncLocalStorage(); @@ -181,12 +181,7 @@ export interface StoreMapping { [StoreKey.MODIFICATIONS]: Record; [StoreKey.RESULT]: "success" | "warning" | "failed" | "aborted"; [StoreKey.MESSAGE]: string; - [StoreKey.NETWORK]: { - count: number; - sent: number; - received: number; - history: Array<{ url: string; method: string; statusCode: number; send: number; received: number }>; - }; + [StoreKey.NETWORK]: NetworkTrack; [StoreKey.LAST_RUN]: Date; [StoreKey.ERROR]: unknown; [StoreKey.LAST_REQUEST_URL]: string; diff --git a/packages/core/src/database/contexts/jobContext.ts b/packages/core/src/database/contexts/jobContext.ts index 58d7470c..48c28556 100644 --- a/packages/core/src/database/contexts/jobContext.ts +++ b/packages/core/src/database/contexts/jobContext.ts @@ -23,7 +23,7 @@ import { QueryJobHistory, Paginated, } from "../../types"; -import { isString, promiseMultiSingle, multiSingle } from "../../tools"; +import { isString, promiseMultiSingle, multiSingle, defaultNetworkTrack } from "../../tools"; import logger from "../../logger"; import mysql from "promise-mysql"; import { escapeLike } from "../storages/storageTools"; @@ -518,12 +518,7 @@ export class JobContext extends SubContext { const jobTrack: JobTrack = { modifications: store.get(StoreKey.MODIFICATIONS) || {}, - network: store.get(StoreKey.NETWORK) || { - count: 0, - sent: 0, - received: 0, - history: [], - }, + network: store.get(StoreKey.NETWORK) || defaultNetworkTrack(), queryCount: store.get(StoreKey.QUERY_COUNT) || 0, }; @@ -581,8 +576,8 @@ export class JobContext 
extends SubContext { item.max_deleted = Math.max(item.max_deleted, modification.deleted); }); item.sql_queries = jobTrack.queryCount; - item.min_sql_queries = Math.min(jobTrack.queryCount); - item.max_sql_queries = Math.max(jobTrack.queryCount); + item.min_sql_queries = Math.min(jobTrack.queryCount, item.min_sql_queries); + item.max_sql_queries = Math.max(jobTrack.queryCount, item.max_sql_queries); item.failed += result === "failed" ? 1 : 0; item.succeeded += result === "success" ? 1 : 0; diff --git a/packages/core/src/tools.ts b/packages/core/src/tools.ts index cc2489d5..0714f626 100644 --- a/packages/core/src/tools.ts +++ b/packages/core/src/tools.ts @@ -9,6 +9,7 @@ import { Nullable, Indexable, ExtractedIndex, + NetworkTrack, } from "./types"; import crypto from "crypto"; import bcrypt from "bcryptjs"; @@ -1041,3 +1042,16 @@ export function deferableTimeout(timeoutMillis: number, maxRetries = 0): Deferab result.defer(); return result; } + +export function defaultNetworkTrack(): NetworkTrack { + return { + count: 0, + sent: 0, + received: 0, + cloudflareCount: 0, + puppeteerCount: 0, + retryCount: 0, + hooksUsed: [], + history: [], + }; +} diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index cb24c029..11ad41db 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -1516,6 +1516,10 @@ export interface NetworkTrack { count: number; sent: number; received: number; + cloudflareCount: number; + puppeteerCount: number; + retryCount: number; + hooksUsed: string[]; history: Array<{ url: string; method: string; diff --git a/packages/scraper/src/externals/custom/download.ts b/packages/scraper/src/externals/custom/download.ts index b5b295e0..064c88f5 100644 --- a/packages/scraper/src/externals/custom/download.ts +++ b/packages/scraper/src/externals/custom/download.ts @@ -1,6 +1,7 @@ import { Cheerio, Element } from "cheerio"; import { JSONSchema7 } from "json-schema"; import { validate } from "jsonschema"; +import { storeHookName } from "../scraperTools"; import { ContentDownloader, EpisodeContent } from "../types"; import { defaultContext, extract, makeRequest } from "./common"; import { CustomHookError } from "./errors"; @@ -50,6 +51,8 @@ export function createDownloadScraper(config: HookConfig): ContentDownloader | u } const scraper: ContentDownloader = async (url) => { + storeHookName(config.name); + const context = defaultContext(); async function scrape(downloadConfig: DownloadConfig) { diff --git a/packages/scraper/src/externals/custom/news.ts b/packages/scraper/src/externals/custom/news.ts index 65cbe135..332a4848 100644 --- a/packages/scraper/src/externals/custom/news.ts +++ b/packages/scraper/src/externals/custom/news.ts @@ -5,6 +5,7 @@ import { HookConfig } from "./types"; import { CustomHookError } from "./errors"; import { ValidationError } from "enterprise-core/dist/error"; import request from "../request"; +import { storeHookName } from "../scraperTools"; function validateEpisodeNews(episodes: Array>): EpisodeNews[] { for (const episode of episodes) { @@ -51,6 +52,7 @@ export function createNewsScraper(config: HookConfig): NewsScraper | undefined { const newsConfig = config.news; const scraper: NewsScraper = async () => { + storeHookName(config.name); const $ = await request.getCheerio({ url: newsConfig.newsUrl }); const baseUri = newsConfig.base || config.base; const context = defaultContext(); diff --git a/packages/scraper/src/externals/custom/search.ts b/packages/scraper/src/externals/custom/search.ts index a175fd21..b55fbc94 100644 --- 
a/packages/scraper/src/externals/custom/search.ts +++ b/packages/scraper/src/externals/custom/search.ts @@ -6,6 +6,7 @@ import { SearchScraper } from "../types"; import { defaultContext, extractJSON, makeRequest } from "./common"; import { SearchResult } from "enterprise-core/dist/types"; import { CustomHookError } from "./errors"; +import { storeHookName } from "../scraperTools"; const tocSchema: JSONSchema7 = { $schema: "https://json-schema.org/draft/2020-12/schema", @@ -47,6 +48,7 @@ export function createSearchScraper(config: HookConfig): SearchScraper | undefin } const scraper: SearchScraper = async (text) => { + storeHookName(config.name); const context = defaultContext(); // @ts-expect-error context.variables.PARAM = [text]; diff --git a/packages/scraper/src/externals/custom/toc.ts b/packages/scraper/src/externals/custom/toc.ts index 1e0bd126..6a3e7ee4 100644 --- a/packages/scraper/src/externals/custom/toc.ts +++ b/packages/scraper/src/externals/custom/toc.ts @@ -6,6 +6,7 @@ import { validate } from "jsonschema"; import { JSONSchema7 } from "json-schema"; import { CustomHookError } from "./errors"; import { Response } from "../request"; +import { storeHookName } from "../scraperTools"; const tocSchema: JSONSchema7 = { $schema: "https://json-schema.org/draft/2020-12/schema", @@ -95,6 +96,7 @@ export function createTocScraper(config: HookConfig): TocScraper | undefined { } const scraper: TocScraper = async (url) => { + storeHookName(config.name); const context = defaultContext(); let lastUrl = url; diff --git a/packages/scraper/src/externals/customv2/index.ts b/packages/scraper/src/externals/customv2/index.ts index 2e17676e..819c7eb8 100644 --- a/packages/scraper/src/externals/customv2/index.ts +++ b/packages/scraper/src/externals/customv2/index.ts @@ -19,6 +19,7 @@ import { EpisodeNews, ReleaseState, SearchResult } from "enterprise-core/dist/ty import { datePattern } from "./analyzer"; import { validateEpisodeNews, validateToc } from "./validation"; import request, { Response } from "../request"; +import { storeHookName } from "../scraperTools"; type Conditional = T extends undefined ? 
undefined : R; type Context = Record; @@ -183,6 +184,7 @@ function createNewsScraper(config: HookConfig): NewsScraper | undefined { const context: Context = {}; const scraper: NewsScraper = async (): Promise => { + storeHookName(config.name); const results: Array> = []; for (const datum of newsConfig.data) { const selector: Selector = { @@ -265,6 +267,7 @@ function createTocScraper(config: HookConfig): TocScraper | undefined { const x = createScraper(tocConfig.regexes); const scraper: TocScraper = async (link: string): Promise => { + storeHookName(config.name); const results = []; let firstResponseUrl: string | undefined; @@ -416,6 +419,7 @@ function createDownloadScraper(config: HookConfig): ContentDownloader | undefine const context: Context = {}; const scraper: ContentDownloader = async (link) => { + storeHookName(config.name); const results = []; for (const datum of downloadConfig.data) { const selector = { @@ -451,6 +455,7 @@ function createSearchScraper(config: HookConfig): SearchScraper | undefined { return; } const scraper: SearchScraper = async (text) => { + storeHookName(config.name); const results = []; for (const datum of searchConfig.data) { if (datum._request) { diff --git a/packages/scraper/src/externals/direct/boxNovelScraper.ts b/packages/scraper/src/externals/direct/boxNovelScraper.ts index 076df6d5..0c06b323 100644 --- a/packages/scraper/src/externals/direct/boxNovelScraper.ts +++ b/packages/scraper/src/externals/direct/boxNovelScraper.ts @@ -26,7 +26,7 @@ import { LogType, getText, } from "./directTools"; -import { checkTocContent } from "../scraperTools"; +import { checkTocContent, storeHookName } from "../scraperTools"; import { MissingResourceError, UrlError } from "../errors"; import * as cheerio from "cheerio"; import request, { ResponseError } from "../request"; @@ -45,10 +45,12 @@ interface NovelSearchData { const BASE_URI = "https://boxnovel.com/"; async function tocSearch(medium: TocSearchMedium): VoidablePromise { + storeHookName("boxnovel"); return searchToc(medium, tocAdapter, BASE_URI, (searchString) => searchAjax(searchString, medium)); } async function search(text: string): Promise { + storeHookName("boxnovel"); const urlString = BASE_URI + "wp-admin/admin-ajax.php"; let response: NovelSearchResponse; const searchResults: SearchResult[] = []; @@ -111,6 +113,7 @@ export async function searchAjax(searchWords: string, medium: TocSearchMedium): } async function contentDownloadAdapter(urlString: string): Promise { + storeHookName("boxnovel"); if (!urlString.match(/https:\/\/boxnovel\.com\/novel\/.+\/chapter-.+/)) { return []; } @@ -159,6 +162,7 @@ async function contentDownloadAdapter(urlString: string): Promise { + storeHookName("boxnovel"); const uri = BASE_URI; if (!tocLink.startsWith(BASE_URI + "novel/")) { @@ -305,6 +309,7 @@ async function tocAdapter(tocLink: string): Promise { } async function newsAdapter(): VoidablePromise<{ news?: News[]; episodes?: EpisodeNews[] }> { + storeHookName("boxnovel"); const uri = BASE_URI; const $ = await request.getCheerio({ url: uri }); const items = $(".page-item-detail"); diff --git a/packages/scraper/src/externals/direct/gogoAnimeScraper.ts b/packages/scraper/src/externals/direct/gogoAnimeScraper.ts index 2f2b4b0a..6c49b84a 100644 --- a/packages/scraper/src/externals/direct/gogoAnimeScraper.ts +++ b/packages/scraper/src/externals/direct/gogoAnimeScraper.ts @@ -4,7 +4,7 @@ import { EpisodeNews, ReleaseState, SearchResult, TocSearchMedium, VoidablePromi import * as cheerio from "cheerio"; import logger from 
"enterprise-core/dist/logger"; import * as url from "url"; -import { checkTocContent } from "../scraperTools"; +import { checkTocContent, storeHookName } from "../scraperTools"; import { getText, LogType, scraperLog, SearchResult as TocSearchResult, searchToc } from "./directTools"; import { UrlError } from "../errors"; import request from "../request"; @@ -12,6 +12,7 @@ import request from "../request"; const BASE_URI = "https://www.gogoanime.vc/"; async function scrapeNews(): Promise { + storeHookName("gogoanime"); const uri = BASE_URI; const $ = await request.getCheerio({ url: uri }); @@ -80,6 +81,7 @@ async function scrapeNews(): Promise { } async function scrapeToc(urlString: string): Promise { + storeHookName("gogoanime"); const animeAliasReg = /^https?:\/\/(www\d*\.)?gogoanime\.(vc|wiki)\/category\/(.+)/; const aliasExec = animeAliasReg.exec(urlString); @@ -185,10 +187,12 @@ async function scrapeSearch(searchString: string, searchMedium: TocSearchMedium) } async function searchForToc(searchMedium: TocSearchMedium): VoidablePromise { + storeHookName("gogoanime"); return searchToc(searchMedium, scrapeToc, BASE_URI, (searchString) => scrapeSearch(searchString, searchMedium)); } async function search(searchWords: string): Promise { + storeHookName("gogoanime"); const urlString = `https://ajax.apimovie.xyz/site/loadAjaxSearch?keyword=${encodeURIComponent( searchWords, )}&id=-1&link_web=https%3A%2F%2Fwww.gogoanime.vc%2F`; @@ -240,6 +244,7 @@ search.medium = MediaType.VIDEO; * @deprecated behind recaptcha */ async function contentDownloader(link: string): Promise { + storeHookName("gogoanime"); const episodeRegex = /https:\/\/www\d*\.gogoanime\.(vc|wiki)\/.+-episode-(\d+)/; const exec = episodeRegex.exec(link); if (!exec) { diff --git a/packages/scraper/src/externals/direct/mangaHasuScraper.ts b/packages/scraper/src/externals/direct/mangaHasuScraper.ts index 7d0c12c1..3fe7edc5 100644 --- a/packages/scraper/src/externals/direct/mangaHasuScraper.ts +++ b/packages/scraper/src/externals/direct/mangaHasuScraper.ts @@ -10,7 +10,7 @@ import { import * as url from "url"; import logger from "enterprise-core/dist/logger"; import { equalsIgnore, extractIndices, MediaType, sanitizeString, delay, hasProp } from "enterprise-core/dist/tools"; -import { checkTocContent } from "../scraperTools"; +import { checkTocContent, storeHookName } from "../scraperTools"; import { SearchResult as TocSearchResult, searchToc, @@ -68,6 +68,7 @@ function enforceHttps(link: string): string { } async function scrapeNews(): Promise { + storeHookName("mangahasu"); // TODO scrape more than just the first page if there is an open end const baseUri = BASE_URI; const requestUrl = baseUri + "latest-releases.html"; @@ -179,6 +180,7 @@ async function scrapeNews(): Promise { } async function contentDownloadAdapter(chapterLink: string): Promise { + storeHookName("mangahasu"); const $ = await tryRequest(chapterLink); if (getText($("head > title")) === "Page not found!") { throw new MissingResourceError("Missing Toc on NovelFull", chapterLink); @@ -229,6 +231,7 @@ async function contentDownloadAdapter(chapterLink: string): Promise { + storeHookName("mangahasu"); if (!/https?:\/\/mangahasu\.se\/[^/]+\.html/.test(urlString)) { throw new UrlError("not a toc link for MangaHasu: " + urlString, urlString); } @@ -398,6 +401,7 @@ async function scrapeToc(urlString: string): Promise { } async function tocSearchAdapter(searchMedium: TocSearchMedium): VoidablePromise { + storeHookName("mangahasu"); return searchToc(searchMedium, scrapeToc, 
BASE_URI, (searchString) => scrapeSearch(searchString, searchMedium)); } @@ -437,6 +441,7 @@ async function scrapeSearch(searchWords: string, medium: TocSearchMedium): Promi } async function search(searchWords: string): Promise { + storeHookName("mangahasu"); const urlString = BASE_URI + "search/autosearch"; const body = "key=" + searchWords; diff --git a/packages/scraper/src/externals/direct/mangadexScraper.ts b/packages/scraper/src/externals/direct/mangadexScraper.ts index 2f075f43..355ed7de 100644 --- a/packages/scraper/src/externals/direct/mangadexScraper.ts +++ b/packages/scraper/src/externals/direct/mangadexScraper.ts @@ -3,7 +3,7 @@ import { EpisodeContentData, EpisodeNews, ReleaseState, Optional } from "enterpr import * as url from "url"; import logger from "enterprise-core/dist/logger"; import { extractIndices, ignore, hasProp, MediaType, sanitizeString } from "enterprise-core/dist/tools"; -import { checkTocContent } from "../scraperTools"; +import { checkTocContent, storeHookName } from "../scraperTools"; import { episodeStorage } from "enterprise-core/dist/database/storages/storage"; import { MissingResourceError, ScraperError, UrlError } from "../errors"; import { extractLinkable, getText, LogType, scraperLog } from "./directTools"; @@ -83,6 +83,7 @@ interface ChapterChapterItem { } async function contentDownloadAdapter(chapterLink: string): Promise { + storeHookName("mangadex"); const linkReg = /^https:\/\/mangadex\.org\/chapter\/(\d+)/; const exec = linkReg.exec(chapterLink); if (!exec) { @@ -145,6 +146,7 @@ async function contentDownloadAdapter(chapterLink: string): Promise { + storeHookName("mangadex"); // TODO: 19.07.2019 set the cookie 'mangadex_filter_langs:"1"' // with expiration date somewhere in 100 years to lessen load @@ -257,6 +259,7 @@ async function scrapeNews(): Promise { } async function scrapeToc(urlString: string): Promise { + storeHookName("mangadex"); const urlRegex = /^https?:\/\/mangadex\.org\/title\/\d+\/[^/]+\/?$/; if (!urlRegex.test(urlString)) { diff --git a/packages/scraper/src/externals/direct/novelFullScraper.ts b/packages/scraper/src/externals/direct/novelFullScraper.ts index 4c3699b7..dc5db6ca 100644 --- a/packages/scraper/src/externals/direct/novelFullScraper.ts +++ b/packages/scraper/src/externals/direct/novelFullScraper.ts @@ -27,10 +27,12 @@ import { import { ScraperError, UrlError } from "../errors"; import * as cheerio from "cheerio"; import request from "../request"; +import { storeHookName } from "../scraperTools"; const BASE_URI = "https://novelfull.com/"; async function tocSearch(medium: TocSearchMedium): VoidablePromise { + storeHookName("novelfull"); return searchTocCheerio( medium, tocAdapterTooled, @@ -41,6 +43,7 @@ async function tocSearch(medium: TocSearchMedium): VoidablePromise { } async function search(text: string): Promise { + storeHookName("novelfull"); const encodedText = encodeURIComponent(text); const $ = await request.getCheerio({ url: BASE_URI + "search?keyword=" + encodedText }); @@ -66,6 +69,7 @@ async function search(text: string): Promise { } async function contentDownloadAdapter(urlString: string): Promise { + storeHookName("novelfull"); const pattern = /^https?:\/\/novelfull\.com\/.+\/.+\d+.+/; if (!urlString.match(pattern)) { scraperLog("warn", LogType.INVALID_LINK, "novelfull", { link: urlString, expected: pattern.source }); @@ -120,6 +124,7 @@ function extractTocSnippet($: cheerio.CheerioAPI, link: string): Toc { } async function tocAdapterTooled(tocLink: string): Promise { + storeHookName("novelfull"); const 
uri = BASE_URI; const linkMatch = tocLink.match("^https?://novelfull\\.com/([\\w-]+.html)$"); @@ -184,6 +189,7 @@ async function tocAdapterTooled(tocLink: string): Promise { } async function newsAdapter(): Promise { + storeHookName("novelfull"); const uri = BASE_URI; const $ = await request.getCheerio({ url: uri }); const items = $("#list-index .list-new .row"); diff --git a/packages/scraper/src/externals/direct/openLibraryScraper.ts b/packages/scraper/src/externals/direct/openLibraryScraper.ts index 4f0b6b8e..7efd70bf 100644 --- a/packages/scraper/src/externals/direct/openLibraryScraper.ts +++ b/packages/scraper/src/externals/direct/openLibraryScraper.ts @@ -6,6 +6,7 @@ import { UrlError } from "../errors"; import { SearchResult } from "enterprise-core/dist/types"; import request from "../request"; import { ValidationError } from "enterprise-core/dist/error"; +import { storeHookName } from "../scraperTools"; const BASE_URI = "https://openlibrary.org/"; @@ -64,6 +65,7 @@ interface OpenLibraryBookData { * @param tocLink toc link */ async function toc(tocLink: string): Promise { + storeHookName("openlibrary"); const linkRegex = /https?:\/\/openlibrary\.org\/api\/books\?bibkeys=(ISBN|OLID):\d+&jscmd=data&format=json/; if (!linkRegex.test(tocLink)) { @@ -139,6 +141,7 @@ interface SearchItem { * @param medium the medium type to filter after */ async function search(text: string, medium: number): Promise { + storeHookName("openlibrary"); if (medium !== MediaType.TEXT) { return []; } diff --git a/packages/scraper/src/externals/direct/undergroundScraper.ts b/packages/scraper/src/externals/direct/undergroundScraper.ts index e7627498..52c45ecb 100644 --- a/packages/scraper/src/externals/direct/undergroundScraper.ts +++ b/packages/scraper/src/externals/direct/undergroundScraper.ts @@ -6,12 +6,14 @@ import { episodeStorage, mediumStorage, partStorage } from "enterprise-core/dist import request from "../request"; import { ScraperError } from "../errors"; import { scraperLog, LogType, getText } from "./directTools"; +import { storeHookName } from "../scraperTools"; export const sourceType = "qidian_underground"; const BASE_URI = "https://toc.qidianunderground.org/"; async function scrapeNews(): VoidablePromise { + storeHookName("underground"); const uri = BASE_URI; const $ = await request.getCheerio({ url: uri }); @@ -194,6 +196,7 @@ async function processMediumNews(mediumTitle: string, potentialNews: News[]): Em } async function scrapeContent(urlString: string): Promise { + storeHookName("underground"); const $ = await request.getCheerio({ url: urlString }); const contents = $(".center-block .well"); diff --git a/packages/scraper/src/externals/direct/webnovelScraper.ts b/packages/scraper/src/externals/direct/webnovelScraper.ts index f48d207f..8c73a072 100644 --- a/packages/scraper/src/externals/direct/webnovelScraper.ts +++ b/packages/scraper/src/externals/direct/webnovelScraper.ts @@ -11,7 +11,7 @@ import { import { equalsIgnore, ignore, MediaType, relativeToAbsoluteTime, sanitizeString } from "enterprise-core/dist/tools"; import logger from "enterprise-core/dist/logger"; import * as url from "url"; -import { checkTocContent } from "../scraperTools"; +import { checkTocContent, storeHookName } from "../scraperTools"; import { ScraperError, UrlError } from "../errors"; import { Cookie } from "tough-cookie"; import * as cheerio from "cheerio"; @@ -32,6 +32,7 @@ function toTocLink(bookId: string): Link { } async function scrapeNews(): Promise<{ news?: News[]; episodes?: EpisodeNews[] } | undefined> { + 
storeHookName("webnovel"); const uri = BASE_URI; const $ = await request.getCheerio({ url: uri }); const newsRows = $("#LatUpdate tbody > tr"); @@ -96,6 +97,7 @@ async function scrapeNews(): Promise<{ news?: News[]; episodes?: EpisodeNews[] } } async function scrapeToc(urlString: string): Promise { + storeHookName("webnovel"); // wait for a normal request, to get the right cookies await initPromise; @@ -123,6 +125,7 @@ function getCookies(): Promise { } async function scrapeTocPage(bookId: string, mediumId?: number): Promise { + storeHookName("webnovel"); const csrfCookie = (await getCookies()).find((value) => value.key === "_csrfToken"); if (!csrfCookie) { @@ -224,6 +227,7 @@ async function loadJson(urlString: string, retry = 0): Promise { } async function scrapeContent(urlString: string): Promise { + storeHookName("webnovel"); let $: cheerio.CheerioAPI; try { $ = await loadBody(urlString); @@ -334,6 +338,7 @@ interface TocResponse { } async function searchToc(searchMedium: TocSearchMedium): VoidablePromise { + storeHookName("webnovel"); logger.info("start searching webnovel " + searchMedium.mediumId); const urlString = BASE_URI + "/search?keywords=" + encodeURIComponent(searchMedium.title); const body = await loadBody(urlString); @@ -365,6 +370,7 @@ async function searchToc(searchMedium: TocSearchMedium): VoidablePromise { } async function search(text: string): Promise { + storeHookName("webnovel"); const uri = BASE_URI; const urlString = BASE_URI + "/search?keywords=" + encodeURIComponent(text); const body = await loadBody(urlString); diff --git a/packages/scraper/src/externals/direct/wuxiaworldScraper.ts b/packages/scraper/src/externals/direct/wuxiaworldScraper.ts index a05378c0..eebbe6b7 100644 --- a/packages/scraper/src/externals/direct/wuxiaworldScraper.ts +++ b/packages/scraper/src/externals/direct/wuxiaworldScraper.ts @@ -3,7 +3,7 @@ import { EpisodeNews, SearchResult, TocSearchMedium, VoidablePromise, Nullable } import logger from "enterprise-core/dist/logger"; import * as url from "url"; import { countOccurrence, equalsIgnore, extractIndices, MediaType, sanitizeString } from "enterprise-core/dist/tools"; -import { checkTocContent } from "../scraperTools"; +import { checkTocContent, storeHookName } from "../scraperTools"; import { UrlError } from "../errors"; import request from "../request"; import { getText } from "./directTools"; @@ -11,6 +11,7 @@ import { getText } from "./directTools"; const BASE_URI = "https://www.wuxiaworld.com/"; async function scrapeNews(): VoidablePromise { + storeHookName("wuxiaworld"); const uri = BASE_URI; const $ = await request.getCheerio({ url: uri }); @@ -107,6 +108,7 @@ async function scrapeNews(): VoidablePromise { } async function scrapeToc(urlString: string): Promise { + storeHookName("wuxiaworld"); if (urlString.endsWith("-preview")) { return []; } @@ -237,6 +239,7 @@ async function scrapeToc(urlString: string): Promise { } async function scrapeContent(urlString: string): Promise { + storeHookName("wuxiaworld"); const $ = await request.getCheerio({ url: urlString }); const mainElement = $(".content"); const novelTitle = sanitizeString(getText(mainElement.find(".top-bar-area .caption a").first())); @@ -278,6 +281,7 @@ async function scrapeContent(urlString: string): Promise { } async function tocSearcher(medium: TocSearchMedium): VoidablePromise { + storeHookName("wuxiaworld"); const words = medium.title.split(/\s+/).filter((value) => value); let tocLink = ""; let searchWord = ""; @@ -315,6 +319,7 @@ async function tocSearcher(medium: 
TocSearchMedium): VoidablePromise { } async function search(text: string): Promise { + storeHookName("wuxiaworld"); const word = encodeURIComponent(text); const parsed: NovelSearchResponse = await request.getJson({ url: BASE_URI + "api/novels/search?query=" + word }); diff --git a/packages/scraper/src/externals/listManager.ts b/packages/scraper/src/externals/listManager.ts index a78500b0..1b4b4b1d 100644 --- a/packages/scraper/src/externals/listManager.ts +++ b/packages/scraper/src/externals/listManager.ts @@ -9,6 +9,7 @@ import request, { Requestor } from "./request"; import { ValidationError } from "enterprise-core/dist/error"; import { ScraperError } from "./errors"; import { getText } from "./direct/directTools"; +import { storeHookName } from "./scraperTools"; interface SimpleReadingList { menu: string; @@ -543,6 +544,7 @@ export interface ScrapeMedium { } async function novelUpdatesTocAdapter(uri: string) { + storeHookName("novelupdates"); /* const pageInfo = await storage.getPageInfo(uri, "scraped"); if (pageInfo.values) { diff --git a/packages/scraper/src/externals/queueRequest.ts b/packages/scraper/src/externals/queueRequest.ts index 83023a47..b20596f5 100644 --- a/packages/scraper/src/externals/queueRequest.ts +++ b/packages/scraper/src/externals/queueRequest.ts @@ -2,7 +2,7 @@ import { setContext, removeContext, getStore, bindContext, StoreKey } from "ente import http from "http"; import https from "https"; import { Socket } from "net"; -import { isString, getElseSet, stringify } from "enterprise-core/dist/tools"; +import { isString, getElseSet, stringify, defaultNetworkTrack } from "enterprise-core/dist/tools"; import logger from "enterprise-core/dist/logger"; import { AsyncResource } from "async_hooks"; import { channel } from "diagnostics_channel"; @@ -44,9 +44,7 @@ function patchRequest(module: HttpModule, protocol: string) { return; } - const stats = getElseSet(store, StoreKey.NETWORK, () => { - return { count: 0, sent: 0, received: 0, history: [] }; - }); + const stats = getElseSet(store, StoreKey.NETWORK, defaultNetworkTrack); stats.count += 1; stats.sent += bytesSend; stats.received += bytesReceived; diff --git a/packages/scraper/src/externals/request/cloudflare.ts b/packages/scraper/src/externals/request/cloudflare.ts index 1b6d6055..c00b03e3 100644 --- a/packages/scraper/src/externals/request/cloudflare.ts +++ b/packages/scraper/src/externals/request/cloudflare.ts @@ -7,6 +7,7 @@ import { URL, URLSearchParams } from "url"; import { delay } from "enterprise-core/dist/tools"; import { RequestError, CloudflareError, CaptchaError, ParserError } from "./error"; import { HeaderGenerator } from "header-generator"; +import { getStoreValue, StoreKey } from "enterprise-core/dist/asyncStorage"; const headerGenerator = new HeaderGenerator({ browsers: ["chrome", "firefox", "safari"], @@ -298,6 +299,11 @@ function onCloudflareResponse( requester: Requester, isHtml: boolean, ): Promise { + const networkTrack = getStoreValue(StoreKey.NETWORK); + + if (networkTrack) { + networkTrack.cloudflareCount++; + } if (body.length < 1) { // This is a 4xx-5xx Cloudflare response with an empty body. throw new CloudflareError(response.statusCode, options, response); diff --git a/packages/scraper/src/externals/request/request.ts b/packages/scraper/src/externals/request/request.ts index 258af268..94815127 100644 --- a/packages/scraper/src/externals/request/request.ts +++ b/packages/scraper/src/externals/request/request.ts @@ -214,6 +214,11 @@ export class Requestor { private async usePuppeteer
<P, R extends RequestConfig<P> = RequestConfig<P>>(
     config: R,
   ): Promise<Response<P>> {
+    const networkTrack = getStoreValue(StoreKey.NETWORK);
+
+    if (networkTrack) {
+      networkTrack.puppeteerCount++;
+    }
     const signal = getStoreValue(StoreKey.ABORT);
     signal?.throwIfAborted();
 
@@ -370,6 +375,8 @@ export class Requestor {
     try {
       const response = await this.performRequest(config);
 
+      // this is caught immediately below; normally such a status code
+      // already results in an error without having to throw one manually
       if (response.status === 429) {
         throw Error("Too many requests");
       }
@@ -378,6 +385,11 @@ export class Requestor {
     } catch (error) {
       // retry at most 3 times for 429 - Too many Requests error
       if (error instanceof RequestError && error.response?.status === 429 && tryAgain < 3) {
+        const networkTrack = getStoreValue(StoreKey.NETWORK);
+
+        if (networkTrack) {
+          networkTrack.retryCount++;
+        }
         const retryAfterValue = error.response?.headers?.["retry-after"];
         const retryAfterSeconds = Number.parseInt(retryAfterValue);
diff --git a/packages/scraper/src/externals/scraperTools.ts b/packages/scraper/src/externals/scraperTools.ts
index 351fd9b1..d9ef10b2 100644
--- a/packages/scraper/src/externals/scraperTools.ts
+++ b/packages/scraper/src/externals/scraperTools.ts
@@ -1,6 +1,6 @@
 import { ListScrapeResult } from "./listManager";
-import { combiIndex, getElseSet, hasProp } from "enterprise-core/dist/tools";
-import { Episode, Uuid, SearchResult, EmptyPromise, Optional } from "enterprise-core/dist/types";
+import { combiIndex, defaultNetworkTrack, getElseSet, hasProp } from "enterprise-core/dist/tools";
+import { Episode, Uuid, SearchResult, EmptyPromise, Optional, NetworkTrack } from "enterprise-core/dist/types";
 import logger from "enterprise-core/dist/logger";
 import { ContentDownloader, DownloadContent, EpisodeContent, Hook, Toc, TocContent } from "./types";
 import { Cache } from "enterprise-core/dist/cache";
@@ -19,6 +19,7 @@ import {
 import request, { Response } from "./request";
 import { ValidationError } from "enterprise-core/dist/error";
 import { registerOnExitHandler } from "enterprise-core/dist/exit";
+import { getStore, StoreKey } from "enterprise-core/dist/asyncStorage";
 
 interface ScrapeableFilterResult {
   available: string[];
@@ -359,3 +360,17 @@ export function checkLink(link: string, linkKey?: string): Promise<string> {
     });
   });
 }
+
+/**
+ * Stores the hook name in the async storage as a used hook.
+ *
+ * @param hookName name to store
+ */
+export function storeHookName(hookName: string) {
+  const store = getStore();
+
+  if (store) {
+    const track: NetworkTrack = getElseSet(store, StoreKey.NETWORK, defaultNetworkTrack);
+    track.hooksUsed.push(hookName);
+  }
+}
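The `storeHookName` helper above is the heart of the hook attribution: every scraper calls it on entry, and the name lands in the job's `NetworkTrack`. A minimal, self-contained sketch of the assumed mechanics, with a plain `Map`-backed `AsyncLocalStorage` standing in for enterprise-core's typed store:

```ts
import { AsyncLocalStorage } from "async_hooks";

interface NetworkTrack {
  count: number;
  sent: number;
  received: number;
  cloudflareCount: number;
  puppeteerCount: number;
  retryCount: number;
  hooksUsed: string[];
  history: Array<{ url: string; method: string; statusCode: number; send: number; received: number }>;
}

function defaultNetworkTrack(): NetworkTrack {
  return {
    count: 0,
    sent: 0,
    received: 0,
    cloudflareCount: 0,
    puppeteerCount: 0,
    retryCount: 0,
    hooksUsed: [],
    history: [],
  };
}

// stand-in for enterprise-core's typed async storage (assumption for this sketch)
const storage = new AsyncLocalStorage<Map<string, unknown>>();

function storeHookName(hookName: string): void {
  const store = storage.getStore();
  if (!store) {
    return; // called outside a job context: nothing to record
  }
  // getElseSet equivalent: create the track lazily on first use
  let track = store.get("network") as NetworkTrack | undefined;
  if (!track) {
    track = defaultNetworkTrack();
    store.set("network", track);
  }
  track.hooksUsed.push(hookName);
}

// each job runs in its own context, so hook usage never leaks between jobs
storage.run(new Map(), () => {
  storeHookName("boxnovel"); // what every scraper now does on entry
  const track = storage.getStore()?.get("network") as NetworkTrack;
  console.log(track.hooksUsed); // => ["boxnovel"]
});
```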
diff --git a/packages/scraper/src/externals/types.ts b/packages/scraper/src/externals/types.ts
index 6d1ad4b5..caa7c005 100644
--- a/packages/scraper/src/externals/types.ts
+++ b/packages/scraper/src/externals/types.ts
@@ -112,6 +112,7 @@ export interface EndJobChannelMessage extends BasicJobChannelMessage {
   result: string;
   reason?: string;
   jobTrack: JobTrack;
+  duration: number;
 }
 
 /**
diff --git a/packages/scraper/src/metrics.ts b/packages/scraper/src/metrics.ts
index 7930376f..4550c00d 100644
--- a/packages/scraper/src/metrics.ts
+++ b/packages/scraper/src/metrics.ts
@@ -1,4 +1,4 @@
-import { Counter, Gauge } from "prom-client";
+import { Counter, Gauge, Histogram, exponentialBuckets } from "prom-client";
 import { subscribe } from "diagnostics_channel";
 
 const jobMaxCount = new Gauge({
@@ -19,37 +19,62 @@ const jobActiveCount = new Gauge({
 const jobResultCount = new Counter({
   name: "scraper_job_result_count",
   help: "Count of finished jobs",
-  labelNames: ["result", "jobType"],
+  labelNames: ["result", "jobType", "hook"],
 });
 
 const jobModificationsCount = new Counter({
   name: "scraper_job_modification_count",
   help: "Count of modified database entities",
-  labelNames: ["type", "entity", "jobType"],
+  labelNames: ["type", "entity", "jobType", "hook"],
 });
 
 const jobDBQueryCount = new Counter({
   name: "scraper_job_db_queries_count",
   help: "Number of database queries",
-  labelNames: ["jobType"],
+  labelNames: ["jobType", "hook"],
 });
 
 const jobNetworkQueryCount = new Counter({
   name: "scraper_job_network_query_count",
   help: "Number of network queries",
-  labelNames: ["jobType"],
+  labelNames: ["jobType", "hook"],
 });
 
 const jobNetworkSendCount = new Counter({
   name: "scraper_job_bytes_send_count",
   help: "Network bytes send",
-  labelNames: ["jobType"],
+  labelNames: ["jobType", "hook"],
 });
 
 const jobNetworkReceivedCount = new Counter({
   name: "scraper_job_bytes_received_count",
   help: "Network bytes received",
-  labelNames: ["jobType"],
+  labelNames: ["jobType", "hook"],
+});
+
+const jobNetworkRetryCount = new Counter({
+  name: "scraper_job_request_retry_count",
+  help: "Number of request retries",
+  labelNames: ["jobType", "hook"],
+});
+
+const jobNetworkCloudflareCount = new Counter({
+  name: "scraper_job_cloudflare_count",
+  help: "Count of Cloudflare responses encountered",
+  labelNames: ["jobType", "hook"],
+});
+
+const jobNetworkPuppeteerCount = new Counter({
+  name: "scraper_job_puppeteer_used_count",
+  help: "Count of Puppeteer uses",
+  labelNames: ["jobType", "hook"],
+});
+
+const jobDuration = new Histogram({
+  name: "scraper_job_duration_seconds",
+  help: "Duration of the job in seconds from start to end",
+  buckets: exponentialBuckets(1, 2, 10),
+  labelNames: ["jobType", "hook"],
 });
 
 subscribe("enterprise-jobqueue", (message) => {
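A note on the bucket choice above: `exponentialBuckets(1, 2, 10)` produces ten upper bounds from 1s to 512s, so sub-second jobs all land in the first bucket and anything beyond roughly 8.5 minutes counts toward +Inf. A quick sanity check against prom-client's public API (registry setup assumed, label values hypothetical):

```ts
import { Histogram, Registry, exponentialBuckets } from "prom-client";

const registry = new Registry();

// [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
console.log(exponentialBuckets(1, 2, 10));

const duration = new Histogram({
  name: "scraper_job_duration_seconds",
  help: "Duration of the job in seconds from start to end",
  buckets: exponentialBuckets(1, 2, 10),
  labelNames: ["jobType", "hook"],
  registers: [registry],
});

// a hypothetical 90-second toc job of the "boxnovel" hook
duration.observe({ jobType: "toc", hook: "boxnovel" }, 90);

// renders the histogram in Prometheus exposition format
registry.metrics().then(console.log);
```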
@@ -60,16 +85,25 @@ subscribe("enterprise-jobqueue", (message) => {
 
 subscribe("enterprise-jobs", (message) => {
   if (message.type === "finished") {
-    jobResultCount.inc({ result: message.result, jobType: message.jobType }, 1);
+    // TODO: multiple used hooks are currently ignored silently; only a single one should be used anyway
+    const hook = message.jobTrack.network.hooksUsed[0];
+
+    jobResultCount.inc({ result: message.result, jobType: message.jobType, hook }, 1);
 
     for (const [key, value] of Object.entries(message.jobTrack.modifications)) {
-      jobModificationsCount.inc({ type: "delete", entity: key, jobType: message.jobType }, value.deleted);
-      jobModificationsCount.inc({ type: "update", entity: key, jobType: message.jobType }, value.updated);
-      jobModificationsCount.inc({ type: "insert", entity: key, jobType: message.jobType }, value.created);
+      jobModificationsCount.inc({ type: "delete", entity: key, jobType: message.jobType, hook }, value.deleted);
+      jobModificationsCount.inc({ type: "update", entity: key, jobType: message.jobType, hook }, value.updated);
+      jobModificationsCount.inc({ type: "insert", entity: key, jobType: message.jobType, hook }, value.created);
     }
 
-    const label = { jobType: message.jobType };
+    // if no hook was used, leave it undefined
+    const label = { jobType: message.jobType, hook };
+    // convert milliseconds to seconds
+    jobDuration.observe(label, message.duration / 1000);
+    jobNetworkRetryCount.inc(label, message.jobTrack.network.retryCount);
+    jobNetworkCloudflareCount.inc(label, message.jobTrack.network.cloudflareCount);
+    jobNetworkPuppeteerCount.inc(label, message.jobTrack.network.puppeteerCount);
     jobDBQueryCount.inc(label, message.jobTrack.queryCount);
     jobNetworkQueryCount.inc(label, message.jobTrack.network.count);
     jobNetworkSendCount.inc(label, message.jobTrack.network.sent);
diff --git a/packages/scraper/src/scheduler/job.ts b/packages/scraper/src/scheduler/job.ts
index 1f26bd9c..609e67d1 100644
--- a/packages/scraper/src/scheduler/job.ts
+++ b/packages/scraper/src/scheduler/job.ts
@@ -3,7 +3,7 @@ import { runAsync, Store, StoreKey } from "enterprise-core/dist/asyncStorage";
 import { jobStorage } from "enterprise-core/dist/database/storages/storage";
 import { JobError } from "enterprise-core/dist/error";
 import logger from "enterprise-core/dist/logger";
-import { stringify } from "enterprise-core/dist/tools";
+import { defaultNetworkTrack, stringify } from "enterprise-core/dist/tools";
 import { JobItem, JobState, Optional, ScrapeName } from "enterprise-core/dist/types";
 import { EndJobChannelMessage, StartJobChannelMessage } from "../externals/types";
 import { scrapeMapping } from "./scrapeJobs";
@@ -237,16 +237,12 @@ export class Job {
       jobType: item.type,
       jobTrack: {
         modifications: store.get(StoreKey.MODIFICATIONS) || {},
-        network: store.get(StoreKey.NETWORK) || {
-          count: 0,
-          sent: 0,
-          received: 0,
-          history: [],
-        },
+        network: store.get(StoreKey.NETWORK) || defaultNetworkTrack(),
         queryCount: store.get(StoreKey.QUERY_COUNT) || 0,
       },
       result,
       reason: result !== "success" ? store.get(StoreKey.MESSAGE) : undefined,
+      duration: Date.now() - (this.currentItem.runningSince?.getTime() ?? 0),
       timestamp: Date.now(),
     };
     jobChannel.publish(message);
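For context, the new `duration` field travels from `Job` over the `enterprise-jobs` diagnostics channel into the histogram above. A condensed, runnable sketch of that flow; the message shape is abridged to the fields used here and otherwise assumed:

```ts
import { channel, subscribe } from "diagnostics_channel";

// abridged: the real EndJobChannelMessage carries the full JobTrack
interface EndJobMessage {
  type: string;
  jobType: string;
  result: string;
  duration: number; // milliseconds, as published by Job
  jobTrack: { network: { hooksUsed: string[] } };
}

subscribe("enterprise-jobs", (message) => {
  const msg = message as EndJobMessage;
  if (msg.type === "finished") {
    // same attribution rule as metrics.ts: the first used hook wins
    const hook = msg.jobTrack.network.hooksUsed[0];
    console.log(`${msg.jobType} (${hook ?? "no hook"}): ${msg.duration / 1000}s`);
  }
});

// what Job does at the end of a run (simplified)
channel("enterprise-jobs").publish({
  type: "finished",
  jobType: "toc",
  result: "success",
  duration: 1500,
  jobTrack: { network: { hooksUsed: ["boxnovel"] } },
});
// => "toc (boxnovel): 1.5s"
```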