Skip to content

Commit

Permalink
Merge pull request #44 from mytlogos/scraper
Browse files Browse the repository at this point in the history
Scraper
  • Loading branch information
mytlogos authored Nov 30, 2020
2 parents bec1551 + 7d08a40 commit 0fcd87f
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 19 deletions.
4 changes: 2 additions & 2 deletions src/server/bin/database/contexts/externalUserContext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,12 @@ export class ExternalUserContext extends SubContext {
}

/**
*
* Return all ExternalUser not scraped in the last seven days.
*/
public async getScrapeExternalUser(): Promise<ExternalUser[]> {
const result = await this.query(
"SELECT uuid, local_uuid, service, cookies, name, last_scrape FROM external_user " +
"WHERE last_scrape IS NULL OR last_scrape > NOW() - 7",
"WHERE last_scrape IS NULL OR last_scrape < TIMESTAMPADD(day, -7, now())",
);

return result.map((value: any): ExternalUser => {
Expand Down
7 changes: 4 additions & 3 deletions src/server/bin/externals/direct/boxNovelScraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ async function search(text: string): Promise<SearchResult[]> {

if (parsed.success && parsed.data && parsed.data.length) {
for (const datum of parsed.data) {
searchResults.push({link: datum.url, title: datum.title});
searchResults.push({link: datum.url.replace("-boxnovel", ""), title: datum.title});
}
}
return searchResults;
Expand Down Expand Up @@ -80,7 +80,7 @@ export async function searchAjax(searchWords: string, medium: TocSearchMedium):
}
for (const datum of parsed.data) {
if (equalsIgnore(datum.title, medium.title) || medium.synonyms.some((s) => equalsIgnore(datum.title, s))) {
return {value: datum.url, done: true};
return {value: datum.url.replace("-boxnovel", ""), done: true};
}
}
return {done: false};
Expand Down Expand Up @@ -287,7 +287,8 @@ async function newsAdapter(): VoidablePromise<{ news?: News[]; episodes?: Episod
const newsRow = items.eq(i);

const mediumTitleElement = newsRow.find(".post-title a");
const tocLink = url.resolve(uri, mediumTitleElement.attr("href") as string);
const tocLink = url.resolve(uri, mediumTitleElement.attr("href") as string).replace("-boxnovel", "");

const mediumTitle = sanitizeString(mediumTitleElement.text());

const titleElement = newsRow.find(".chapter-item .chapter a");
Expand Down
18 changes: 14 additions & 4 deletions src/server/bin/externals/direct/gogoAnimeScraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,21 @@ async function scrapeNews(): Promise<NewsScrapeResult> {

const news: EpisodeNews[] = [];
const titlePattern = /Episode\s*((\d+)(\.(\d+))?)/i;
const linkPattern = /(.+\/\/.+\/)(.+)-episode-\d+$/;

for (let i = 0; i < newsRows.length; i++) {
const newsRow = newsRows.eq(i);

const mediumElement = newsRow.find(".name a");
const link = url.resolve(uri, mediumElement.attr("href") as string);
const linkMatch = linkPattern.exec(link)

if (!linkMatch) {
logger.warn(`Unknown GogoAnime News Link Format: '${link}'`);
continue
}
const tocLink = linkMatch[1] + "category/" + linkMatch[2];

const episodeTitleElement = newsRow.children(".episode");

const mediumTitle = sanitizeString(mediumElement.text());
Expand All @@ -43,7 +53,7 @@ async function scrapeNews(): Promise<NewsScrapeResult> {
news.push({
link,
mediumType: MediaType.VIDEO,
mediumTocLink: link,
mediumTocLink: tocLink,
mediumTitle,
episodeTitle,
episodeIndex: episodeIndices.combi,
Expand All @@ -60,13 +70,13 @@ async function scrapeNews(): Promise<NewsScrapeResult> {
}

async function scrapeToc(urlString: string): Promise<Toc[]> {
const animeAliasReg = /^https?:\/\/www\d*\.gogoanime\.io\/category\/(.+)/;
const animeAliasReg = /^https?:\/\/(www\d*\.)?gogoanime\.io\/category\/(.+)/;
const aliasExec = animeAliasReg.exec(urlString);

if (!aliasExec) {
throw new UrlError("invalid toc url for GogoAnime: " + urlString, urlString);
}
const animeAlias = aliasExec[1];
const animeAlias = aliasExec[2];

const $ = await queueCheerioRequest(urlString);
const contentElement = $(".content_left .main_body");
Expand Down Expand Up @@ -223,7 +233,7 @@ export function getHook(): Hook {
return {
name: "gogoanime",
medium: MediaType.VIDEO,
domainReg: /^https?:\/\/www\d*\.gogoanime\.io/,
domainReg: /^https?:\/\/(www\d*\.)?gogoanime\.io/,
searchAdapter: search,
newsAdapter: scrapeNews,
tocAdapter: scrapeToc,
Expand Down
1 change: 1 addition & 0 deletions src/server/bin/externals/jobScraperManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ export class JobScraperManager {
const store = getStore();
if (store) {
store.set("label", [`job-${item.id}-${item.name}`]);
store.set("lastRun", item.lastRun);
}
}

Expand Down
7 changes: 4 additions & 3 deletions src/server/bin/externals/scraperTools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -578,9 +578,9 @@ export const toc = async (value: TocRequest): Promise<TocResult> => {
/**
* Scrapes ListWebsites and follows possible redirected pages.
*/
export const list = async (value: { cookies: string; uuid: Uuid }): Promise<ExternalListResult> => {
export const list = async (value: { info: string; uuid: Uuid }): Promise<ExternalListResult> => {
// TODO: 10.03.2020 for now list scrape novelupdates only, later it should take listtype as an argument
const manager = factory(ListType.NOVELUPDATES, value.cookies);
const manager = factory(ListType.NOVELUPDATES, value.info);
try {
const lists = await manager.scrapeLists();
const listsPromise: EmptyPromise = Promise.all(lists.lists.map(
Expand Down Expand Up @@ -622,7 +622,8 @@ export const list = async (value: { cookies: string; uuid: Uuid }): Promise<Exte
type: ListType.NOVELUPDATES,
// TODO: add useruuid here
userUuid: "",
...value
cookies: value.info,
uuid: value.uuid
},
lists
};
Expand Down
35 changes: 31 additions & 4 deletions src/server/bin/jobHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ interface TocPartMapping {
}>;
}

async function addPartEpisodes(value: TocPartMapping, storageEpisodes: Readonly<CombinedEpisode[]>, storageReleses: Readonly<EpisodeRelease[]>): EmptyPromise {
async function addPartEpisodes(value: TocPartMapping, storageEpisodes: Readonly<CombinedEpisode[]>, storageReleases: Readonly<EpisodeRelease[]>): EmptyPromise {
if (!value.part || !value.part.id) {
throw Error(`something went wrong. got no part for tocPart ${value.tocPart.combiIndex}`);
}
Expand Down Expand Up @@ -288,7 +288,7 @@ async function addPartEpisodes(value: TocPartMapping, storageEpisodes: Readonly<
episodeId: 0,
title: episodeToc.tocEpisode.title,
url: episodeToc.tocEpisode.url,
releaseDate: episodeToc.tocEpisode.releaseDate || new Date()
releaseDate: getLatestDate(episodeToc.tocEpisode.releaseDate || new Date())
}]
};
});
Expand All @@ -312,13 +312,13 @@ async function addPartEpisodes(value: TocPartMapping, storageEpisodes: Readonly<
throw Error("known episode has no episode from storage");
}
const id = currentEpisode.id;
const foundRelease = storageReleses.find((release) =>
const foundRelease = storageReleases.find((release) =>
release.url === episodeValue.tocEpisode.url
&& release.episodeId === id);

const tocRelease: EpisodeRelease = {
episodeId: id,
releaseDate: episodeValue.tocEpisode.releaseDate || new Date(),
releaseDate: getLatestDate(episodeValue.tocEpisode.releaseDate || new Date()),
title: episodeValue.tocEpisode.title,
url: episodeValue.tocEpisode.url,
locked: episodeValue.tocEpisode.locked,
Expand Down Expand Up @@ -459,6 +459,33 @@ export async function tocHandler(result: TocResult): EmptyPromise {
await Promise.all((await Promise.all(promises)).flat());
}

/**
 * Choose the more plausible of a given date and the "lastRun" value
 * held in the store returned by getStore().
 *
 * Some sites publish only the day of a release without a time of day, so a
 * release discovered by the current job can look older than it really is.
 * When "lastRun" is not earlier than the given date and both fall on the
 * same calendar day (compared via Date#toDateString, i.e. in local time),
 * "lastRun" is used in place of the given date.
 *
 * In every other case (no store, no "lastRun" value, "lastRun" earlier than
 * the given date, or the two lying on different days) the given date is
 * returned unchanged.
 *
 * The result should still be checked against the dates in the storage.
 *
 * @param date date to check
 * @returns either the given date or the stored "lastRun" value
 */
function getLatestDate(date: Date): Date {
    const lastRun = getStore()?.get("lastRun") as Date | undefined;

    // lastRun only wins if it exists, is not earlier than the given date
    // and lies on the very same day as the given date
    if (lastRun && !(lastRun < date) && date.toDateString() === lastRun.toDateString()) {
        return lastRun;
    }
    return date;
}


async function addFeeds(feeds: string[]): EmptyPromise {
if (!feeds.length) {
return;
Expand Down
3 changes: 0 additions & 3 deletions src/server/bin/logger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ const logger = winston.createLogger({
exceptionHandlers: [
new winston.transports.File({
filename: filePrefix + "exception.log",
zippedArchive: true,
maxsize: 10_000_000
})
],
Expand All @@ -51,12 +50,10 @@ const logger = winston.createLogger({
new winston.transports.File({
filename: filePrefix + "error.log",
level: "error",
zippedArchive: true,
maxsize: 10_000_000
}),
new winston.transports.File({
filename: filePrefix + "combined.log",
zippedArchive: true,
maxsize: 20_000_000
}),
new winston.transports.Console({
Expand Down

0 comments on commit 0fcd87f

Please sign in to comment.