Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: get Xiaohongshu fulltext #17075

Closed
wants to merge 11 commits into from
6 changes: 6 additions & 0 deletions lib/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,9 @@ export type Config = {
device_id?: string;
refresh_token?: string;
};
xiaohongshu: {
cookie?: string;
};
ximalaya: {
token?: string;
};
Expand Down Expand Up @@ -772,6 +775,9 @@ const calculateValue = () => {
device_id: envs.XIAOYUZHOU_ID,
refresh_token: envs.XIAOYUZHOU_TOKEN,
},
xiaohongshu: {
cookie: envs.XIAOHONGSHU_COOKIE,
},
ximalaya: {
token: envs.XIMALAYA_TOKEN,
},
Expand Down
123 changes: 113 additions & 10 deletions lib/routes/xiaohongshu/notes.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import { Route } from '@/types';
import cache from '@/utils/cache';
import { getNotes, formatText, formatNote } from './util';
import { config } from '@/config';
import * as cheerio from 'cheerio';
import got from '@/utils/got';
import { formatNote, formatText, getNotes } from './util';

export const route: Route = {
path: '/user/:user_id/notes/fulltext',
path: '/user/:user_id/notes/:fulltext',
radar: [
{
source: ['xiaohongshu.com/user/profile/:user_id'],
Expand All @@ -15,25 +18,125 @@ export const route: Route = {
handler,
example: '/xiaohongshu/user/52d8c541b4c4d60e6c867480/notes/fulltext',
features: {
requireConfig: [
{
name: 'XIAOHONGSHU_COOKIE',
optional: true,
description: '小红书 cookie 值,可在浏览器控制台通过`document.cookie`获取。',
},
],
antiCrawler: true,
requirePuppeteer: true,
},
parameters: {
user_id: 'user id, length 24 characters',
fulltext: {
description: '是否获取全文',
default: '',
},
},
};

/**
 * Route handler for a Xiaohongshu user's notes feed.
 *
 * When `config.xiaohongshu.cookie` is set and the `fulltext` route parameter is
 * non-empty, the profile state is loaded with `getUser` and every note is
 * expanded with `renderNotesFulltext`. Otherwise the feed is built from
 * `getNotes` and `formatNote`.
 *
 * @param ctx - Request context; `ctx.req.param` is read for `user_id` and `fulltext`.
 * @returns The feed object with `title`, `description`, `image`, `link` and `item`.
 */
async function handler(ctx) {
    const userId = ctx.req.param('user_id');
    const url = `https://www.xiaohongshu.com/user/profile/${userId}`;

    if (config.xiaohongshu.cookie && ctx.req.param('fulltext')) {
        const user = await getUser(url, config.xiaohongshu.cookie);
        const notes = await renderNotesFulltext(user.notes, url);
        return {
            title: `${user.userPageData.basicInfo.nickname} - 笔记 • 小红书 / RED`,
            description: user.userPageData.basicInfo.desc,
            image: user.userPageData.basicInfo.imageb || user.userPageData.basicInfo.images,
            link: url,
            item: notes,
        };
    }

    // `getNotes` is only needed on this fallback path, so it is called here
    // rather than before the branch, where its result would be discarded.
    const { user, notes } = await getNotes(url, cache);
    return {
        title: `${user.nickname} - 笔记 • 小红书 / RED`,
        description: formatText(user.desc),
        image: user.imageb || user.images,
        link: url,
        item: notes.map((item) => formatNote(url, item)),
    };
}

async function getUser(url, cookie) {
const res = await got(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RSSHub will use a randomised user agent of Chrome on mac by default. Does the site only work with this fixed version of Chrome on Windows?

Cookie: cookie,
},
});
Comment on lines +67 to +72
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This request does not require a cookie.

const $ = cheerio.load(res.data);

let script = $('script')
.filter((i, script) => {
const text = script.children[0]?.data;
return text?.startsWith('window.__INITIAL_STATE__=');
})
.text();
script = script.slice('window.__INITIAL_STATE__='.length);
script = script.replaceAll('undefined', 'null');
Comment on lines +75 to +82
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
let script = $('script')
.filter((i, script) => {
const text = script.children[0]?.data;
return text?.startsWith('window.__INITIAL_STATE__=');
})
.text();
script = script.slice('window.__INITIAL_STATE__='.length);
script = script.replaceAll('undefined', 'null');
const script = $("script:contains('__INITIAL_STATE__')")
.text()
.match(/window\.__INITIAL_STATE__=(.*)/)?.[1]
?.replaceAll('undefined', 'null');

const state = JSON.parse(script);
return state.user;
}

/**
 * Builds full-text feed items for every note card in `notes`.
 *
 * `notes` is iterated as an array of arrays of `{ noteCard }` entries. Each
 * card's page (`${url}/${noteCard.noteId}`) is resolved through `getFullNote`,
 * and all lookups are started before any of them is awaited.
 *
 * @param notes - Nested list of note entries, each exposing a `noteCard`.
 * @param url - Profile URL used as the prefix of every note link.
 * @returns A promise of the feed items, in the order of the flattened input.
 */
async function renderNotesFulltext(notes, url) {
    const pending = notes.flatMap((group) =>
        group.map(async ({ noteCard }) => {
            const link = `${url}/${noteCard.noteId}`;
            const { title, description, pubDate } = await getFullNote(link);
            return {
                title,
                link,
                description,
                author: noteCard.user.nickName,
                guid: noteCard.noteId,
                pubDate,
            };
        })
    );
    // Promise.all already resolves to a new array, so no intermediate buffer
    // (and no spread into `push`) is needed.
    return Promise.all(pending);
}
Comment on lines +87 to +105
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove unnecessary spreading and push

Suggested change
async function renderNotesFulltext(notes, url) {
const data: any[] = [];
const promises = notes.flatMap((note) =>
note.map(async ({ noteCard }) => {
const link = `${url}/${noteCard.noteId}`;
const { title, description, pubDate } = await getFullNote(link);
return {
title,
link,
description,
author: noteCard.user.nickName,
guid: noteCard.noteId,
pubDate,
};
})
);
data.push(...(await Promise.all(promises)));
return data;
}
function renderNotesFulltext(notes, url) {
const promises = notes.flatMap((note) =>
note.map(async ({ noteCard }) => {
const link = `${url}/${noteCard.noteId}`;
const { title, description, pubDate } = await getFullNote(link);
return {
title,
link,
description,
author: noteCard.user.nickName,
guid: noteCard.noteId,
pubDate,
};
})
);
return Promise.all(promises);
}


return {
title: `${user.nickname} - 笔记 • 小红书 / RED`,
description: formatText(user.desc),
image: user.imageb || user.images,
link: url,
item: notes.map((item) => formatNote(url, item)),
};
async function getFullNote(link) {
const cookie = config.xiaohongshu.cookie;
const data = (await cache.tryGet(link, async () => {
const res = await got(link, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RSSHub will use a randomised user agent of Chrome on mac by default. Does the site only work with this fixed version of Chrome on Windows?

Cookie: cookie,
} as any,
});
const $ = cheerio.load(res.data);
let script = $('script')
.filter((i, script) => {
const text = script.children[0]?.data;
return text?.startsWith('window.__INITIAL_STATE__=');
})
.text();
script = script.slice('window.__INITIAL_STATE__='.length);
script = script.replaceAll('undefined', 'null');
Comment on lines +117 to +124
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

const state = JSON.parse(script);
const note = state.note.noteDetailMap[state.note.firstNoteId].note;
const images = note.imageList.map((image) => image.urlDefault);
const title = note.title;
let desc = note.desc;
desc = desc.replaceAll(/\[.*?\]/g, '');
desc = desc.replaceAll(/#(.*?)#/g, '#$1');
desc = desc.replaceAll('\n', '<br>');
const pubDate = new Date(note.time);
const description = `${images.map((image) => `<img src="${image}">`).join('')}<br>${title}<br>${desc}`;
return {
title,
description,
pubDate,
};
})) as Promise<{ title: string; description: string; pubDate: Date }>;
return data;
}