-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawler.js
126 lines (121 loc) · 3.74 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
// Algolia Crawler configuration for the docs.galacticbase.com documentation site.
// One action turns every matched page into DocSearch records for the
// "galacticbase" index; the settings for that index are declared at the bottom.
new Crawler({
  appId: "NHCE31YG9M",
  apiKey: "REDACTED",
  rateLimit: 8,
  maxDepth: 10,
  startUrls: ["https://docs.galacticbase.com/"],
  sitemaps: ["https://docs.galacticbase.com/sitemap.xml"],
  ignoreCanonicalTo: true,
  discoveryPatterns: ["https://docs.galacticbase.com/**"],
  actions: [
    {
      indexName: "galacticbase",
      pathsToMatch: ["https://docs.galacticbase.com/**"],
      /**
       * Builds the DocSearch record definition for a single crawled page.
       *
       * @param {object} page
       * @param {string} page.url - Absolute URL of the page being extracted.
       * @param {Function} page.$ - Selector function for the page; the results
       *   are queried here through `.attr()`, `.last()` and `.text()`.
       * @param {object} page.helpers - Crawler helpers; only `docsearch` is used.
       * @returns {*} Whatever `helpers.docsearch` returns for this page.
       */
      recordExtractor: ({ url, $, helpers }) => {
        // Reads the `content` attribute of the first element matched by
        // `selector`, falling back to "" when it is missing or empty.
        // Declared inside the extractor so the function stays self-contained.
        const metaContent = (selector) => $(selector).attr("content") || "";

        // The product is the first path segment of the URL
        // (private, serverless, or enterprise).
        const product = new URL(url).pathname.split("/")[1];

        // Page-level metadata copied onto every record of the page.
        const description = metaContent("meta[name='description']");
        const keywords = metaContent("meta[name='keywords']");
        const pinyin = metaContent("meta[name='pinyin']");

        // Top-level hierarchy title: the text of the last element matching
        // either the active sidebar sub-list link or the active navbar item,
        // or "Documentation" when that text is empty.
        const activeNavText = $(
          ".menu__link.menu__link--sublist.menu__link--active, .navbar__item.navbar__link--active",
        )
          .last()
          .text();
        const topLevelTitle = activeNavText || "Documentation";

        return helpers.docsearch({
          recordProps: {
            lvl0: {
              selectors: "",
              defaultValue: topLevelTitle,
            },
            // NOTE(review): "keywords" is passed as a selector here; confirm
            // that this is intended and not meant to be the meta keywords value.
            lvl1: ["keywords", "header h1", "article h1"],
            lvl2: "article h2",
            lvl3: "article h3",
            lvl4: "article h4",
            lvl5: "article h5, article td:first-child",
            lvl6: "article h6",
            content: "article p, article li, article td:last-child",
            // Custom attributes carrying the values computed above.
            description: { defaultValue: description },
            keywords: { defaultValue: keywords },
            pinyin: { defaultValue: pinyin },
            product: { defaultValue: product },
          },
          indexHeadings: true,
          aggregateContent: true,
          recordVersion: "v3",
        });
      },
    },
  ],
  // Index settings, keyed by index name (matches `indexName` of the action above).
  initialIndexSettings: {
    galacticbase: {
      attributesForFaceting: [
        "type",
        "lang",
        "language",
        "version",
        "docusaurus_tag",
        "product",
      ],
      attributesToRetrieve: [
        "keywords",
        "hierarchy",
        "content",
        "anchor",
        "url",
        "url_without_anchor",
        "type",
        "product",
      ],
      attributesToHighlight: ["hierarchy", "content"],
      attributesToSnippet: ["content:10"],
      camelCaseAttributes: ["hierarchy", "content"],
      // Page metadata is listed before the heading hierarchy; body content is last.
      searchableAttributes: [
        "unordered(keywords)",
        "unordered(description)",
        "unordered(pinyin)",
        "unordered(hierarchy.lvl0)",
        "unordered(hierarchy.lvl1)",
        "unordered(hierarchy.lvl2)",
        "unordered(hierarchy.lvl3)",
        "unordered(hierarchy.lvl4)",
        "unordered(hierarchy.lvl5)",
        "unordered(hierarchy.lvl6)",
        "content",
      ],
      distinct: true,
      attributeForDistinct: "url",
      customRanking: [
        "desc(weight.pageRank)",
        "desc(weight.level)",
        "asc(weight.position)",
      ],
      ranking: [
        "words",
        "filters",
        "typo",
        "attribute",
        "proximity",
        "exact",
        "custom",
      ],
      highlightPreTag: '<span class="algolia-docsearch-suggestion--highlight">',
      highlightPostTag: "</span>",
      minWordSizefor1Typo: 3,
      minWordSizefor2Typos: 7,
      allowTyposOnNumericTokens: false,
      minProximity: 1,
      ignorePlurals: true,
      advancedSyntax: true,
      attributeCriteriaComputedByMinProximity: true,
      removeWordsIfNoResults: "allOptional",
      separatorsToIndex: "_",
    },
  },
});