-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
223 lines (181 loc) · 6.94 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
const fs = require('fs');
const cheerio = require('cheerio');
// Bindings filled in once @xenova/transformers has finished loading.
// They are declared explicitly so the destructuring assignment below does not
// create implicit globals (and does not throw a ReferenceError if this file
// is ever run in strict mode or as an ES module).
let pipeline;
let env;
import("@xenova/transformers").then((transformers) => {
  ({ pipeline, env } = transformers);
  // Library settings chosen by this script: no browser cache, local models allowed.
  env.useBrowserCache = false;
  env.allowLocalModels = true;
});
// Dynamically import node-fetch
import("node-fetch").then((fetchModule) => {
const fetch = fetchModule.default;
// Initialize the embedding pipeline
// Cached pipeline instance; stays null until the first successful creation.
let embeddingPipeline = null;

/**
 * Returns the shared feature-extraction pipeline, creating it on first use.
 *
 * @returns {Promise<*>} Whatever `pipeline('feature-extraction', ...)` resolves to.
 */
async function initializeEmbeddingPipeline() {
  if (embeddingPipeline) {
    return embeddingPipeline;
  }
  // The original author picked MiniLM as a smaller, faster alternative to BERT.
  embeddingPipeline = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
  return embeddingPipeline;
}
// Helper functions

/**
 * Downloads a page and extracts the text used to describe a bookmark.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<{title: string, content: string, description: string}>}
 *   The extracted fields. All three are empty strings when the request
 *   fails, the server answers with a non-2xx status, or parsing throws;
 *   the error is logged and never propagated to the caller.
 */
async function fetchWebpageContent(url) {
  try {
    const response = await fetch(url);
    // Without this check an error page (404, 500, ...) would be scraped and
    // stored as if it were the bookmark's real title and content.
    if (!response.ok) {
      throw new Error(`Request failed with status ${response.status} ${response.statusText}`);
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Prefer <title>; fall back to the first <h1> when the title is missing or blank.
    const title = $("title").text().trim() || $("h1").first().text().trim();
    const description = $('meta[name="description"]').attr("content") || "";
    // Text of the usual main-content containers, with whitespace runs collapsed.
    const content = $("main, article, .content, #content")
      .text()
      .replace(/\s+/g, " ")
      .trim();

    return { title, content, description };
  } catch (error) {
    console.error(`Error fetching content for ${url}:`, error);
    return { title: '', content: '', description: '' };
  }
}
// Function to generate embedding for a given text
/**
 * Produces an embedding vector for a piece of text.
 *
 * @param {string} text - Text to embed; only its first 512 characters are used.
 * @returns {Promise<number[]>} The `data` of the pipeline output copied into a plain Array.
 * @throws Re-throws (after logging) any error raised while loading or running the pipeline.
 */
async function generateEmbedding(text) {
  const MAX_INPUT_CHARS = 512;
  const extractionOptions = { pooling: 'mean', normalize: true };
  try {
    const extractor = await initializeEmbeddingPipeline();
    // Character-based cut the original author added to stay under the model's input limit.
    const clippedText = text.slice(0, MAX_INPUT_CHARS);
    const output = await extractor(clippedText, extractionOptions);
    // A plain Array is easier to serialize and compare than the raw output buffer.
    return Array.from(output.data);
  } catch (error) {
    console.error("Error generating embedding:", error);
    throw error;
  }
}
// Cosine Similarity to compare embeddings
/**
 * Computes the cosine similarity of two equally long numeric vectors.
 *
 * @param {ArrayLike<number>} vecA - First vector.
 * @param {ArrayLike<number>} vecB - Second vector.
 * @returns {number} dot(vecA, vecB) / (|vecA| * |vecB|), or 0 when either
 *   vector has zero magnitude (this includes two empty vectors).
 * @throws {Error} "Invalid vectors provided for comparison" when either
 *   argument is missing; "Vectors must be of the same length" when the
 *   lengths differ.
 */
function cosineSimilarity(vecA, vecB) {
  if (!vecA || !vecB) {
    throw new Error("Invalid vectors provided for comparison");
  }
  // Kept as a separate guard so a length mismatch gets its own message; it
  // used to be unreachable because the first check also compared lengths.
  if (vecA.length !== vecB.length) {
    throw new Error("Vectors must be of the same length");
  }

  // One pass accumulates the dot product and both squared magnitudes.
  let dotProduct = 0;
  let squaredMagnitudeA = 0;
  let squaredMagnitudeB = 0;
  for (let i = 0; i < vecA.length; i += 1) {
    const a = vecA[i];
    const b = vecB[i];
    dotProduct += a * b;
    squaredMagnitudeA += a * a;
    squaredMagnitudeB += b * b;
  }

  const magnitudeA = Math.sqrt(squaredMagnitudeA);
  const magnitudeB = Math.sqrt(squaredMagnitudeB);
  // A zero vector has no direction; report "no similarity" instead of NaN.
  if (magnitudeA === 0 || magnitudeB === 0) {
    return 0;
  }
  return dotProduct / (magnitudeA * magnitudeB);
}
// Enhance bookmark with content and embedding
/**
 * Builds an enriched copy of a bookmark: page title, description and content
 * fetched from its URL, plus an embedding of the combined text.
 *
 * The input object is never mutated. Any failure is logged and the copy is
 * returned with whatever was gathered so far, so callers always get an
 * object back and can test `embedding` to see whether embedding succeeded.
 *
 * @param {object} bookmark - Bookmark record; `site` is read as its URL, and
 *   `title`, `description`, `content`, `category` and `tag` are used when present.
 * @returns {Promise<object>} Copy of `bookmark`, with `title`, `description`,
 *   `content` and `embedding` filled in when available.
 */
async function enhanceBookmark(bookmark) {
  // `category` / `tag` may be absent, a single value, or a list.
  const toList = (value) => {
    if (Array.isArray(value)) {
      return value;
    }
    return value == null ? [] : [value];
  };

  const enhancedBookmark = { ...bookmark };
  try {
    // Skip fetching when the host is a literal IPv4 address. The pattern
    // matches any dotted quad, not only private ranges.
    if (!bookmark.site.match(/^https?:\/\/(\d{1,3}\.){3}\d{1,3}/)) {
      const webContent = await fetchWebpageContent(bookmark.site);
      // The fetch helper reports failure as empty strings; do not let those
      // wipe out values the bookmark already had.
      enhancedBookmark.title = webContent.title || bookmark.title || '';
      enhancedBookmark.description = webContent.description || bookmark.description || '';
      enhancedBookmark.content = webContent.content || bookmark.content || '';
    }

    // Combine text for embedding (title, description, content, categories, tags)
    const textForEmbedding = [
      enhancedBookmark.title,
      enhancedBookmark.description,
      enhancedBookmark.content,
      ...toList(enhancedBookmark.category),
      ...toList(enhancedBookmark.tag),
    ].filter(Boolean).join(" ");

    // Generate embedding if text is available
    if (textForEmbedding) {
      enhancedBookmark.embedding = await generateEmbedding(textForEmbedding);
    }
  } catch (error) {
    console.error(`Error enhancing bookmark ${bookmark?.site}:`, error);
  }
  return enhancedBookmark;
}
// Perform semantic search
/**
 * Ranks bookmarks by similarity between their embedding and the query's.
 *
 * @param {string} query - Search text to embed.
 * @param {object[]} bookmarks - Candidates; entries without a truthy `embedding` are ignored.
 * @param {number} [limit=5] - Maximum number of results to return.
 * @returns {Promise<object[]>} Copies of the best matches, each with an added
 *   `similarity` field, ordered from most to least similar.
 * @throws Re-throws (after logging) any error raised while embedding or scoring.
 */
async function semanticSearch(query, bookmarks, limit = 5) {
  const byDescendingSimilarity = (left, right) => right.similarity - left.similarity;
  try {
    const queryEmbedding = await generateEmbedding(query);

    const searchable = bookmarks.filter((entry) => Boolean(entry.embedding));
    const scored = searchable.map((entry) => {
      const similarity = cosineSimilarity(queryEmbedding, entry.embedding);
      return { ...entry, similarity };
    });
    scored.sort(byDescendingSimilarity);

    return scored.slice(0, limit);
  } catch (error) {
    console.error("Error during semantic search:", error);
    throw error;
  }
}
// Load bookmarks from file
/**
 * Reads and parses a JSON file of bookmarks.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {Promise<*>} The parsed JSON value, or an empty array (after
 *   logging) when the file does not exist.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 */
async function loadBookmarksFromFile(filePath) {
  if (!fs.existsSync(filePath)) {
    console.error(`${filePath} not found`);
    return [];
  }
  const rawJson = fs.readFileSync(filePath, "utf-8");
  return JSON.parse(rawJson);
}
// Save enhanced bookmarks to file
/**
 * Writes the bookmarks to disk as JSON indented with two spaces.
 *
 * @param {*} bookmarks - Value to serialize (normally the enhanced bookmark list).
 * @param {string} outputFilePath - Destination file; overwritten if it exists.
 */
function saveEnhancedBookmarks(bookmarks, outputFilePath) {
  const serialized = JSON.stringify(bookmarks, null, 2);
  fs.writeFileSync(outputFilePath, serialized);
}
// Main function to enhance bookmarks and perform search
/**
 * Runs the whole workflow: load `site.json`, enhance every bookmark, write
 * `enhanced_bookmarks.json` and `embedding.json`, then run a sample search
 * for "icon" and print the results.
 *
 * Errors are caught and logged here, so the returned promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const embeddingFilePath = 'embedding.json'; // Output file for embeddings
    const inputFilePath = 'site.json'; // Path to your site.json file
    const outputFilePath = 'enhanced_bookmarks.json'; // Output file for enhanced bookmarks

    // Load bookmarks
    const bookmarks = await loadBookmarksFromFile(inputFilePath);

    const embeddingArray = [];
    const enhancedBookmarks = [];
    for (const bookmark of bookmarks) {
      // If enhancement resolves to nothing (e.g. after an internal failure),
      // keep the original record. Otherwise one bad bookmark would crash the
      // loop on `.embedding` and no output file would be written at all.
      const enhancedBookmark = (await enhanceBookmark(bookmark)) ?? bookmark;
      enhancedBookmarks.push(enhancedBookmark);

      // Collect embeddings separately for the embeddings-only file.
      if (enhancedBookmark.embedding) {
        embeddingArray.push(enhancedBookmark.embedding);
      }
      console.log(`Enhanced bookmark: ${bookmark.site}`);
    }

    // Save enhanced bookmarks to a new file
    saveEnhancedBookmarks(enhancedBookmarks, outputFilePath);
    // Save embeddings to a separate file
    saveEmbeddings(embeddingArray, embeddingFilePath);

    // Perform semantic search with a sample query
    const query = "icon";
    const searchResults = await semanticSearch(query, enhancedBookmarks);

    // Display search results
    console.log("\nSearch Results:");
    searchResults.forEach((result, index) => {
      console.log(`${index + 1}. ${result.title || result.site}`);
      console.log(` URL: ${result.site}`);
      console.log(` Similarity: ${(result.similarity * 100).toFixed(2)}%`);
      if (result.description) {
        console.log(` Description: ${result.description}`);
      }
    });
  } catch (error) {
    console.error("Error in main function:", error);
  }
}
// Run the main function. main() wraps its whole body in try/catch and logs
// any error itself, so the promise it returns is deliberately not awaited.
main();
// Save embeddings to a separate file
/**
 * Writes the embeddings to disk as JSON indented with two spaces.
 *
 * @param {*} embeddings - Value to serialize (normally a list of embedding vectors).
 * @param {string} outputFilePath - Destination file; overwritten if it exists.
 */
function saveEmbeddings(embeddings, outputFilePath) {
  const serialized = JSON.stringify(embeddings, null, 2);
  fs.writeFileSync(outputFilePath, serialized);
}
});