-
Notifications
You must be signed in to change notification settings - Fork 6
/
scrapeImagesAirTable.js
159 lines (131 loc) · 4.51 KB
/
scrapeImagesAirTable.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
const fs = require('fs')
const path = require('path')
const fetch = (...args) =>
import('node-fetch').then(({ default: fetch }) => fetch(...args))
// Folder that is scanned for asset URLs and whose files get rewritten
const buildOutFolder = './out'
// Name of the sub-folder (inside the build folder) that receives the mirrored assets
const mirrorFolderName = 'img-airtable'
// Matches absolute http(s) URLs whose host part contains
// "airtableusercontent.com", running up to the next quote or whitespace.
// The dot is escaped so that only a literal "." is accepted before "com".
const graphAssetUrlRegex =
  /https?:\/\/[^"'\s]+airtableusercontent\.com[^"'\s]*/g
const mirrorFolderPath = `${buildOutFolder}/${mirrorFolderName}`
// File-name suffixes of the files that are scanned for asset URLs
const filesToParse = ['html', 'js', 'json']
// Walks `folder` and every directory nested inside it, invoking
// `processFile(fullPath)` once for each regular file encountered.
// Entries that are neither files nor directories are skipped.
const processFilesRecursively = (folder, processFile) => {
  for (const entryName of fs.readdirSync(folder)) {
    const entryPath = path.join(folder, entryName)
    const entryStats = fs.statSync(entryPath)
    if (entryStats.isFile()) {
      processFile(entryPath)
      continue
    }
    if (entryStats.isDirectory()) {
      // Descend immediately so a directory's files are handled before its later siblings
      processFilesRecursively(entryPath, processFile)
    }
  }
}
// Scans every file under `folder` whose name ends with one of `filesToParse`
// and returns the de-duplicated array of URLs matching `graphAssetUrlRegex`.
// The scan is read-only: files are never written back, so their timestamps
// and bytes are left untouched.
const getAllAssetUrls = (folder = buildOutFolder) => {
  const assetUrls = new Set()
  processFilesRecursively(folder, filePath => {
    if (!filesToParse.some(type => filePath.endsWith(type))) return
    const content = fs.readFileSync(filePath, 'utf8')
    // String#match with a global regex yields null when nothing matches
    for (const url of content.match(graphAssetUrlRegex) ?? []) {
      assetUrls.add(url)
    }
  })
  return Array.from(assetUrls)
}
// Downloads every URL in `assetUrls` into `destination`, naming each file after
// the last path segment of its URL plus the extension reported by
// `determineFileExtension`.
//
// Returns `{ srcChanges }`: an array of `{ old, new }` pairs mapping each
// successfully saved URL to `/<destination folder name>/<file name>`.
// A URL that fails (non-ok response, timeout, undetectable type, write error)
// is logged and skipped; failures never reject the returned promise.
const downloadAssetsAndAddFileExtensions = async (
  assetUrls,
  destination = mirrorFolderPath
) => {
  const requestTimeoutMs = 15000
  const srcChanges = []
  // Derive the public folder name from where the files are actually written,
  // so rewritten paths stay correct when a custom `destination` is passed.
  const mirrorFolderName = path.basename(destination)
  for (const url of assetUrls) {
    const controller = new AbortController()
    const timeout = setTimeout(() => {
      controller.abort()
    }, requestTimeoutMs)
    try {
      // The signal has to be handed to fetch, otherwise abort() has no effect
      const response = await fetch(url, { signal: controller.signal })
      if (!response.ok) {
        console.error(
          `Failed to fetch ${url}: ${response.status} - ${response.statusText}`
        )
        continue
      }
      // arrayBuffer() is the standard Response method; the timer is still
      // armed here so a stalled body download is aborted as well.
      const buffer = Buffer.from(await response.arrayBuffer())
      const filename = path.basename(new URL(url).pathname)
      const extension = await determineFileExtension(buffer)
      const newFilename = `${filename}.${extension}`
      fs.writeFileSync(path.join(destination, newFilename), buffer)
      // Record the replacement only once the file exists on disk
      srcChanges.push({
        old: url,
        new: `/${mirrorFolderName}/${newFilename}`
      })
    } catch (error) {
      if (error.name === 'AbortError') {
        console.error(`Request for ${url} timed out`)
      } else {
        console.error(`Error fetching or saving ${url}:`, error.message)
      }
    } finally {
      // Release the timer on every path (success, `continue`, or failure)
      clearTimeout(timeout)
    }
  }
  return { srcChanges }
}
// Resolves the file extension to use for `buffer`.
// First asks `fileType.fileTypeFromBuffer`; a detected 'xml' is reported as
// 'svg', any other detected extension is returned as-is. When nothing is
// detected, content that (after trimming) starts with `<svg` or `<?xml` is
// treated as SVG. Throws if neither approach yields an extension.
const determineFileExtension = async buffer => {
  const detected = await fileType.fileTypeFromBuffer(buffer)
  if (detected) {
    return detected.ext === 'xml' ? 'svg' : detected.ext
  }
  const text = buffer.toString('utf8').trim()
  const looksLikeSvg = ['<svg', '<?xml'].some(prefix => text.startsWith(prefix))
  if (!looksLikeSvg) {
    throw new Error('Could not determine file extension')
  }
  return 'svg'
}
// Applies each `{ old, new }` pair in `replacements`, in order, to the text of
// every file under `basePath`, replacing all occurrences of `old` with `new`.
// Only files whose content actually changed are written back.
// Returns `{ count }`, the number of files that were rewritten.
const findAllAndReplace = (replacements, basePath = buildOutFolder) => {
  let count = 0
  processFilesRecursively(basePath, filePath => {
    const original = fs.readFileSync(filePath, 'utf8')
    // split/join performs a literal replace-all with no special handling of
    // "$" sequences in the replacement text
    const updated = replacements.reduce(
      (text, { old, new: replacement }) => text.split(old).join(replacement),
      original
    )
    if (updated === original) return
    count += 1
    fs.writeFileSync(filePath, updated)
  })
  return { count }
}
// Ensures `folderPath` exists as an empty directory: whatever is currently at
// that path is removed, then the directory (and any missing parents) is created.
function clearOrCreateMirrorFolder(folderPath = mirrorFolderPath) {
  // `force: true` already makes rmSync ignore a missing path, so no prior
  // existence check is needed; skipping it also avoids a check-then-act race
  // and handles a dangling symlink at `folderPath`.
  fs.rmSync(folderPath, { recursive: true, force: true })
  fs.mkdirSync(folderPath, { recursive: true })
}
// Holds the dynamically imported `file-type` module. It is assigned inside the
// entry point below, before `determineFileExtension` (which reads it) is called.
let fileType = undefined // needs to be dynamically imported
// Script entry point. Runs the steps in order:
//   1. load `file-type`
//   2. reset the mirror folder
//   3. collect asset URLs from the build folder
//   4. download the assets into the mirror folder
//   5. replace the original URLs in the build files with the mirrored paths
// Any error thrown by a step is logged by the catch below; nothing is rethrown.
;(async () => {
try {
console.log('😈 Beginning Asset Scrape AirTable 😈')
fileType = await import('file-type')
clearOrCreateMirrorFolder()
const imageUrls = getAllAssetUrls()
console.log(`Found ${imageUrls.length} asset urls in build folder`)
const { srcChanges } = await downloadAssetsAndAddFileExtensions(imageUrls)
console.log('Downloaded assets and added file extensions')
// NOTE(review): the download step saves files with fs.writeFileSync, so this
// 2 second pause may be unnecessary — confirm before removing.
await new Promise(resolve => setTimeout(resolve, 2000)) // Delay for file system sync
const { count } = findAllAndReplace(srcChanges)
console.log(`Replaced src strings in ${count} files`)
console.log('All Done ✅')
} catch (error) {
console.error('An error occurred:', error)
}
})()