Skip to content

Commit

Permalink
Fix download filename clashes (#39)
Browse files Browse the repository at this point in the history
* downloaded filenames use download ids instead of urls now

* remove `sanitize-filename` dep

* move readline-sync from devDeps to deps
  • Loading branch information
andykais authored Oct 12, 2019
1 parent c64891b commit f6073f4
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 107 deletions.
24 changes: 1 addition & 23 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
"handlebars": "^4.4.3",
"jsonata": "^1.7.0",
"node-fetch": "^2.5.0",
"readline-sync": "^1.4.10",
"rxjs": "^6.5.2",
"sanitize-filename": "^1.6.1",
"typescript-is": "^0.12.6",
"verror": "^1.10.0"
},
Expand Down Expand Up @@ -74,7 +74,6 @@
"pre-push": "^0.1.1",
"prettier": "^1.17.1",
"raw-loader": "^3.1.0",
"readline-sync": "^1.4.10",
"shx": "^0.3.2",
"ts-loader": "^6.2.0",
"typescript": "^3.6.4",
Expand Down
6 changes: 3 additions & 3 deletions src/scraper/scrape-step/downloader/implementations/http.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import fetch from 'node-fetch'
import { createWriteStream } from 'fs'
import * as path from 'path'
import { mkdirp, sanitizeFilename } from '../../../../util/fs'
import { mkdirp } from '../../../../util/fs'
import { FMap } from '../../../../util/map'
import { ResponseError } from '../../../../util/errors'

Expand Down Expand Up @@ -77,7 +77,7 @@ export class Downloader extends AbstractDownloader<DownloadData> {

private downloadToFileAndMemory: FetchFunction = async (downloadId, [url, fetchOptions]) => {
const downloadFolder = path.resolve(this.params.folder, downloadId.toString())
const filename = path.resolve(downloadFolder, sanitizeFilename(url))
const filename = path.resolve(downloadFolder, downloadId.toString() + path.extname(url))

const response = await fetch(url, fetchOptions)
if (!response.ok) throw new ResponseError(response, url)
Expand All @@ -103,7 +103,7 @@ export class Downloader extends AbstractDownloader<DownloadData> {
}
private downloadToFileOnly: FetchFunction = async (downloadId, [url, fetchOptions]) => {
const downloadFolder = path.resolve(this.params.folder, downloadId.toString())
const filename = path.resolve(downloadFolder, sanitizeFilename(url))
const filename = path.resolve(downloadFolder, downloadId.toString() + path.extname(url))

const response = await fetch(url, fetchOptions)
if (!response.ok) throw new ResponseError(response, url)
Expand Down
3 changes: 0 additions & 3 deletions src/util/fs.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import * as path from 'path'
import * as fs from 'fs'
import { promisify } from 'util'
import sanitize from 'sanitize-filename'

const [mkdir, readdir, stat, unlink, rmdir, rename, access, readFile, writeFile] = [
fs.mkdir,
Expand Down Expand Up @@ -88,5 +87,3 @@ export const findFiles = async (folder: string): Promise<string[]> => {
}
return endpointFiles
}

/**
 * Runs `filename` through `sanitize` (imported from 'sanitize-filename'),
 * passing '_' as the `replacement` option.
 *
 * @param filename - the raw string to turn into a filename
 * @returns whatever `sanitize` produces for that input
 */
export const sanitizeFilename = (filename: string) => {
  const options = { replacement: '_' }
  return sanitize(filename, options)
}
77 changes: 38 additions & 39 deletions testing/functional/increment-gallery-site/expected-query-results.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export const expected: { [queryStr: string]: QueryResult } = {
downloadData:
'["http://increment-gallery-site.com/image/the.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/7/http___increment-gallery-site.com_image_the.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/7/7.jpg',
byteLength: '17.0',
complete: 1
}
Expand All @@ -26,7 +26,7 @@ export const expected: { [queryStr: string]: QueryResult } = {
downloadData:
'["http://increment-gallery-site.com/image/quick.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/10/http___increment-gallery-site.com_image_quick.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/10/10.jpg',
byteLength: '17.0',
complete: 1
}
Expand All @@ -41,7 +41,7 @@ export const expected: { [queryStr: string]: QueryResult } = {
downloadData:
'["http://increment-gallery-site.com/image/brown.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/16/http___increment-gallery-site.com_image_brown.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/16/16.jpg',
byteLength: '17.0',
complete: 1
}
Expand All @@ -56,16 +56,28 @@ export const expected: { [queryStr: string]: QueryResult } = {
downloadData:
'["http://increment-gallery-site.com/image/fox.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/19/http___increment-gallery-site.com_image_fox.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/19/19.jpg',
byteLength: '17.0',
complete: 1
}
]
}
],

'{"scrapers":["image","tag"],"groupBy":"image-page"}': [
{
image: [
{
id: 1,
scraper: 'image',
parsedValue: '',
downloadData:
'["http://increment-gallery-site.com/image/the.jpg",{"headers":{},"method":"GET"}]',
filename:
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/7/7.jpg',
byteLength: '17.0',
complete: 1
}
],
tag: [
{
id: 1,
Expand All @@ -85,22 +97,22 @@ export const expected: { [queryStr: string]: QueryResult } = {
byteLength: null,
complete: 1
}
],
]
},
{
image: [
{
id: 1,
id: 2,
scraper: 'image',
parsedValue: '',
downloadData:
'["http://increment-gallery-site.com/image/the.jpg",{"headers":{},"method":"GET"}]',
'["http://increment-gallery-site.com/image/quick.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/7/http___increment-gallery-site.com_image_the.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/10/10.jpg',
byteLength: '17.0',
complete: 1
}
]
},
{
],
tag: [
{
id: 2,
Expand All @@ -120,22 +132,22 @@ export const expected: { [queryStr: string]: QueryResult } = {
byteLength: null,
complete: 1
}
],
]
},
{
image: [
{
id: 2,
id: 11,
scraper: 'image',
parsedValue: '',
downloadData:
'["http://increment-gallery-site.com/image/quick.jpg",{"headers":{},"method":"GET"}]',
'["http://increment-gallery-site.com/image/brown.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/10/http___increment-gallery-site.com_image_quick.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/16/16.jpg',
byteLength: '17.0',
complete: 1
}
]
},
{
],
tag: [
{
id: 11,
Expand All @@ -155,22 +167,22 @@ export const expected: { [queryStr: string]: QueryResult } = {
byteLength: null,
complete: 1
}
],
]
},
{
image: [
{
id: 11,
id: 12,
scraper: 'image',
parsedValue: '',
downloadData:
'["http://increment-gallery-site.com/image/brown.jpg",{"headers":{},"method":"GET"}]',
'["http://increment-gallery-site.com/image/fox.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/16/http___increment-gallery-site.com_image_brown.jpg',
'/home/andrew/Build/dev/scrape-pages.after-force/testing/functional/.run-output/increment-gallery-site/image/19/19.jpg',
byteLength: '17.0',
complete: 1
}
]
},
{
],
tag: [
{
id: 12,
Expand Down Expand Up @@ -199,19 +211,6 @@ export const expected: { [queryStr: string]: QueryResult } = {
byteLength: null,
complete: 1
}
],
image: [
{
id: 12,
scraper: 'image',
parsedValue: '',
downloadData:
'["http://increment-gallery-site.com/image/fox.jpg",{"headers":{},"method":"GET"}]',
filename:
'/Users/andrew/Code/scratchwork/scrape-pages.linux/testing/functional/.run-output/increment-gallery-site/image/19/http___increment-gallery-site.com_image_fox.jpg',
byteLength: '17.0',
complete: 1
}
]
}
]
Expand Down
Loading

0 comments on commit f6073f4

Please sign in to comment.