From 47077d043a8aa638a31295820c3d2662013e0a79 Mon Sep 17 00:00:00 2001
From: MarvNC
Date: Sat, 3 Feb 2024 21:56:22 -0800
Subject: [PATCH] Download/Unzip with Node

Fixes #30
---
 package-lock.json             | 12 ++++++-
 package.json                  |  7 ++--
 src/downloadLatest.js         | 68 +++++++++++++++++++++++++++++++----
 src/scripts/download_unzip.sh | 34 ------------------
 4 files changed, 76 insertions(+), 45 deletions(-)
 delete mode 100644 src/scripts/download_unzip.sh

diff --git a/package-lock.json b/package-lock.json
index 759a394..c60d114 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -12,7 +12,8 @@
         "csv-parser": "^3.0.0",
         "jsdom": "^23.0.1",
         "sharp": "^0.33.2",
-        "yomichan-dict-builder": "^2.2.0"
+        "yomichan-dict-builder": "^2.2.0",
+        "zlib": "^1.0.5"
       },
       "devDependencies": {
         "ava": "^6.0.1"
@@ -3253,6 +3254,15 @@
       "dependencies": {
         "jszip": "^3.10.1"
       }
+    },
+    "node_modules/zlib": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/zlib/-/zlib-1.0.5.tgz",
+      "integrity": "sha512-40fpE2II+Cd3k8HWTWONfeKE2jL+P42iWJ1zzps5W51qcTsOUKM5Q5m2PFb0CLxlmFAaUuUdJGc3OfZy947v0w==",
+      "hasInstallScript": true,
+      "engines": {
+        "node": ">=0.2.0"
+      }
     }
   }
 }
diff --git a/package.json b/package.json
index 3d44df0..bdc407e 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "scripts": {
-    "download": "node src/downloadLatest.js && sh src/scripts/download_unzip.sh",
+    "download": "node src/downloadLatest.js",
     "buildTermDict": "node src/convertToTermDictionary.js",
     "buildLatest": "npm run download && npm run buildTermDict",
     "buildFreq": "node src/convertToFrequencyDictionary.js",
@@ -12,11 +12,12 @@
     "csv-parser": "^3.0.0",
     "jsdom": "^23.0.1",
     "sharp": "^0.33.2",
-    "yomichan-dict-builder": "^2.2.0"
+    "yomichan-dict-builder": "^2.2.0",
+    "zlib": "^1.0.5"
   },
   "type": "module",
   "devDependencies": {
     "ava": "^6.0.1"
   },
   "version": "1.0.0"
-}
\ No newline at end of file
+}
diff --git a/src/downloadLatest.js b/src/downloadLatest.js
index 4762af9..8105213 100644
--- a/src/downloadLatest.js
+++ b/src/downloadLatest.js
@@ -1,12 +1,15 @@
 import { JSDOM } from 'jsdom';
 import fs from 'fs';
 import path from 'path';
+import zlib from 'zlib';
+import axios from 'axios';
 
 const domain = 'https://words.hk';
 const requestURL = `${domain}/faiman/request_data/`;
 const csvURLtxt = 'csv.gz.URLs.txt';
+const csvDir = 'csvs';
 
-async function downloadLatest() {
+(async function downloadLatest() {
   const dom = await JSDOM.fromURL(requestURL);
   const { document } = dom.window;
   const csrfTokenInput = document.querySelector(
@@ -39,14 +42,18 @@
     throw new Error(`Response: ${response.status} ${response.statusText}`);
   }
   console.log('Request success, getting csv links...');
-  downloadCSVs(new JSDOM(text));
-}
+  const csvLinks = await getCSVLinks(new JSDOM(text));
+
+  await downloadCSVs(csvLinks);
+  console.log('Download complete.');
+})();
 
 /**
  *
  * @param {JSDOM} dom
+ * @returns {Promise} The URLs of the CSVs
  */
-function downloadCSVs(dom) {
+async function getCSVLinks(dom) {
   const { document } = dom.window;
 
   const csvLinkAnchors = /** @type {HTMLAnchorElement[]} */ ([
@@ -62,8 +69,55 @@
 
   // Write to file
   // const filePath = path.join(__dirname, csvURLtxt);
-  fs.writeFileSync(csvURLtxt, csvLinks.join('\n'));
-  console.log(`Wrote csv links to ${csvURLtxt}`);
+  // fs.writeFileSync(csvURLtxt, csvLinks.join('\n'));
+  // console.log(`Wrote csv links to ${csvURLtxt}`);
+  return csvLinks;
 }
 
-downloadLatest();
+/**
+ * Download the CSVs from the given URLs
+ * @param {string[]} csvLinks
+ */
+async function downloadCSVs(csvLinks) {
+  // Create the directory if it doesn't exist
+  if (!fs.existsSync(csvDir)) {
+    fs.mkdirSync(csvDir);
+  }
+
+  // Delete contents of the directory
+  fs.readdirSync(csvDir).forEach((file) => {
+    fs.unlinkSync(path.join(csvDir, file));
+  });
+
+  // Process each URL
+  for (const url of csvLinks) {
+    // Extract filename from URL
+    const filename = path.basename(url);
+
+    const fullPath = path.join(csvDir, filename);
+
+    console.log(`Downloading ${filename} from ${url}...`);
+
+    // Download the file from the URL to csvs directory
+    const response = await axios.get(url, {
+      responseType: 'arraybuffer',
+    });
+    const buffer = Buffer.from(response.data);
+
+    fs.writeFileSync(fullPath, buffer);
+
+    // Unzip the downloaded file
+    console.log(`Unzipping ${filename}...`);
+    const gzip = zlib.createGunzip();
+    const source = fs.createReadStream(fullPath);
+    const destination = fs.createWriteStream(
+      path.join(csvDir, filename.replace('.gz', ''))
+    );
+    source
+      .pipe(gzip)
+      .pipe(destination)
+      .on('finish', function () {
+        fs.unlinkSync(fullPath);
+      });
+  }
+}
diff --git a/src/scripts/download_unzip.sh b/src/scripts/download_unzip.sh
deleted file mode 100644
index 039dccd..0000000
--- a/src/scripts/download_unzip.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# Define the directory name
-csv_dir="csvs"
-
-# Create the directory if it doesn't exist
-mkdir -p "$csv_dir"
-
-# Delete contents of the directory if it exists
-rm -rf "$csv_dir"/*
-
-# Read each line in the file and process
-while IFS= read -r url || [ -n "$url" ]; do
-  # Ignore empty lines
-  if [ -n "$url" ]; then
-    # Extract filename from URL
-    filename=$(basename "$url")
-
-    # Check if the file already exists
-    if [ -f "$csv_dir/$filename" ]; then
-      echo "File $filename already exists. Skipping..."
-      continue
-    else
-      echo "File $filename does not exist. Downloading..."
-      # Download the file
-      echo "Downloading $filename..."
-      wget "$url" -O "$csv_dir/$filename"
-    fi
-
-    # Unzip the downloaded file
-    echo "Unzipping $filename..."
-    gzip -d "$csv_dir/$filename"
-  fi
-done
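
One caveat in the new downloadCSVs(): the gunzip pipe chain is started but never awaited, so the for loop (and the final "Download complete." log) can finish before the .csv files are fully written, and stream errors have no handler. Below is a minimal sketch of an awaited variant using Node's built-in stream/promises pipeline; it assumes the same gz path and output path the loop already computes, and the decompressGz helper name is illustrative only, not part of this patch.

import fs from 'fs';
import zlib from 'zlib';
import { pipeline } from 'stream/promises';

// Sketch only: decompress one downloaded .gz archive and wait for completion.
// `gzPath` corresponds to the fullPath written by downloadCSVs above; the
// helper name `decompressGz` is hypothetical and not part of this patch.
async function decompressGz(gzPath, outPath) {
  await pipeline(
    fs.createReadStream(gzPath),   // read the downloaded archive
    zlib.createGunzip(),           // decompress the gzip stream
    fs.createWriteStream(outPath)  // write the plain .csv
  );
  // Delete the archive only after the pipeline has fully flushed.
  await fs.promises.unlink(gzPath);
}

Called inside the loop as `await decompressGz(fullPath, path.join(csvDir, filename.replace('.gz', '')));`, decompression and cleanup would complete before "Download complete." is logged, and any stream error would reject the promise where it can be caught instead of firing an unhandled 'error' event.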