Skip to content

Commit

Permalink
Download/Unzip with Node
Browse files Browse the repository at this point in the history
Fixes #30
  • Loading branch information
MarvNC committed Feb 4, 2024
1 parent 6b38b8f commit 47077d0
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 45 deletions.
12 changes: 11 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"scripts": {
"download": "node src/downloadLatest.js && sh src/scripts/download_unzip.sh",
"download": "node src/downloadLatest.js",
"buildTermDict": "node src/convertToTermDictionary.js",
"buildLatest": "npm run download && npm run buildTermDict",
"buildFreq": "node src/convertToFrequencyDictionary.js",
Expand All @@ -12,11 +12,12 @@
"csv-parser": "^3.0.0",
"jsdom": "^23.0.1",
"sharp": "^0.33.2",
"yomichan-dict-builder": "^2.2.0"
"yomichan-dict-builder": "^2.2.0",
"zlib": "^1.0.5"
},
"type": "module",
"devDependencies": {
"ava": "^6.0.1"
},
"version": "1.0.0"
}
}
68 changes: 61 additions & 7 deletions src/downloadLatest.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import { JSDOM } from 'jsdom';
import fs from 'fs';
import path from 'path';
import zlib from 'zlib';
import axios from 'axios';

const domain = 'https://words.hk';
const requestURL = `${domain}/faiman/request_data/`;
const csvURLtxt = 'csv.gz.URLs.txt';
const csvDir = 'csvs';

async function downloadLatest() {
(async function downloadLatest() {
const dom = await JSDOM.fromURL(requestURL);
const { document } = dom.window;
const csrfTokenInput = document.querySelector(
Expand Down Expand Up @@ -39,14 +42,18 @@ async function downloadLatest() {
throw new Error(`Response: ${response.status} ${response.statusText}`);
}
console.log('Request success, getting csv links...');
downloadCSVs(new JSDOM(text));
}
const csvLinks = await getCSVLinks(new JSDOM(text));

await downloadCSVs(csvLinks);
console.log('Download complete.');
})();

/**
*
* @param {JSDOM} dom
* @returns {Promise<string[]>} The URLs of the CSVs
*/
function downloadCSVs(dom) {
async function getCSVLinks(dom) {
const { document } = dom.window;

const csvLinkAnchors = /** @type {HTMLAnchorElement[]} */ ([
Expand All @@ -62,8 +69,55 @@ function downloadCSVs(dom) {

// Write to file
// const filePath = path.join(__dirname, csvURLtxt);
fs.writeFileSync(csvURLtxt, csvLinks.join('\n'));
console.log(`Wrote csv links to ${csvURLtxt}`);
// fs.writeFileSync(csvURLtxt, csvLinks.join('\n'));
// console.log(`Wrote csv links to ${csvURLtxt}`);
return csvLinks;
}

downloadLatest();
/**
 * Download each gzipped CSV and write its decompressed contents into `csvDir`.
 *
 * The directory is created if it is missing, and any files already in it are
 * deleted first. Archives are fetched one at a time and decompressed in
 * memory, so the returned promise settles only after every CSV has been fully
 * written to disk (or rejects on the first download/decompress/write error).
 * @param {string[]} csvLinks URLs of the gzipped CSV files to download
 * @returns {Promise<void>}
 */
async function downloadCSVs(csvLinks) {
  // `recursive: true` makes this a no-op when the directory already exists,
  // avoiding a separate existence check.
  fs.mkdirSync(csvDir, { recursive: true });

  // Delete contents of the directory so stale CSVs never mix with new ones
  for (const file of fs.readdirSync(csvDir)) {
    fs.unlinkSync(path.join(csvDir, file));
  }

  // Process each URL
  for (const url of csvLinks) {
    // Extract filename from URL
    const filename = path.basename(url);

    // Strip only a trailing ".gz"; a plain string replace would remove the
    // first ".gz" found anywhere in the name.
    const csvPath = path.join(csvDir, filename.replace(/\.gz$/, ''));

    console.log(`Downloading ${filename} from ${url}...`);
    const response = await axios.get(url, {
      responseType: 'arraybuffer',
    });

    // Decompress the downloaded bytes directly instead of writing the archive
    // to disk and streaming it back: nothing is left running after this
    // function returns, and failures surface as a rejection of this promise.
    console.log(`Unzipping ${filename}...`);
    const csvData = zlib.gunzipSync(Buffer.from(response.data));

    fs.writeFileSync(csvPath, csvData);
  }
}
34 changes: 0 additions & 34 deletions src/scripts/download_unzip.sh

This file was deleted.

0 comments on commit 47077d0

Please sign in to comment.