From d3b3df2a949ddf81ea1f71a6a789ef35a35e10be Mon Sep 17 00:00:00 2001 From: MarvNC Date: Thu, 11 Jul 2024 18:08:51 -0700 Subject: [PATCH] Move downloads to ts --- .gitignore | 2 +- readme.md | 10 +++----- src/downloadDump.sh | 23 ------------------- src/index.ts | 9 ++++---- src/util/downloadDumps.ts | 48 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 35 deletions(-) delete mode 100755 src/downloadDump.sh create mode 100644 src/util/downloadDumps.ts diff --git a/.gitignore b/.gitignore index 929e9d1..3b44fc6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ *.bz2 *.zip -.tsimp +/download # Created by https://www.toptal.com/developers/gitignore/api/node # Edit at https://www.toptal.com/developers/gitignore?templates=node diff --git a/readme.md b/readme.md index 5c8c440..2fadfed 100644 --- a/readme.md +++ b/readme.md @@ -60,13 +60,7 @@ The code in this repository is licensed under the MIT license. This project uses bun. -To download the abstracts for a language, run: - -```sh -/src/downloadDump.sh ja 2022.12.01 -``` - -To build a dictionary, run: +To download and build a dictionary, run: ```sh bun run start -l ja -d 2022-12-01 @@ -74,3 +68,5 @@ bun run start -l ja -d 2022-12-01 where `ja` is the language code and `2022.12.01` is the date of the dump (there are no newer DBPedia versions). + +You must be running this on Linux with `wget` and `bzip2` installed. diff --git a/src/downloadDump.sh b/src/downloadDump.sh deleted file mode 100755 index 3af78c2..0000000 --- a/src/downloadDump.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Check for two arguments: language (en, de, fr, ...) and date (yyyy.mm.dd) -if [ $# -ne 2 ]; then - echo "Usage: $0 " - exit 1 -fi - -ARCHIVE="short-abstracts_lang=$1.ttl.bz2" -URL="https://databus.dbpedia.org/dbpedia/text/short-abstracts/$2/$ARCHIVE" -FILE="short-abstracts_lang=$1.ttl" - -# Download the archive if neither the file nor archive exists -if [ ! -f "$FILE" ] && [ ! 
-f "$ARCHIVE" ]; then - echo "Downloading $URL" - wget "$URL" -fi - -# Extract the archive if it does not exist -if [ ! -f "$FILE" ]; then - echo "Extracting $ARCHIVE" - bzip2 -dc "$ARCHIVE" >"$FILE" -fi diff --git a/src/index.ts b/src/index.ts index 7173d37..b0a0769 100644 --- a/src/index.ts +++ b/src/index.ts @@ -11,11 +11,10 @@ import type { } from 'yomichan-dict-builder/dist/types/yomitan/termbank'; import * as cliProgress from 'cli-progress'; +import { downloadDumps } from './util/downloadDumps'; const outputZipName = (lang: string, date: string, version: string) => `${lang} Wikipedia [${date}] (v${version}).zip`; -const shortAbstractFile = (lang: string) => - `short-abstracts_lang=${lang.toLowerCase()}.ttl`; (async () => { const version = await getVersion(); @@ -23,9 +22,11 @@ const shortAbstractFile = (lang: string) => console.log(`Using version ${version}`); const { lang, date } = readArgs(); - console.log(`Converting ${lang} Wikipedia dump from ${date}...`); - const filePath = shortAbstractFile(lang); + + console.log(`Converting ${lang} Wikipedia dump from ${date}...`); + + const filePath = await downloadDumps(lang, date); const fileHandle = file(filePath); const fileReader = fileHandle.stream(); const lineReader = fileReader.getReader(); diff --git a/src/util/downloadDumps.ts b/src/util/downloadDumps.ts new file mode 100644 index 0000000..06112d3 --- /dev/null +++ b/src/util/downloadDumps.ts @@ -0,0 +1,48 @@ +import { $ } from 'bun'; +import { exists } from 'node:fs/promises'; +import { join } from 'path'; + +const ARCHIVE = (lang: string) => + `short-abstracts_lang=${lang.toLowerCase()}.ttl.bz2`; +const FILE = (lang: string) => `short-abstracts_lang=${lang.toLowerCase()}.ttl`; +const URL = (lang: string, date: string) => + `https://databus.dbpedia.org/dbpedia/text/short-abstracts/${date}/${ARCHIVE( + lang + )}`; +const DOWNLOAD_DIR = './download'; + +export async function downloadDumps(lang: string, date: string) { + // Replace - with . 
in date + date = date.replace(/-/g, '.'); + // Check if date is in format YYYY.MM.DD + if (!/^\d{4}\.\d{2}\.\d{2}$/.test(date)) { + throw new Error(`Invalid date format: ${date}`); + } + + // TODO: Check lang + + // Create the download directory if it does not exist + if (!(await exists(DOWNLOAD_DIR))) { + await $`mkdir ${DOWNLOAD_DIR}`; + } + + const archivePath = join(DOWNLOAD_DIR, ARCHIVE(lang)); + const filePath = join(DOWNLOAD_DIR, FILE(lang)); + const archiveExists = await exists(archivePath); + const fileExists = await exists(filePath); + + // Download the archive if neither the file nor archive exists + if (!fileExists && !archiveExists) { + const url = URL(lang, date); + console.log(`Downloading ${url}`); + await $`wget ${url} -O ${archivePath}`; + } + + // Extract the archive if the extracted file does not exist + if (!fileExists) { + console.log(`Extracting ${archivePath}`); + await $`bzip2 -dc ${archivePath} >${filePath}`; + } + + return filePath; +}