Skip to content

Commit

Permalink
Move downloads to ts
Browse files Browse the repository at this point in the history
  • Loading branch information
MarvNC committed Jul 12, 2024
1 parent ea6b027 commit d3b3df2
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*.bz2
*.zip

.tsimp
/download

# Created by https://www.toptal.com/developers/gitignore/api/node
# Edit at https://www.toptal.com/developers/gitignore?templates=node
Expand Down
10 changes: 3 additions & 7 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,13 @@ The code in this repository is licensed under the MIT license.

This project uses bun.

To download the abstracts for a language, run:

```sh
/src/downloadDump.sh ja 2022.12.01
```

To build a dictionary, run:
To download and build a dictionary, run:

```sh
bun run start -l ja -d 2022-12-01
```

where `ja` is the language code and `2022-12-01` is the date of the dump (there
are no newer DBpedia versions).

You must be running this on Linux with `wget` and `bzip2` installed.
23 changes: 0 additions & 23 deletions src/downloadDump.sh

This file was deleted.

9 changes: 5 additions & 4 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,22 @@ import type {
} from 'yomichan-dict-builder/dist/types/yomitan/termbank';

import * as cliProgress from 'cli-progress';
import { downloadDumps } from './util/downloadDumps';

/**
 * Builds the file name of the generated dictionary archive.
 *
 * @param lang - Language code, used verbatim in the name.
 * @param date - Dump date, used verbatim inside the square brackets.
 * @param version - Version string, emitted after a `v` prefix.
 * @returns A name of the form `<lang> Wikipedia [<date>] (v<version>).zip`.
 */
const outputZipName = (lang: string, date: string, version: string): string => {
  const title = `${lang} Wikipedia [${date}]`;
  return `${title} (v${version}).zip`;
};
/**
 * Returns the file name of the decompressed short-abstracts dump for a
 * language. The language code is lower-cased before being embedded.
 */
const shortAbstractFile = (lang: string): string => {
  const code = lang.toLowerCase();
  return `short-abstracts_lang=${code}.ttl`;
};

(async () => {
const version = await getVersion();

console.log(`Using version ${version}`);

const { lang, date } = readArgs();
console.log(`Converting ${lang} Wikipedia dump from ${date}...`);

const filePath = shortAbstractFile(lang);

console.log(`Converting ${lang} Wikipedia dump from ${date}...`);

const filePath = await downloadDumps(lang, date);
const fileHandle = file(filePath);
const fileReader = fileHandle.stream();
const lineReader = fileReader.getReader();
Expand Down
48 changes: 48 additions & 0 deletions src/util/downloadDumps.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { $ } from 'bun';
import { exists } from 'node:fs/promises';
import { join } from 'path';

/** Directory (relative to the working directory) where dumps are stored. */
const DOWNLOAD_DIR = './download';

/** File name of the decompressed short-abstracts dump for `lang`. */
const FILE = (lang: string): string => {
  const code = lang.toLowerCase();
  return `short-abstracts_lang=${code}.ttl`;
};

/** File name of the bzip2-compressed short-abstracts dump for `lang`. */
const ARCHIVE = (lang: string): string => {
  const code = lang.toLowerCase();
  return `short-abstracts_lang=${code}.ttl.bz2`;
};

/**
 * Download URL of the compressed dump for `lang`; `date` is inserted
 * verbatim as the version path segment.
 */
const URL = (lang: string, date: string): string => {
  const archiveName = ARCHIVE(lang);
  return `https://databus.dbpedia.org/dbpedia/text/short-abstracts/${date}/${archiveName}`;
};

/**
 * Ensures the decompressed short-abstracts dump for `lang` is present in the
 * download directory, downloading and extracting it when necessary.
 *
 * Steps already completed by an earlier run are skipped: if the `.ttl` file
 * exists nothing is done; if only the `.bz2` archive exists it is extracted
 * without being downloaded again.
 *
 * NOTE(review): the cached file names do not include the dump date, so a file
 * downloaded for one date is reused for any other date of the same language.
 *
 * @param lang - Language code of the dump (e.g. `ja`).
 * @param date - Dump date as `YYYY-MM-DD` or `YYYY.MM.DD`.
 * @returns Path of the decompressed `.ttl` file.
 * @throws Error if `date` or `lang` is malformed; shell failures (download,
 *   extraction) propagate from the Bun shell.
 */
export async function downloadDumps(
  lang: string,
  date: string
): Promise<string> {
  // The download URL uses dots as the date separator; accept dashes too.
  const version = date.replace(/-/g, '.');
  // Check that the date is in the format YYYY.MM.DD
  if (!/^\d{4}\.\d{2}\.\d{2}$/.test(version)) {
    throw new Error(`Invalid date format: ${version}`);
  }

  // `lang` ends up in a filesystem path and a URL, so reject anything that
  // is not a plain code (letters, digits, `-`, `_`) — in particular path
  // separators and `..`.
  if (!/^[a-z][a-z0-9_-]*$/i.test(lang)) {
    throw new Error(`Invalid language code: ${lang}`);
  }

  // `-p` makes this a no-op when the directory already exists.
  await $`mkdir -p ${DOWNLOAD_DIR}`;

  const archivePath = join(DOWNLOAD_DIR, ARCHIVE(lang));
  const filePath = join(DOWNLOAD_DIR, FILE(lang));

  // Nothing to do when the decompressed file is already available.
  if (await exists(filePath)) {
    return filePath;
  }

  // Download the archive if it is not already on disk. The download goes to
  // a temporary name and is renamed only on success, so an interrupted
  // transfer is never mistaken for a complete archive on the next run.
  if (!(await exists(archivePath))) {
    const url = URL(lang, version);
    console.log(`Downloading ${url}`);
    const partialArchive = `${archivePath}.part`;
    try {
      await $`wget ${url} -O ${partialArchive}`;
    } catch (err: unknown) {
      await $`rm -f ${partialArchive}`;
      throw err;
    }
    await $`mv ${partialArchive} ${archivePath}`;
  }

  // Extract the archive, again via a temporary name so that a failed or
  // interrupted extraction does not leave a truncated `.ttl` file behind.
  console.log(`Extracting ${archivePath}`);
  const partialFile = `${filePath}.part`;
  try {
    await $`bzip2 -dc ${archivePath} >${partialFile}`;
  } catch (err: unknown) {
    await $`rm -f ${partialFile}`;
    throw err;
  }
  await $`mv ${partialFile} ${filePath}`;

  return filePath;
}

0 comments on commit d3b3df2

Please sign in to comment.