From e4ffa50404cad9fcf2f95b5f1588da2b52035d6f Mon Sep 17 00:00:00 2001
From: MarvNC
Date: Thu, 1 Feb 2024 00:31:21 -0800
Subject: [PATCH] Add frequency dictionary convert script

---
 .gitignore                          |   5 +-
 package.json                        |   3 +-
 src/convertToFrequencyDictionary.js | 111 ++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 src/convertToFrequencyDictionary.js

diff --git a/.gitignore b/.gitignore
index 32df546..57fa4a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
 *.txt
 csvs
-/images/*.*
-/compressedImages/*.*
+/images
+/compressedImages
+/freqjsons
 
 # Created by https://www.toptal.com/developers/gitignore/api/node
 # Edit at https://www.toptal.com/developers/gitignore?templates=node
diff --git a/package.json b/package.json
index e7e9a79..3d44df0 100644
--- a/package.json
+++ b/package.json
@@ -3,6 +3,7 @@
     "download": "node src/downloadLatest.js && sh src/scripts/download_unzip.sh",
     "buildTermDict": "node src/convertToTermDictionary.js",
     "buildLatest": "npm run download && npm run buildTermDict",
+    "buildFreq": "node src/convertToFrequencyDictionary.js",
     "test": "ava"
   },
   "dependencies": {
@@ -18,4 +19,4 @@
     "ava": "^6.0.1"
   },
   "version": "1.0.0"
-}
+}
\ No newline at end of file
diff --git a/src/convertToFrequencyDictionary.js b/src/convertToFrequencyDictionary.js
new file mode 100644
index 0000000..4432795
--- /dev/null
+++ b/src/convertToFrequencyDictionary.js
@@ -0,0 +1,111 @@
+/**
+ * Builds the Words.hk frequency dictionary for Yomitan.
+ *
+ * Requires the jsons downloaded from https://words.hk/faiman/analysis/
+ * to be in the freqjsons directory
+ */
+import fs from 'fs';
+import path from 'path';
+import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder';
+
+const freqJsonsDir = 'freqjsons';
+const charCountJson = 'charcount.json';
+const existingWordCountJson = 'existingwordcount.json';
+
+/**
+ * Reads and parses one count file from the freqjsons directory.
+ * @param {string} fileName File name inside freqJsonsDir.
+ * @returns {Record<string, number>} The parsed object. The ranking below
+ *   assumes it maps each entry to a numeric occurrence count.
+ * @throws {Error} If the file cannot be read, is not valid JSON, or does not
+ *   hold a JSON object at the top level.
+ */
+function readCountJson(fileName) {
+  const filePath = path.join(freqJsonsDir, fileName);
+  const data = JSON.parse(fs.readFileSync(filePath, 'utf8'));
+  if (data === null || typeof data !== 'object' || Array.isArray(data)) {
+    throw new Error(`Expected ${filePath} to contain a JSON object of counts`);
+  }
+  return data;
+}
+
+/**
+ * Ranks the entries of a count map, most frequent first, and shapes them as
+ * frequency meta entries. Rank 1 is the highest count; entries with equal
+ * counts still receive distinct consecutive ranks.
+ * @param {Record<string, number>} counts Entry to occurrence count.
+ * @returns {Array} One [entry, 'freq', { displayValue, value }] per key.
+ */
+function toFrequencyEntries(counts) {
+  return Object.entries(counts)
+    .sort(([, a], [, b]) => b - a)
+    .map(([entry, occurrences], index) => {
+      const rank = index + 1;
+      return [
+        entry,
+        'freq',
+        { displayValue: `${rank} (${occurrences})`, value: rank },
+      ];
+    });
+}
+
+/**
+ * Reads the count files, builds the dictionary and exports it to dist.
+ * @returns {Promise<void>}
+ */
+async function main() {
+  // Listing the directory first fails early when freqjsons does not exist.
+  const freqJsons = fs.readdirSync(freqJsonsDir);
+  console.log(`Found ${freqJsons.length} files in ${freqJsonsDir}`);
+
+  const charCountData = readCountJson(charCountJson);
+  const existingWordCountData = readCountJson(existingWordCountJson);
+  console.log(
+    `Read ${Object.keys(charCountData).length} characters from ${charCountJson}`
+  );
+  console.log(
+    `Read ${
+      Object.keys(existingWordCountData).length
+    } words from ${existingWordCountJson}`
+  );
+
+  const dictionary = new Dictionary({
+    fileName: 'Words.hk Frequency.zip',
+  });
+  // Join explicit lines so the stored text does not depend on how this source
+  // file is indented.
+  const dictionaryIndex = new DictionaryIndex()
+    .setAuthor('Marv')
+    .setAttribution(
+      [
+        'Words.hk & contributors (https://words.hk)',
+        'See license at https://words.hk/base/hoifong/',
+      ].join('\n')
+    )
+    .setUrl('https://github.com/MarvNC/wordshk-yomitan')
+    .setDescription(
+      [
+        'Converted from the free Words.hk dictionary found at https://words.hk/.',
+        'Converted using https://github.com/MarvNC/yomichan-dict-builder',
+      ].join('\n')
+    )
+    .setTitle('Words.hk Frequency')
+    .setRevision('1.0');
+  await dictionary.setIndex(dictionaryIndex.build());
+
+  // Characters are stored as kanji meta, words as term meta.
+  for (const entry of toFrequencyEntries(charCountData)) {
+    await dictionary.addKanjiMeta(entry);
+  }
+  for (const entry of toFrequencyEntries(existingWordCountData)) {
+    await dictionary.addTermMeta(entry);
+  }
+
+  await dictionary.export('dist');
+  console.log(`Exported dictionary to dist.`);
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exitCode = 1;
+});