From e4ffa50404cad9fcf2f95b5f1588da2b52035d6f Mon Sep 17 00:00:00 2001 From: MarvNC Date: Thu, 1 Feb 2024 00:31:21 -0800 Subject: [PATCH 1/3] Add frequency dictionary convert script --- .gitignore | 5 +- package.json | 3 +- src/convertToFrequencyDictionary.js | 82 +++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 src/convertToFrequencyDictionary.js diff --git a/.gitignore b/.gitignore index 32df546..57fa4a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ *.txt csvs -/images/*.* -/compressedImages/*.* +/images +/compressedImages +/freqjsons # Created by https://www.toptal.com/developers/gitignore/api/node # Edit at https://www.toptal.com/developers/gitignore?templates=node diff --git a/package.json b/package.json index e7e9a79..3d44df0 100644 --- a/package.json +++ b/package.json @@ -3,6 +3,7 @@ "download": "node src/downloadLatest.js && sh src/scripts/download_unzip.sh", "buildTermDict": "node src/convertToTermDictionary.js", "buildLatest": "npm run download && npm run buildTermDict", + "buildFreq": "node src/convertToFrequencyDictionary.js", "test": "ava" }, "dependencies": { @@ -18,4 +19,4 @@ "ava": "^6.0.1" }, "version": "1.0.0" -} +} \ No newline at end of file diff --git a/src/convertToFrequencyDictionary.js b/src/convertToFrequencyDictionary.js new file mode 100644 index 0000000..4432795 --- /dev/null +++ b/src/convertToFrequencyDictionary.js @@ -0,0 +1,82 @@ +/** + * Requires the jsons downloaded from https://words.hk/faiman/analysis/ + * to be in the freqjsons directory + */ +import fs from 'fs'; +import path from 'path'; +import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder'; +const freqJsonsDir = 'freqjsons'; +const charCountJson = 'charcount.json'; +const existingWordCountJson = 'existingwordcount.json'; + +(async () => { + const freqJsons = fs.readdirSync(freqJsonsDir); + const charCountData = JSON.parse( + fs.readFileSync(path.join(freqJsonsDir, charCountJson)).toString() + ); + const existingWordCountData = JSON.parse( + fs.readFileSync(path.join(freqJsonsDir, existingWordCountJson)).toString() + ); + console.log(`Read ${freqJsons.length} files from ${freqJsonsDir}`); + console.log( + `Read ${Object.keys(charCountData).length} characters from ${charCountJson}` + ); + console.log( + `Read ${ + Object.keys(existingWordCountData).length + } words from ${existingWordCountJson}` + ); + + const dictionary = new Dictionary({ + fileName: 'Words.hk Frequency.zip', + }); + const dictionaryIndex = new DictionaryIndex() + .setAuthor('Marv') + .setAttribution( + `Words.hk & contributers (https://words.hk) + See license at https://words.hk/base/hoifong/` + ) + .setUrl('https://github.com/MarvNC/wordshk-yomitan') + .setDescription( + `Converted from the free Words.hk dictionary found at https://words.hk/. + Converted using https://github.com/MarvNC/yomichan-dict-builder` + ) + .setTitle(`Words.hk Frequency`) + .setRevision(`1.0`); + await dictionary.setIndex(dictionaryIndex.build()); + + // Add characters to kanji meta + const sortedCharCountData = Object.entries(charCountData).sort( + ([, a], [, b]) => b - a + ); + for (let i = 0; i < sortedCharCountData.length; i++) { + const [char, occurrences] = sortedCharCountData[i]; + await dictionary.addKanjiMeta([ + char, + 'freq', + { + displayValue: `${i + 1} (${occurrences})`, + value: i + 1, + }, + ]); + } + + // Add words to dictionary + const sortedExistingWordCountData = Object.entries( + existingWordCountData + ).sort(([, a], [, b]) => b - a); + for (let i = 0; i < sortedExistingWordCountData.length; i++) { + const [word, occurrences] = sortedExistingWordCountData[i]; + await dictionary.addTermMeta([ + word, + 'freq', + { + displayValue: `${i + 1} (${occurrences})`, + value: i + 1, + }, + ]); + } + + await dictionary.export('dist'); + console.log(`Exported dictionary to dist.`); +})(); From 488072ad5d0a1305d1e6d089fe6283e4d6716e6d Mon Sep 17 00:00:00 2001 From: MarvNC Date: Fri, 2 Feb 2024 21:38:41 -0800 Subject: [PATCH 2/3] Rename `languages` to `LANGUAGES_DATA` --- src/constants.js | 4 ++-- src/util/entryParse/parseEntryToJson.js | 4 ++-- src/util/yomitan/convertSenseToSC.js | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/constants.js b/src/constants.js index 40ed719..3a273d8 100644 --- a/src/constants.js +++ b/src/constants.js @@ -1,7 +1,7 @@ /** * @type {Record} */ -const languages = { +const LANGUAGES_DATA = { yue: { name: '廣東話', shortName: '粵', @@ -79,7 +79,7 @@ const COMPRESSED_IMAGES_FOLDER = './compressedImages'; const IMAGE_RESIZE_WIDTH = 400; export { - languages, + LANGUAGES_DATA, IMAGE_FOLDER, COMPRESSED_IMAGES_FOLDER, IMAGE_RESIZE_WIDTH, diff --git a/src/util/entryParse/parseEntryToJson.js b/src/util/entryParse/parseEntryToJson.js index f0eb6b4..f58702c 100644 --- a/src/util/entryParse/parseEntryToJson.js +++ b/src/util/entryParse/parseEntryToJson.js @@ -1,4 +1,4 @@ -import { languages } from '../../constants.js'; +import { LANGUAGES_DATA } from '../../constants.js'; /** * @@ -158,7 +158,7 @@ function parseLanguageData(text) { continue; } // Check if the language is a possible language - if (!languages[matchedLang]) { + if (!LANGUAGES_DATA[matchedLang]) { throw new Error(`Invalid language: ${matchedLang}`); } // Else a language is found diff --git a/src/util/yomitan/convertSenseToSC.js b/src/util/yomitan/convertSenseToSC.js index b5635ca..c6cf585 100644 --- a/src/util/yomitan/convertSenseToSC.js +++ b/src/util/yomitan/convertSenseToSC.js @@ -1,4 +1,4 @@ -import { languages } from '../../constants.js'; +import { LANGUAGES_DATA } from '../../constants.js'; import { isStringSentence } from '../textHandling/textUtils.js'; import { convertTextToSC } from './parseTextToSC.js'; @@ -187,7 +187,7 @@ function convertLanguageEntryToListItems( * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} */ const languageLiScArray = []; - const languageInfo = languages[language]; + const languageInfo = LANGUAGES_DATA[language]; for (const languageText of languageTexts) { /** * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} From 000fdd57ed2e9594b87200fab4ca7580da58e3a8 Mon Sep 17 00:00:00 2001 From: MarvNC Date: Fri, 2 Feb 2024 21:51:47 -0800 Subject: [PATCH 3/3] Add to readme --- readme.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 433624b..e7c97ba 100644 --- a/readme.md +++ b/readme.md @@ -10,7 +10,8 @@ more Yomitan dictionaries and tools, see ## Download -🚧 Coming soon 🚧 +- 🚧 Words.hk for Yomitan (TBA) +- [Words.hk Frequency](https://drive.google.com/open?id=14kx0q9EBftwqaZPw55y9USkXFQlUrjf1&usp=drive_fs)