Skip to content

Commit

Permalink
Merge pull request #29 from MarvNC/make-freq
Browse files Browse the repository at this point in the history
  • Loading branch information
MarvNC authored Feb 3, 2024
2 parents 76453d3 + 000fdd5 commit 15eb168
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 10 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
*.txt
csvs
/images/*.*
/compressedImages/*.*
/images
/compressedImages
/freqjsons

# Created by https://www.toptal.com/developers/gitignore/api/node
# Edit at https://www.toptal.com/developers/gitignore?templates=node
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"download": "node src/downloadLatest.js && sh src/scripts/download_unzip.sh",
"buildTermDict": "node src/convertToTermDictionary.js",
"buildLatest": "npm run download && npm run buildTermDict",
"buildFreq": "node src/convertToFrequencyDictionary.js",
"test": "ava"
},
"dependencies": {
Expand All @@ -18,4 +19,4 @@
"ava": "^6.0.1"
},
"version": "1.0.0"
}
}
3 changes: 2 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ more Yomitan dictionaries and tools, see

## Download

🚧 Coming soon 🚧
- 🚧 Words.hk for Yomitan (TBA)
- [Words.hk Frequency](https://drive.google.com/open?id=14kx0q9EBftwqaZPw55y9USkXFQlUrjf1&usp=drive_fs)

<!-- ## Screenshots -->

Expand Down
4 changes: 2 additions & 2 deletions src/constants.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* @type {Record<Language, { name: string, shortName: string, langCode: string }>}
*/
const languages = {
const LANGUAGES_DATA = {
yue: {
name: '廣東話',
shortName: '粵',
Expand Down Expand Up @@ -79,7 +79,7 @@ const COMPRESSED_IMAGES_FOLDER = './compressedImages';
const IMAGE_RESIZE_WIDTH = 400;

export {
languages,
LANGUAGES_DATA,
IMAGE_FOLDER,
COMPRESSED_IMAGES_FOLDER,
IMAGE_RESIZE_WIDTH,
Expand Down
82 changes: 82 additions & 0 deletions src/convertToFrequencyDictionary.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/**
* Requires the jsons downloaded from https://words.hk/faiman/analysis/
* to be in the freqjsons directory
*/
import fs from 'fs';
import path from 'path';
import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder';
const freqJsonsDir = 'freqjsons';
const charCountJson = 'charcount.json';
const existingWordCountJson = 'existingwordcount.json';

/**
 * Reads and parses one JSON file from the frequency data directory.
 *
 * @param {string} fileName - File name, resolved relative to `freqJsonsDir`.
 * @returns {any} The parsed JSON content.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readFreqJson(fileName) {
  return JSON.parse(
    fs.readFileSync(path.join(freqJsonsDir, fileName)).toString()
  );
}

/**
 * Turns a count table into ranked frequency meta entries.
 *
 * Entries are sorted by count in descending order; the 1-based position in
 * that order becomes the frequency value, and the raw count is kept in the
 * display string as `rank (count)`.
 *
 * @param {Record<string, number>} counts - Key-to-count table. NOTE(review):
 *   the input files are presumed to be flat objects of numeric counts —
 *   confirm against the downloaded data.
 * @returns {[string, 'freq', { displayValue: string, value: number }][]}
 *   One meta entry per key, in rank order.
 */
function buildFreqMetaEntries(counts) {
  return Object.entries(counts)
    .sort(([, a], [, b]) => b - a)
    .map(([key, occurrences], index) => {
      const rank = index + 1;
      return [
        key,
        'freq',
        {
          displayValue: `${rank} (${occurrences})`,
          value: rank,
        },
      ];
    });
}

/**
 * Builds the "Words.hk Frequency" dictionary from the character and word
 * count JSON files in `freqJsonsDir` and exports it to the `dist` directory.
 *
 * @returns {Promise<void>} Resolves once the dictionary has been exported.
 */
async function main() {
  const freqJsons = fs.readdirSync(freqJsonsDir);
  const charCountData = readFreqJson(charCountJson);
  const existingWordCountData = readFreqJson(existingWordCountJson);
  console.log(`Read ${freqJsons.length} files from ${freqJsonsDir}`);
  console.log(
    `Read ${Object.keys(charCountData).length} characters from ${charCountJson}`
  );
  console.log(
    `Read ${
      Object.keys(existingWordCountData).length
    } words from ${existingWordCountJson}`
  );

  const dictionary = new Dictionary({
    fileName: 'Words.hk Frequency.zip',
  });
  // Multi-line metadata is assembled with explicit newlines so the stored
  // text does not depend on how this source file happens to be indented.
  const attribution = [
    'Words.hk & contributors (https://words.hk)',
    'See license at https://words.hk/base/hoifong/',
  ].join('\n');
  const description = [
    'Converted from the free Words.hk dictionary found at https://words.hk/.',
    'Converted using https://github.com/MarvNC/yomichan-dict-builder',
  ].join('\n');
  const dictionaryIndex = new DictionaryIndex()
    .setAuthor('Marv')
    .setAttribution(attribution)
    .setUrl('https://github.com/MarvNC/wordshk-yomitan')
    .setDescription(description)
    .setTitle(`Words.hk Frequency`)
    .setRevision(`1.0`);
  await dictionary.setIndex(dictionaryIndex.build());

  // Add characters to kanji meta. Entries are awaited one at a time so they
  // are added in rank order.
  for (const kanjiMeta of buildFreqMetaEntries(charCountData)) {
    await dictionary.addKanjiMeta(kanjiMeta);
  }

  // Add words to term meta, again in rank order.
  for (const termMeta of buildFreqMetaEntries(existingWordCountData)) {
    await dictionary.addTermMeta(termMeta);
  }

  await dictionary.export('dist');
  console.log(`Exported dictionary to dist.`);
}

// Never leave the promise floating: report the failure and signal it to the
// caller (e.g. `npm run buildFreq`) through a non-zero exit code.
main().catch((error) => {
  console.error('Failed to build the Words.hk frequency dictionary:', error);
  process.exitCode = 1;
});
4 changes: 2 additions & 2 deletions src/util/entryParse/parseEntryToJson.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { languages } from '../../constants.js';
import { LANGUAGES_DATA } from '../../constants.js';

/**
*
Expand Down Expand Up @@ -158,7 +158,7 @@ function parseLanguageData(text) {
continue;
}
// Check if the language is a possible language
if (!languages[matchedLang]) {
if (!LANGUAGES_DATA[matchedLang]) {
throw new Error(`Invalid language: ${matchedLang}`);
}
// Else a language is found
Expand Down
4 changes: 2 additions & 2 deletions src/util/yomitan/convertSenseToSC.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { languages } from '../../constants.js';
import { LANGUAGES_DATA } from '../../constants.js';
import { isStringSentence } from '../textHandling/textUtils.js';
import { convertTextToSC } from './parseTextToSC.js';

Expand Down Expand Up @@ -187,7 +187,7 @@ function convertLanguageEntryToListItems(
* @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
*/
const languageLiScArray = [];
const languageInfo = languages[language];
const languageInfo = LANGUAGES_DATA[language];
for (const languageText of languageTexts) {
/**
* @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
Expand Down

0 comments on commit 15eb168

Please sign in to comment.