Skip to content

Commit

Permalink
Add frequency dictionary convert script
Browse files Browse the repository at this point in the history
  • Loading branch information
MarvNC committed Feb 1, 2024
1 parent 76453d3 commit e4ffa50
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 3 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
*.txt
csvs
/images/*.*
/compressedImages/*.*
/images
/compressedImages
/freqjsons

# Created by https://www.toptal.com/developers/gitignore/api/node
# Edit at https://www.toptal.com/developers/gitignore?templates=node
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"download": "node src/downloadLatest.js && sh src/scripts/download_unzip.sh",
"buildTermDict": "node src/convertToTermDictionary.js",
"buildLatest": "npm run download && npm run buildTermDict",
"buildFreq": "node src/convertToFrequencyDictionary.js",
"test": "ava"
},
"dependencies": {
Expand All @@ -18,4 +19,4 @@
"ava": "^6.0.1"
},
"version": "1.0.0"
}
}
82 changes: 82 additions & 0 deletions src/convertToFrequencyDictionary.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/**
* Builds the Words.hk frequency dictionary.
* Requires the JSON files downloaded from https://words.hk/faiman/analysis/
* (charcount.json and existingwordcount.json) to be in the freqjsons directory.
*/
import fs from 'fs';
import path from 'path';
import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder';
const freqJsonsDir = 'freqjsons';
const charCountJson = 'charcount.json';
const existingWordCountJson = 'existingwordcount.json';

/**
 * Reads and parses one JSON file from the frequency JSON directory.
 *
 * @param {string} fileName - File name relative to `freqJsonsDir`.
 * @returns {any} The parsed JSON content.
 * @throws {Error} If the file cannot be read or does not contain valid JSON.
 */
function readFreqJson(fileName) {
  const filePath = path.join(freqJsonsDir, fileName);
  return JSON.parse(fs.readFileSync(filePath).toString());
}

/**
 * Turns a `{ key: count }` object into frequency meta entries ranked by
 * descending count (rank 1 = highest count).
 *
 * NOTE(review): the comparator subtracts the values, so counts are presumably
 * numbers — confirm against the downloaded JSON files.
 *
 * @param {Record<string, number>} counts - Occurrence count per key.
 * @returns {Array<[string, 'freq', { displayValue: string, value: number }]>}
 *   One `[key, 'freq', data]` tuple per key, in rank order.
 */
function buildFrequencyEntries(counts) {
  return Object.entries(counts)
    .sort(([, a], [, b]) => b - a)
    .map(([key, occurrences], index) => {
      const rank = index + 1;
      return [
        key,
        'freq',
        {
          displayValue: `${rank} (${occurrences})`,
          value: rank,
        },
      ];
    });
}

/**
 * Reads the character and word count JSONs, converts them into ranked
 * frequency entries and exports them as `Words.hk Frequency.zip` into `dist`.
 *
 * @returns {Promise<void>} Resolves once the dictionary has been exported.
 */
async function main() {
  // The directory listing is only used for the log line below; the two
  // count files are read explicitly by name.
  const freqJsons = fs.readdirSync(freqJsonsDir);
  const charCountData = readFreqJson(charCountJson);
  const existingWordCountData = readFreqJson(existingWordCountJson);

  console.log(`Read ${freqJsons.length} files from ${freqJsonsDir}`);
  console.log(
    `Read ${Object.keys(charCountData).length} characters from ${charCountJson}`
  );
  console.log(
    `Read ${
      Object.keys(existingWordCountData).length
    } words from ${existingWordCountJson}`
  );

  const dictionary = new Dictionary({
    fileName: 'Words.hk Frequency.zip',
  });

  // Multi-line metadata is joined with explicit newlines so the emitted text
  // does not depend on how this source file happens to be indented.
  const attribution = [
    'Words.hk & contributors (https://words.hk)',
    'See license at https://words.hk/base/hoifong/',
  ].join('\n');
  const description = [
    'Converted from the free Words.hk dictionary found at https://words.hk/.',
    'Converted using https://github.com/MarvNC/yomichan-dict-builder',
  ].join('\n');

  const dictionaryIndex = new DictionaryIndex()
    .setAuthor('Marv')
    .setAttribution(attribution)
    .setUrl('https://github.com/MarvNC/wordshk-yomitan')
    .setDescription(description)
    .setTitle('Words.hk Frequency')
    .setRevision('1.0');
  await dictionary.setIndex(dictionaryIndex.build());

  // Add characters to kanji meta
  for (const entry of buildFrequencyEntries(charCountData)) {
    await dictionary.addKanjiMeta(entry);
  }

  // Add words to term meta
  for (const entry of buildFrequencyEntries(existingWordCountData)) {
    await dictionary.addTermMeta(entry);
  }

  await dictionary.export('dist');
  console.log(`Exported dictionary to dist.`);
}

// Never leave the promise floating: report the failure and signal it through
// the exit code so `npm run buildFreq` fails visibly.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

0 comments on commit e4ffa50

Please sign in to comment.