Merge pull request #160 from ku-nlp/dev
v2.0.0
nobu-g authored Mar 13, 2023
2 parents 3593fcc + 8291e21 commit 3212795
Showing 265 changed files with 12,289 additions and 29,838 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
@@ -1,5 +1,7 @@
 [report]
 exclude_lines =
     pragma: no cover
+
+    # Do not complain about missing debug-only code:
     def __repr__
27 changes: 19 additions & 8 deletions .pre-commit-config.yaml
@@ -1,29 +1,40 @@
 default_language_version:
   python: python3.10
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v4.4.0
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
+      - id: check-yaml
   - repo: https://github.com/psf/black
-    rev: 22.10.0
+    rev: 23.1.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+    rev: 6.0.0
     hooks:
       - id: flake8
   - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
       - id: isort
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.982
+    rev: v1.1.1
     hooks:
       - id: mypy
         additional_dependencies:
-          - rhoknp==1.0.2
-          - hydra-core==1.3.1
-          - torch==1.12.1
-          - transformers==4.23.1
+          - rhoknp==1.2.1
+          - hydra-core==1.3.2
+          - torch==1.13.1
+          - torchmetrics==0.11.4
+          - transformers==4.25.1
+          - tokenizers==0.13.2
+          - wandb==0.13.11
+  - repo: https://github.com/jumanjihouse/pre-commit-hooks
+    rev: 3.0.0
+    hooks:
+      - id: shellcheck
+        files: ^scripts/
+        types: [ shell ]
+        args: [ --exclude=SC2002 ]
71 changes: 56 additions & 15 deletions CONTRIBUTING.md
@@ -9,54 +9,60 @@ poetry install
```

You need to prepare config files and the `.env` file:

1. Copy the base config file and edit `work_dir`:

```shell
cp configs/base_template.yaml configs/base.yaml
```

2. Create a `.env` file and set `DATA_DIR`.

```shell
echo DATA_DIR="/path/to/data_dir" >> .env
```

## Preprocessing

If you want to use the word segmenter, please prepare a word matcher in advance with the following command.
```shell
poetry run python src/kwja/preprocessors/wiki_ene_dic.py \
  --input-json-path "/path/to/wiki_ene_json_file"
```
Options:
- `--output-dir, -o`: path to the output directory. Default: `./data`
- `--save-filtered-results, -s`: whether to create an intermediate file to save the filtering results.
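
For example, the options can be combined as follows (a sketch: the JSON path is a placeholder, and `--save-filtered-results` is assumed to be a boolean flag):

```shell
poetry run python src/kwja/preprocessors/wiki_ene_dic.py \
  --input-json-path "/path/to/wiki_ene_json_file" \
  --output-dir ./data \
  --save-filtered-results
```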

For morphological analysis, you need to convert JumanDIC in advance with the following commands.

```shell
cd /path/to/JumanDIC
git checkout kwja
make kwja
```

and

```shell
-poetry run python src/kwja/preprocessors/preprocess_jumandic.py \
+poetry run python scripts/preprocessors/preprocess_jumandic.py \
  --input-dir /path/to/JumanDIC \
  --output-dir /path/to/dic_dir
```

Options:

- `--input-dir, -i`: path to the JumanDIC directory.
- `--output-dir, -o`: path to a directory where processed data are saved.
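
A typical invocation using the short options might look like this (paths are placeholders):

```shell
poetry run python scripts/preprocessors/preprocess_jumandic.py \
  -i /path/to/JumanDIC \
  -o /path/to/dic_dir
```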

-## Build dataset for training typo module
+## Building dataset for training typo module

You must preprocess the Japanese Wikipedia Typo Dataset.

```shell
-poetry run python src/kwja/preprocessors/preprocess_typo.py \
+poetry run python scripts/preprocessors/preprocess_typo.py \
  --input-dir "/path/to/unzipped_typo_dataset_dir"
```

Options:

- `--output-dir, -o`: path to the output directory. Default: `./data`
- `--num-valid-samples, -n`: number of validation examples. Default: `1000`
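
For example, making the defaults explicit (a sketch; the input path is a placeholder):

```shell
poetry run python scripts/preprocessors/preprocess_typo.py \
  --input-dir "/path/to/unzipped_typo_dataset_dir" \
  -o ./data \
  -n 1000
```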

-## Build datasets for training word module
+## Building datasets for training word module

"build_datasets.sh" performs formatting KWDLC and annotated FKC corpus.

```shell
./scripts/build_datasets.sh \
  -a $(poetry run echo $VIRTUAL_ENV)/bin/activate \
@@ -65,7 +71,9 @@ Options:
  -j 2 \
  -o /path/to/output_dir
```

Options:

- `-a`: path to the virtualenv `activate` script
- `-w`: path to working directory
- `-s`: path to scripts
@@ -76,6 +84,7 @@ NOTE:
To train the word module on the Kyoto University Text Corpus, you must have access to it and to the IREX CRL named entity data.
If you have access to both, you can format the corpus with the following commands.
(You may need preprocessing to format the IREX CRL named entity data.)

```shell
poetry run python scripts/add_features_to_raw_corpus.py \
KyotoCorpus/knp
@@ -91,35 +100,67 @@ poetry run kyoto idsplit \
```

## Training and evaluation

You can train and test the models with the following commands:

```shell
# For training and evaluating word segmenter
poetry run python scripts/train.py -cn char_module devices=[0,1]
```

If you only want to run evaluation after training, please use the following command:

```shell
# For evaluating word segmenter
poetry run python scripts/test.py module=char checkpoint_path="/path/to/checkpoint" devices=[0]
```

## Debugging

You can debug in both local and server environments:

Local environment (using CPU):

```shell
# For debugging word segmenter
poetry run python scripts/train.py -cn char_module.debug devices=1
```

Server environment (using GPU):

```shell
# For debugging word segmenter
poetry run python scripts/train.py -cn char_module.debug devices=[0]
```

-## Run unit test
+## Running unit test

```shell
poetry run pytest
```
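
If you want to run only part of the suite, the usual pytest selectors apply; nothing project-specific is assumed here:

```shell
# run only the tests whose names match a keyword expression
poetry run pytest -k "typo"
```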

## Releasing a new version

- Check out the `main` branch
- Make sure the new version is supported in the `_get_model_version` function in `src/kwja/cli/utils.py`
- Update `CHANGELOG.md`
- Edit `pyproject.toml` to update `tool.poetry.version`
- Update dependencies

```shell
poetry update
```

- Add a new tag and push changes

```shell
git tag -a v0.1.0 -m "Release v0.1.0"
git push --follow-tags
```

- If CI passes, publish to PyPI

```shell
poetry build
poetry publish [--username $PYPI_USERNAME] [--password $PYPI_PASSWORD]
```
6 changes: 4 additions & 2 deletions README.md
@@ -2,6 +2,7 @@

 [![test](https://github.com/ku-nlp/kwja/actions/workflows/test.yml/badge.svg)](https://github.com/ku-nlp/kwja/actions/workflows/test.yml)
 [![codecov](https://codecov.io/gh/ku-nlp/kwja/branch/main/graph/badge.svg?token=A9FWWPLITO)](https://codecov.io/gh/ku-nlp/kwja)
+[![CodeFactor Grade](https://img.shields.io/codefactor/grade/github/ku-nlp/kwja)](https://www.codefactor.io/repository/github/ku-nlp/kwja)
 [![PyPI](https://img.shields.io/pypi/v/kwja)](https://pypi.org/project/kwja/)
 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/kwja)

@@ -11,11 +12,12 @@
 KWJA is a Japanese language analyzer based on pre-trained language models.
 KWJA performs many language analysis tasks, including:
 - Typo correction
-- Tokenization
+- Word segmentation
 - Word normalization
 - Morphological analysis
-- Named entity recognition
 - Word feature tagging
+- NER (Named Entity Recognition)
+- Base phrase feature tagging
 - Dependency parsing
 - PAS analysis
 - Bridging reference resolution
2 changes: 1 addition & 1 deletion configs/base_template.yaml
@@ -19,7 +19,7 @@ seed: null
 name: ${hydra:job.config_name}-${hydra:job.override_dirname}

 exp_dir: ${work_dir}/result/${name}
-run_id: ${seed}
+run_id: ${now:%m%d}_${now:%H%M%S}
 run_dir: ${exp_dir}/${run_id}
 config_name: ${hydra:job.config_name}

3 changes: 1 addition & 2 deletions configs/callbacks/char_module_writer.yaml
@@ -1,4 +1,3 @@
 prediction_writer:
   _target_: kwja.callbacks.char_module_writer.CharModuleWriter
-  output_dir: ${run_dir}
-  pred_filename: "predict"
+  destination: ${run_dir}/char_prediction.juman
4 changes: 4 additions & 0 deletions configs/callbacks/lr_monitor.yaml
@@ -0,0 +1,4 @@
+lr_monitor:
+  _target_: pytorch_lightning.callbacks.LearningRateMonitor
+  logging_interval: null # "epoch", "step", or "null"
+  log_momentum: false
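
This callback is instantiated by Hydra via its `_target_`. A hedged sketch of enabling it for a run, assuming the `callbacks` config group is packaged under `callbacks` and accepts the list shown in `configs/char_module.debug.yaml` below:

```shell
# Sketch: extend the callbacks list with lr_monitor and log the LR each epoch.
# The group and list names are assumptions based on the configs in this commit.
poetry run python scripts/train.py -cn char_module.debug \
  'callbacks=[early_stopping,model_checkpoint,model_summary,char_module_writer,progress_bar,lr_monitor]' \
  'callbacks.lr_monitor.logging_interval=epoch'
```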
3 changes: 3 additions & 0 deletions configs/callbacks/senter_module_writer.yaml
@@ -0,0 +1,3 @@
+prediction_writer:
+  _target_: kwja.callbacks.senter_module_writer.SenterModuleWriter
+  destination: ${run_dir}/senter_prediction.txt
8 changes: 8 additions & 0 deletions configs/callbacks/seq2seq_module_writer.yaml
@@ -0,0 +1,8 @@
+prediction_writer:
+  _target_: kwja.callbacks.seq2seq_module_writer.Seq2SeqModuleWriter
+  destination: ${run_dir}/seq2seq_prediction.txt
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: ${encoder.pretrained_model_name_or_path}
+    additional_special_tokens: ${special_tokens}
+  _convert_: all
12 changes: 7 additions & 5 deletions configs/callbacks/typo_module_writer.yaml
@@ -1,8 +1,10 @@
 prediction_writer:
   _target_: kwja.callbacks.typo_module_writer.TypoModuleWriter
-  output_dir: ${run_dir}
   extended_vocab_path: ${dataset.extended_vocab_path}
   confidence_threshold: ${confidence_threshold}
-  pred_filename: "predict"
-  model_name_or_path: ${encoder.pretrained_model_name_or_path}
-  tokenizer_kwargs: ${dataset.tokenizer_kwargs}
+  destination: ${run_dir}/typo_prediction.txt
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: ${encoder.pretrained_model_name_or_path}
+    do_word_tokenize: false
+    additional_special_tokens: ${special_tokens}
+  _convert_: all
5 changes: 1 addition & 4 deletions configs/callbacks/word_module_writer.yaml
@@ -1,13 +1,10 @@
 prediction_writer:
   _target_: kwja.callbacks.word_module_writer.WordModuleWriter
-  output_dir: ${run_dir}
-  reading_resource_path: ${dataset.reading_resource_path}
-  pred_filename: "predict"
-  jumandic_path: kwja/resource/jumandic
   ambig_surf_specs:
     - conjtype: "イ形容詞アウオ段"
       conjform: "エ基本形"
     - conjtype: "イ形容詞イ段"
       conjform: "エ基本形"
     - conjtype: "イ形容詞イ段特殊"
       conjform: "エ基本形"
+  destination: ${run_dir}/word_prediction.knp
26 changes: 16 additions & 10 deletions configs/char_module.debug.yaml
@@ -2,35 +2,41 @@ defaults:
   - base
   - callbacks: [early_stopping, model_checkpoint, model_summary, char_module_writer, progress_bar]
   - datamodule: char
+  - dataset: char
   - logger: null
-  - encoder: char_roberta_base
+  - encoder: char_deberta_tiny
   - module: char
   - optimizer: adamw
   - scheduler: constant_schedule_with_warmup
   - trainer: debug
   - _self_

-max_seq_length: 128
+max_seq_length: 256
 do_predict_after_train: true
 checkpoint_path: ""
-document_split_stride: 1
+document_split_stride: -1

 # For word normalization
-denormalize_prob: 0.0
+denormalize_probability: 0.5

 # set monitor and mode for early_stopping and model_checkpoint
-monitor: valid/f1
+monitor: valid/aggregated_char_metrics
 mode: max
+aggregating_metrics:
+  - word_segmentation_f1
+  # - word_normalization_f1 # exclude because this metric is unstable

 # hyper-parameters to be tuned
-warmup_steps: 100
-lr: 2e-5
+max_epochs: 2
+warmup_steps: 10
+effective_batch_size: 16

 # environment dependent settings
-devices: ${oc.env:GPUS,0}
-max_batches_per_device: 2
-num_workers: 0
+devices: ${oc.env:DEVICES,0}
+max_batches_per_device: ${oc.env:MAX_BATCHES_PER_DEVICE,2}
+num_workers: ${oc.env:NUM_WORKERS,0}

 ignore_hparams_on_save: false

 # constants
 hparams_to_ignore_on_save: []
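
Because `devices`, `max_batches_per_device`, and `num_workers` resolve through OmegaConf's `oc.env` resolver with the defaults shown above, the environment-dependent settings can be supplied per machine without editing the config. A sketch using the variable names from this file (the values are placeholders):

```shell
# DEVICES, MAX_BATCHES_PER_DEVICE, and NUM_WORKERS are read by the
# oc.env resolvers in char_module.debug.yaml; values here are examples.
DEVICES='[0]' MAX_BATCHES_PER_DEVICE=4 NUM_WORKERS=2 \
  poetry run python scripts/train.py -cn char_module.debug
```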