Merge pull request #177 from ku-nlp/v2.1.0

v2.1.0
ku-nlp · Jun 2, 2023 · b06245f · b06245f
2 parents 832e0c6 + dd9d8ea
commit b06245f
Show file tree

Hide file tree

Showing 155 changed files with 5,233 additions and 3,294 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,56 @@
+name: Build
+
+on: [ push, pull_request ]  # run on every push and on every pull request
+
+jobs:
+  build:
+    name: Build the project
+    runs-on: ${{ matrix.os }}  # one job per OS listed in the matrix below
+    strategy:
+      max-parallel: 4  # at most four matrix jobs run at the same time
+      fail-fast: false  # keep the remaining matrix jobs running when one fails
+      matrix:
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]  # quoted so "3.10" stays a string, not the float 3.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install Poetry
+        run: |
+            pipx install poetry
+            echo "$HOME/.local/bin" >> $GITHUB_PATH
+      - name: Install dependencies
+        run: |
+          poetry install --no-interaction --without dev,test
+      - name: Build KWJA
+        run: |
+          poetry build
+      - name: Install KWJA from wheel (non-Windows)
+        if: ${{ matrix.os != 'windows-latest' }}  # Linux and macOS runners only
+        run: |
+          pip3 install dist/*.whl
+      - name: Install KWJA from wheel (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}  # Windows runners only
+        run: |
+          $whlFile = (Get-ChildItem -Path dist -Filter *.whl).FullName
+          pip3 install $whlFile
+        shell: pwsh  # the wheel lookup above is written in PowerShell syntax
+      - name: Run KWJA (non-Windows)
+        if: ${{ matrix.os != 'windows-latest' }}  # Linux and macOS runners only
+        run: |
+          kwja --version
+          kwja --help
+          kwja --tasks typo,senter,char,word --model-size tiny --text "自然言語処理"
+          kwja --tasks typo,senter,seq2seq,word --model-size tiny --text "自然言語処理"
+      - name: Run KWJA (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}  # Windows runners only
+        run: |
+          $env:PYTHONUTF8 = "1"
+          kwja --version
+          kwja --help
+          kwja --tasks typo,senter,char,word --model-size tiny --text "自然言語処理"
+          kwja --tasks typo,senter,seq2seq,word --model-size tiny --text "自然言語処理"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -6,24 +6,18 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     strategy:
-      max-parallel: 3
+      max-parallel: 4
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install pre-commit
-        run: |
-          python3 -m pip install --user pipx
-          python3 -m pipx ensurepath
-          python3 -m pipx install pre-commit
-      - name: Add path for Python packages
-        run: echo "$HOME/.local/bin" >> $GITHUB_PATH
-      - name: Run linters
+      - name: Install pre-commit and run linters
         run: |
+          pipx install pre-commit
           pre-commit run --all-files
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -14,11 +14,11 @@ jobs:
         uses: actions/checkout@v3
       - name: Create Release
         id: create_release
-        uses: actions/create-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
+        uses: softprops/action-gh-release@v1
+        if: startsWith(github.ref, 'refs/tags/')
         with:
+          body: |
+            [Changelog](https://github.com/ku-nlp/kwja/blob/main/CHANGELOG.md)
           tag_name: ${{ github.ref }}
-          release_name: Release ${{ github.ref }}
           draft: false
           prerelease: false
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -7,24 +7,21 @@ jobs:
     name: Run tests with pytest
     runs-on: ubuntu-latest
     strategy:
-      max-parallel: 3
+      max-parallel: 4
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install Poetry
         run: |
-          python3 -m pip install --user pipx
-          python3 -m pipx ensurepath
-          python3 -m pipx install poetry
-      - name: Add path for Python packages
-        run: echo "$HOME/.local/bin" >> $GITHUB_PATH
+            pipx install poetry
+            echo "$HOME/.local/bin" >> $GITHUB_PATH
       - name: Install dependencies
         run: |
           poetry config virtualenvs.create false
@@ -42,3 +39,25 @@ jobs:
           root_dir: true
           name: codecov-umbrella
           verbose: true
+
+  test-cli:
+    name: Install KWJA from pip and run CLI
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 4
+      fail-fast: false
+      matrix:
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install KWJA from pip
+        run: pip3 install .
+      - name: Run CLI
+        run: |
+          kwja --version
+          kwja --help
diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml
@@ -9,9 +9,10 @@ on:
       - reopened
 
 jobs:
-  build:
+  typos:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - name: typos-action
-        uses: crate-ci/[email protected]
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Check spelling
+        uses: crate-ci/typos@master
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       - id: trailing-whitespace
       - id: check-yaml
   - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.3.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8
@@ -20,17 +20,19 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.1.1
+    rev: v1.3.0
     hooks:
       - id: mypy
         additional_dependencies:
-          - rhoknp==1.2.1
+          - rhoknp==1.3.0
           - hydra-core==1.3.2
-          - torch==1.13.1
+          - torch==2.0.0
           - torchmetrics==0.11.4
-          - transformers==4.25.1
-          - tokenizers==0.13.2
-          - wandb==0.13.11
+          - transformers==4.28.1
+          - tokenizers==0.13.3
+          - wandb==0.15.1
+          - typer==0.9.0
+          - types-PyYAML==6.0.12.9
   - repo: https://github.com/jumanjihouse/pre-commit-hooks
     rev: 3.0.0
     hooks:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [2.1.0] - 2023-06-02
+### Added
+- Support Python 3.11.
+- Support NN-based sentence segmentation.
+  ```shell
+  kwja --tasks senter --text "モーニング娘。は日本のアイドルグループです。"
+  ```
+- Support multiple files as input.
+  ```shell
+  kwja --filename file1.txt --filename file2.txt
+  ```
+- Introduce a config file. You can specify some options in `XDG_CONFIG_HOME/kwja/config.yaml`.
+  ```yaml
+  model_size: base
+  device: cpu
+  num_workers: 0
+  torch_compile: false
+  typo_batch_size: 1
+  senter_batch_size: 1
+  seq2seq_batch_size: 1
+  char_batch_size: 1
+  word_batch_size: 1
+  ```
+- Implement padding truncation of word module to accelerate inference.
+- Support Windows.
+
+### Changed
+- Support CUDA 11.7 by default instead of CUDA 10.x.
+- Skip typo correction by default.
+- Optimize package requirements for faster loading.
+- Optimize model initialization for faster loading.
+- Replace mt5 models with t5 models pre-trained on Japanese corpora in seq2seq module.
+- Use partially annotated data for word normalization to train seq2seq module.
+
+### Removed
+- Remove the discourse module.
+
+### Fixed
+- Fix a bug where warning messages are shown when Juman++ and/or KNP are not installed.
+- Fix a bug where document IDs are not assigned properly when a text file is given as input.
+
 ## [2.0.0] - 2023-03-14
 
 ### Added
@@ -118,7 +159,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed
 - Remove an unnecessary dependency, `fugashi`.
 
-[Unreleased]: https://github.com/ku-nlp/kwja/compare/v2.0.0...HEAD
+[Unreleased]: https://github.com/ku-nlp/kwja/compare/v2.1.0...HEAD
+[2.1.0]: https://github.com/ku-nlp/kwja/compare/v2.0.0...v2.1.0
 [2.0.0]: https://github.com/ku-nlp/kwja/compare/v1.4.2...v2.0.0
 [1.4.2]: https://github.com/ku-nlp/kwja/compare/v1.4.1...v1.4.2
 [1.4.1]: https://github.com/ku-nlp/kwja/compare/v1.4.0...v1.4.1

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -64,39 +64,38 @@ Options:
 "build_datasets.sh" performs formatting KWDLC and annotated FKC corpus.
 
 ```shell
-./scripts/build_datasets.sh
-  -a $(poetry run echo $VIRTUAL_ENV)/bin/activate
-  -w /path/to/work_dir
-  -s $(realpath ./scripts)
-  -j 2
-  -o /path/to/output_dir
+./scripts/build_datasets.sh \
+  --jobs 2 \
+  --out-dir /path/to/output_dir
 ```
 
 Options:
 
-- `-a`: path to activator
-- `-w`: path to working directory
-- `-s`: path to scripts
-- `-j`: number of jobs
-- `-o`: path to output directory
+- `--jobs`: number of jobs
+- `--out-dir`: path to output directory
 
 NOTE:
 To train the word module on the Kyoto University Text Corpus, you must have access to both the corpus and the IREX CRL named entity data.
 If you have access to both, you can format the corpus with the following commands.
 (You may need to preprocess the IREX CRL named entity data before formatting.)
 
 ```shell
-poetry run python scripts/add_features_to_raw_corpus.py
-  KyotoCorpus/knp
-  kyoto/knp
-  --ne-tags IREX_CRL_NE_data.jmn
+poetry run python scripts/build_dataset.py \
+  ./KyotoCorpus/knp \
+  ./kyoto/knp \
+  --ne-tags ./IREX_CRL_NE_data.jmn \
   -j 2
 poetry run kyoto idsplit \
   --corpus-dir kyoto/knp \
   --output-dir kyoto \
   --train KyotoCorpus/id/full/train.id \
   --valid KyotoCorpus/id/full/dev.id \
   --test KyotoCorpus/id/full/test.id
+poetry run python scripts/build_dataset.py \
+  ./KyotoCorpus/knp \
+  ./kyoto_ed \
+  --id ./KyotoCorpus/id/syntax-only \
+  -j 32
 ```
 
 ## Training and evaluation
@@ -117,16 +116,20 @@ poetry run python scripts/test.py module=char checkpoint_path="/path/to/checkpoi
 
 ## Debugging
 
-You can do debugging on local and server environments:
 
-Local environment (using CPU):
+```shell
+# For debugging word segmenter
+poetry run python scripts/train.py -cn char_module.debug
+```
+
+If you are on a machine with MPS devices (e.g. Apple M1), specify `trainer=cpu.debug` to use CPU.
 
 ```shell
 # For debugging word segmenter
-poetry run python scripts/train.py -cn char_module.debug devices=1
+poetry run python scripts/train.py -cn char_module.debug trainer=cpu.debug
 ```
 
-Server environment (using GPU):
+If you are on a machine with GPUs, you can specify the GPUs to use with the `devices` option.
 
 ```shell
 # For debugging word segmenter