diff --git a/.github/workflows/haskell.yml b/.github/workflows/haskell.yml index 87334ec4..51999d8a 100644 --- a/.github/workflows/haskell.yml +++ b/.github/workflows/haskell.yml @@ -123,15 +123,14 @@ jobs: ghc_version: 9.6.0.20230128 ghcup_release_channel: "https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-prereleases-0.0.7.yaml" runner: ubuntu-latest - cabal_version: 3.8.1.0 + cabal_version: 3.9.0.0 - # [TODO] Use latest cabal (pre-)release - name: head ghc_version: head - # ghcup_release_channel: "https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-prereleases-0.0.7.yaml" + ghcup_release_channel: "https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-prereleases-0.0.7.yaml" runner: ubuntu-latest - # cabal_version: 3.9.0.0 - cabal_version: latest + # [WARNING] Make sure to use the latest cabal (pre-)release + cabal_version: 3.9.0.0 - name: hlint pack_options: HLINT_OPTIONS="lint" HLINT_TARGETS="lib exe" @@ -148,6 +147,7 @@ jobs: ghcup-release-channel: ${{ matrix.ghcup_release_channel }} cabal-version: ${{ matrix.cabal_version }} + # [TODO] Use haskell/actions/setup once it reliably supports GHC head. 
# Adapted from https://github.com/composewell/streamly/blob/master/.github/workflows/haskell.yml - name: Install GHC head environment if: ${{ matrix.ghc_version == 'head' }} @@ -164,6 +164,8 @@ jobs: GHCUP_VER=0.1.19.0 $CURL -sL -o ./ghcup https://downloads.haskell.org/~ghcup/$GHCUP_VER/${GHCUP_ARCH}-ghcup-$GHCUP_VER chmod +x ./ghcup + # Set ghcup pre-release + ./ghcup config add-release-channel ${{ matrix.ghcup_release_channel }} # Install GHC head # The URL may change, to find a working URL go to https://gitlab.haskell.org/ghc/ghc/-/jobs/ # Find a debian10 job, click on a passed/failed job, at the diff --git a/.github/workflows/s390x.yml b/.github/workflows/s390x.yml index 663bcef1..39660ccf 100644 --- a/.github/workflows/s390x.yml +++ b/.github/workflows/s390x.yml @@ -37,24 +37,24 @@ jobs: ghc --make \ -XMagicHash -XBangPatterns -XUnboxedTuples -XScopedTypeVariables \ -XLambdaCase -XBlockArguments -XTupleSections \ - -iunicode-data/test:unicode-data/lib \ - -o core-test unicode-data/test/Main.hs + -iunicode-data-core/test:unicode-data-core/lib \ + -o core-test unicode-data-core/test/Main.hs ./core-test ghc --make \ -XMagicHash -XBangPatterns -XUnboxedTuples -XScopedTypeVariables \ -XLambdaCase -XBlockArguments -XTupleSections \ - -iunicode-data-names/test:unicode-data-names/lib:unicode-data/lib \ + -iunicode-data-names/test:unicode-data-names/lib:unicode-data-core/lib \ -o names-test unicode-data-names/test/Main.hs ./names-test ghc --make \ -XMagicHash -XBangPatterns -XUnboxedTuples -XScopedTypeVariables \ -XLambdaCase -XBlockArguments -XTupleSections \ - -iunicode-data-scripts/test:unicode-data-scripts/lib:unicode-data/lib \ + -iunicode-data-scripts/test:unicode-data-scripts/lib:unicode-data-core/lib \ -o scripts-test unicode-data-scripts/test/Main.hs ./scripts-test ghc --make \ -XMagicHash -XBangPatterns -XUnboxedTuples -XScopedTypeVariables \ -XLambdaCase -XBlockArguments -XTupleSections \ - 
-iunicode-data-security/test:unicode-data-security/lib:unicode-data/lib \ + -iunicode-data-security/test:unicode-data-security/lib:unicode-data-core/lib \ -o security-test unicode-data-security/test/Main.hs ./security-test diff --git a/README.md b/README.md index 3baec6e5..d85a0bcc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,9 @@ This repository provides packages to use the [Unicode character database](https://www.unicode.org/ucd/) (UCD): -- [`unicode-data`](#unicode-data) for general character properties. +- [`unicode-data`](#unicode-data) is an all-in-one package re-exporting the + following ones. +- [`unicode-data-core`](#unicode-data-core) for general character properties. - [`unicode-data-names`](#unicode-data-names) for characters names and aliases. - [`unicode-data-scripts`](#unicode-data-scripts) for characters scripts. - [`unicode-data-security`](#unicode-data-security) for security mechanisms. @@ -16,12 +18,21 @@ The latest Unicode version supported by these libraries is ### `unicode-data` -[`unicode-data`](unicode-data#readme) provides Haskell APIs to efficiently +[`unicode-data`](unicode-data#readme) is an _all-in-one_ package that re-exports +the whole `unicode-data-*` package family. + +Please see the +[Haddock documentation](https://hackage.haskell.org/package/unicode-data) +for reference documentation. + +### `unicode-data-core` + +[`unicode-data-core`](unicode-data-core#readme) provides Haskell APIs to efficiently access the Unicode character database. Performance is the primary goal in the design of this package. Please see the -[Haddock documentation](https://hackage.haskell.org/package/unicode-data) +[Haddock documentation](https://hackage.haskell.org/package/unicode-data-core) for reference documentation. ### `unicode-data-names` @@ -56,17 +67,17 @@ for reference documentation. 
## Performance -`unicode-data` is up to [_5 times faster_](unicode-data#performance) +`unicode-data` is up to [_5 times faster_](unicode-data-core#performance) than `base`. ## Unicode database version update -See `unicode-data`’s [guide](unicode-data/README.md#unicode-database-version-update). +See `unicode-data-core`’s [guide](unicode-data-core/README.md#unicode-database-version-update). ## Licensing `unicode-data*` packages are an [open source](https://github.com/composewell/unicode-data) -project available under a liberal [Apache-2.0 license](unicode-data/LICENSE). +project available under a liberal [Apache-2.0 license](unicode-data-core/LICENSE). ## Contributing diff --git a/experimental/unicode-data-text/unicode-data-text.cabal b/experimental/unicode-data-text/unicode-data-text.cabal index fa827518..2b8bf39b 100644 --- a/experimental/unicode-data-text/unicode-data-text.cabal +++ b/experimental/unicode-data-text/unicode-data-text.cabal @@ -22,9 +22,10 @@ tested-with: GHC==8.0.2 , GHC==8.6.5 , GHC==8.8.4 , GHC==8.10.7 - , GHC==9.0.1 - , GHC==9.2.1 - , GHC==9.4.2 + , GHC==9.0.2 + , GHC==9.2.5 + , GHC==9.4.4 + , GHC==9.6.0 extra-source-files: Changelog.md @@ -66,9 +67,9 @@ library hs-source-dirs: lib build-depends: - base >= 4.7 && < 4.19, - text >= 1.2.4 && < 2.1, - unicode-data >= 0.3 && < 0.5 + base >= 4.7 && < 4.19, + text >= 1.2.4 && < 2.1, + unicode-data-core >= 15.0.0 && < 15.1 test-suite test import: default-extensions, compile-options @@ -79,9 +80,9 @@ test-suite test other-modules: Unicode.Text.CaseSpec build-depends: - base >= 4.7 && < 4.19, - hspec >= 2.0 && < 2.11, - text >= 1.2.4 && < 2.1, + base >= 4.7 && < 4.19, + hspec >= 2.0 && < 2.11, + text >= 1.2.4 && < 2.1, unicode-data-text build-tool-depends: hspec-discover:hspec-discover >= 2.0 && < 2.11 @@ -95,11 +96,11 @@ benchmark bench hs-source-dirs: bench main-is: Main.hs build-depends: - base >= 4.7 && < 4.19, - deepseq >= 1.1 && < 1.5, - tasty-bench >= 0.2.5 && < 0.4, - tasty >= 1.4.1, - text >= 
1.2.4 && < 2.1, - unicode-data >= 0.3 && < 0.5, + base >= 4.7 && < 4.19, + deepseq >= 1.1 && < 1.5, + tasty-bench >= 0.2.5 && < 0.4, + tasty >= 1.4.1, + text >= 1.2.4 && < 2.1, + unicode-data-core >= 15.0.0 && < 15.1, unicode-data-text ghc-options: -O2 -fdicts-strict -rtsopts diff --git a/stack.yaml b/stack.yaml index 68c4fedb..cd769a83 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,6 +1,7 @@ resolver: lts-18.18 packages: - './unicode-data' +- './unicode-data-core' - './unicode-data-names' - './unicode-data-scripts' - './unicode-data-security' diff --git a/ucd.sh b/ucd.sh index ac88d97e..1cf46ec4 100755 --- a/ucd.sh +++ b/ucd.sh @@ -82,7 +82,7 @@ run_generator() { # Compile and run ucd2haskell cabal run --flag ucd2haskell ucd2haskell -- \ --input "./data/$VERSION" \ - --output-core ./unicode-data/lib/ \ + --output-core ./unicode-data-core/lib/ \ --output-names ./unicode-data-names/lib/ \ --output-scripts ./unicode-data-scripts/lib/ \ --output-security ./unicode-data-security/lib/ \ @@ -98,7 +98,7 @@ run_generator() { --core-prop Pattern_White_Space # Update unicodeVersion in Unicode.Char VERSION_AS_LIST=$(echo "$VERSION" | sed "s/\./, /g") - sed -ri "s/^(unicodeVersion = makeVersion \[)[^]]*\]/\1$VERSION_AS_LIST\]/" "unicode-data/lib/Unicode/Char.hs" + sed -ri "s/^(unicodeVersion = makeVersion \[)[^]]*\]/\1$VERSION_AS_LIST\]/" "unicode-data-core/lib/Unicode/Char.hs" } # Print help text diff --git a/ucd2haskell/exe/UCD2Haskell.hs b/ucd2haskell/exe/UCD2Haskell.hs index e931afaf..b738e524 100644 --- a/ucd2haskell/exe/UCD2Haskell.hs +++ b/ucd2haskell/exe/UCD2Haskell.hs @@ -16,13 +16,13 @@ data CLIOptions = CLIOptions { input :: FilePath , output_core :: FilePath - -- ^ `unicode-data` + -- ^ @unicode-data-core@ , output_names :: FilePath - -- ^ `unicode-data-names` + -- ^ @unicode-data-names@ , output_scripts :: FilePath - -- ^ `unicode-data-scripts` + -- ^ @unicode-data-scripts@ , output_security :: FilePath - -- ^ `unicode-data-security` + -- ^ @unicode-data-security@ 
, core_prop :: [String] } deriving (Show, Generic, HasArguments) diff --git a/ucd2haskell/ucd2haskell.cabal b/ucd2haskell/ucd2haskell.cabal index 8161d838..2d8c1a54 100644 --- a/ucd2haskell/ucd2haskell.cabal +++ b/ucd2haskell/ucd2haskell.cabal @@ -14,16 +14,9 @@ copyright: 2020 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple -tested-with: GHC==8.0.2 - , GHC==8.2.2 - , GHC==8.4.4 - , GHC==8.6.5 - , GHC==8.8.4 +tested-with: GHC==8.8.4 , GHC==8.10.7 - , GHC==9.0.1 - , GHC==9.2.1 - , GHC==9.4.2 - , GHC==9.6.0 + , GHC==9.2.5 extra-source-files: README.md diff --git a/unicode-data-core/Changelog.md b/unicode-data-core/Changelog.md new file mode 100644 index 00000000..c20d99b3 --- /dev/null +++ b/unicode-data-core/Changelog.md @@ -0,0 +1,102 @@ +# Changelog + +## 15.0.0 (February 2023) + +- Rename package from `unicode-data` to `unicode-data-core`. + [`unicode-data`](https://hackage.haskell.org/package/unicode-data) is now + an all-in-one package with heavier dependencies. +- New version scheme: `U.B.M`, where `U` is the Unicode standard major version + number, `B` marks a breaking change and `M` a non-breaking change per + [PVP](https://pvp.haskell.org/). + +### Breaking changes + +- Removed deprecated predicates `isLower` and `isUpper` in `Unicode.Char.Case`. + To migrate, use `isLowerCase` and `isUpperCase` respectively. +- Removed deprecated predicates `isLetter` and `isSpace` in `Unicode.Char.General`. + To migrate, use `isAlphabetic` and `isWhiteSpace` respectively. +- Remove deprecated predicate `isNumber` in `Unicode.Char.Numeric`. + To migrate, use `isNumber` from `Unicode.Char.Numeric.Compat`. + +## 0.4.0.1 (December 2022) + +- Fix [Unicode blocks handling on big-endian architectures](https://github.com/composewell/unicode-data/issues/97). + +## 0.4.0 (October 2022) + +- Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/). 
+ +## 0.3.1 (September 2022) + +- Added full case conversions to `Unicode.Char.Case`: + + - Case folding: `caseFoldMapping` and `toCaseFoldString`. + - Lower case: `lowerCaseMapping` and `toLowerString`. + - Upper case: `upperCaseMapping` and `toUpperString`. + - Title case: `titleCaseMapping` and `toTitleString`. + - Stream mechanism: `Unfold` and `Step`. + +- Added `isNumeric`, `numericValue` and `integerValue` + to `Unicode.Char.Numeric`. +- Added the module `Unicode.Char.General.Blocks`. +- Add compatibility module: + + - `Unicode.Char.Numeric.Compat` + +### Deprecations + +- `Unicode.Char.Numeric.isNumber`: it will be replaced by `isNumeric` + in a _future_ version of this package. + Use the function in `Unicode.Char.Numeric.Compat` instead. + +## 0.3.0 (December 2021) + +- Support for big-endian architectures. +- Added `unicodeVersion`. +- Added `GeneralCategory` data type and corresponding `generalCategoryAbbr`, + `generalCategory` functions. +- Added the following functions to `Unicode.Char.General`: + `isAlphabetic`, `isAlphaNum`, + `isControl`, `isMark`, `isPrint`, `isPunctuation`, `isSeparator`, + `isSymbol` and `isWhiteSpace`. +- Added the module `Unicode.Char.Numeric`. +- Add compatibility modules: + + - `Unicode.Char.General.Compat` + - `Unicode.Char.Case.Compat` + + These modules are compatible with `base:Data.Char`. +- Re-export some functions from `Data.Char` in order to make `Unicode.Char` + a drop-in replacement in a _future_ version of this package. +- Drop support for GHC 7.10.3 + +### Deprecations + +- In `Unicode.Char.Case`: + + - `isUpper`: use `isUpperCase` instead. + - `isLower`: use `isLowerCase` instead. + +- In `Unicode.Char.General`: + + - `isLetter`: use `isAlphabetic` instead. + - `isSpace`: use `isWhiteSpace` instead. + +- In `Unicode.Char`: same as hereinabove. These functions will be replaced in a + _future_ release with the functions with the same names from + `Unicode.Char.Case.Compat` and `Unicode.Char.General.Compat`. 
+ +## 0.2.0 (November 2021) + +* Update to [Unicode 14.0.0](https://www.unicode.org/versions/Unicode14.0.0/). +* Add `Unicode.Char.Identifiers` supporting Unicode Identifier and Pattern + Syntax. + +## 0.1.0.1 (Jul 2021) + +* Workaround to avoid incorrect display of dependencies on Hackage by moving + build-depends of ucd2haskell executable under a build flag conditional. + +## 0.1.0 (Jul 2021) + +* Initial release diff --git a/unicode-data-core/LICENSE b/unicode-data-core/LICENSE new file mode 100644 index 00000000..2fab32ff --- /dev/null +++ b/unicode-data-core/LICENSE @@ -0,0 +1,249 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work. + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + +------------------------------------------------------------------------------- +This distribution includes portions of code from the "unicode-transforms" +package (https://github.com/composewell/unicode-transforms/) which is +available under BSD-3-Clause license as described below. +------------------------------------------------------------------------------- + +Copyright (c) 2016, Harendra Kumar +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- +This distribution includes portions of code from the "unicode-transforms" +package (https://github.com/composewell/unicode-transforms/) +which included portions of code from the "prose" +(https://github.com/llelf/prose) package available under BSD-3-Clause +license as described below. +------------------------------------------------------------------------------- + +Copyright (c) 2014–2015, Antonio Nikishaev + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Antonio Nikishaev nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/unicode-data/NOTICE b/unicode-data-core/NOTICE similarity index 94% rename from unicode-data/NOTICE rename to unicode-data-core/NOTICE index 2d05db71..fab5a61f 100644 --- a/unicode-data/NOTICE +++ b/unicode-data-core/NOTICE @@ -1,4 +1,4 @@ -unicode-data +unicode-data-core Copyright 2020 Composewell Technologies and Contributors This product includes software developed at diff --git a/unicode-data-core/README.md b/unicode-data-core/README.md new file mode 100644 index 00000000..2ccebfdc --- /dev/null +++ b/unicode-data-core/README.md @@ -0,0 +1,161 @@ +# README + +`unicode-data-core` provides Haskell APIs to efficiently access the Unicode +character database. [Performance](#performance) is the primary goal in the +design of this package. + +The Haskell data structures are generated programmatically from the +[Unicode character database](https://www.unicode.org/ucd/) (UCD) files. +The latest Unicode version supported by this library is +[`15.0.0`](https://www.unicode.org/versions/Unicode15.0.0/). + +Please see the +[Haddock documentation](https://hackage.haskell.org/package/unicode-data-core) +for reference documentation. + +## Performance + +`unicode-data-core` is up to _5 times faster_ than `base` ≤ 4.17 (see +[partial integration to `base`](#partial-integration-of-unicode-data-core-into-base)). + +The following benchmark compares the time taken in milliseconds to process all +the Unicode code points for `base-4.16` (GHC 9.2.1) and this package (v0.3). +Machine: 8 × AMD Ryzen 5 2500U on Linux. 
+ +``` +All + Unicode.Char.Case.Compat + isLower + base: OK (1.53s) + 24 ms ± 3.8 ms + unicode-data: OK (2.25s) + 4.4 ms ± 88 μs, 0.19x + isUpper + base: OK (1.50s) + 24 ms ± 450 μs + unicode-data: OK (2.37s) + 4.7 ms ± 200 μs, 0.19x + toLower + base: OK (1.40s) + 22 ms ± 1.8 ms + unicode-data: OK (1.89s) + 7.2 ms ± 297 μs, 0.32x + toTitle + base: OK (1.25s) + 20 ms ± 2.0 ms + unicode-data: OK (1.65s) + 6.4 ms ± 509 μs, 0.32x + toUpper + base: OK (1.26s) + 20 ms ± 2.5 ms + unicode-data: OK (1.72s) + 6.8 ms ± 335 μs, 0.34x + Unicode.Char.General + generalCategory + base: OK (2.02s) + 134 ms ± 1.6 ms + unicode-data: OK (1.75s) + 116 ms ± 1.6 ms, 0.87x + isAlphaNum + base: OK (1.53s) + 24 ms ± 1.7 ms + unicode-data: OK (2.16s) + 4.2 ms ± 29 μs, 0.18x + isControl + base: OK (1.47s) + 23 ms ± 2.6 ms + unicode-data: OK (2.23s) + 4.4 ms ± 22 μs, 0.19x + isMark + base: OK (1.47s) + 23 ms ± 624 μs + unicode-data: OK (2.28s) + 4.5 ms ± 48 μs, 0.19x + isPrint + base: OK (1.53s) + 25 ms ± 2.4 ms + unicode-data: OK (2.27s) + 4.4 ms ± 50 μs, 0.18x + isPunctuation + base: OK (1.51s) + 24 ms ± 459 μs + unicode-data: OK (2.24s) + 4.4 ms ± 25 μs, 0.18x + isSeparator + base: OK (1.52s) + 24 ms ± 407 μs + unicode-data: OK (2.43s) + 4.8 ms ± 94 μs, 0.20x + isSymbol + base: OK (1.49s) + 24 ms ± 863 μs + unicode-data: OK (1.34s) + 5.2 ms ± 92 μs, 0.22x + Unicode.Char.General.Compat + isAlpha + base: OK (1.46s) + 23 ms ± 322 μs + unicode-data: OK (2.14s) + 4.1 ms ± 36 μs, 0.18x + isLetter + base: OK (1.44s) + 22 ms ± 640 μs + unicode-data: OK (2.17s) + 4.3 ms ± 58 μs, 0.19x + isSpace + base: OK (1.44s) + 11 ms ± 1.2 ms + unicode-data: OK (1.36s) + 5.3 ms ± 243 μs, 0.49x + Unicode.Char.Numeric + isNumber + base: OK (1.52s) + 24 ms ± 368 μs + unicode-data: OK (2.41s) + 4.7 ms ± 41 μs, 0.19x +``` + +### Partial integration of `unicode-data-core` into `base` + +Since `base` 4.18, `unicode-data-core` has been +_partially_ [integrated to 
GHC](https://gitlab.haskell.org/ghc/ghc/-/merge_requests/8072), +so there should be no relevant difference. However, using `unicode-data-core` lets you +select the _exact_ version of Unicode to support, therefore not relying on +the version supported by GHC. + +## Unicode database version update + +To update the Unicode version please update the version number in +`ucd.sh`. + +To download the Unicode database, run `ucd.sh download` from the top +level directory of the repo to fetch the database in `./ucd`. + +``` +$ ./ucd.sh download +``` + +To generate the Haskell data structure files from the downloaded database +files, run `ucd.sh generate` from the top level directory of the repo. + +``` +$ ./ucd.sh generate +``` + +## Running property doctests + +Temporarily add `QuickCheck` to the `build-depends` of the library. + +``` +$ cabal build +$ cabal-docspec --check-properties --property-variables c +``` + +## Licensing + +`unicode-data-core` is an [open source](https://github.com/composewell/unicode-data) +project available under a liberal [Apache-2.0 license](LICENSE). + +## Contributing + +As an open project we welcome contributions. diff --git a/unicode-data/bench/Main.hs b/unicode-data-core/bench/Main.hs similarity index 99% rename from unicode-data/bench/Main.hs rename to unicode-data-core/bench/Main.hs index a30db66a..bfff4fe2 100644 --- a/unicode-data/bench/Main.hs +++ b/unicode-data-core/bench/Main.hs @@ -213,7 +213,6 @@ main = defaultMain ] ] , bgroup "Unicode.Char.Numeric" - -- [TODO] Replace with 'isNumber' once the migration is done. 
[ bgroup "isNumeric" [ benchChars "unicode-data" Num.isNumeric ] diff --git a/unicode-data/lib/Unicode/Char.hs b/unicode-data-core/lib/Unicode/Char.hs similarity index 65% rename from unicode-data/lib/Unicode/Char.hs rename to unicode-data-core/lib/Unicode/Char.hs index c12dadaf..8f4cfa81 100644 --- a/unicode-data/lib/Unicode/Char.hs +++ b/unicode-data-core/lib/Unicode/Char.hs @@ -9,20 +9,23 @@ -- corresponding to [Unicode Standard version -- 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/). -- --- This module re-exports several sub-modules under it. The sub-module --- structure under `Unicode.Char` is largely based on the --- ["Property Index by Scope of Use" in Unicode® Standard Annex #44](https://www.unicode.org/reports/tr44/#Property_Index_Table). +-- It can be used as a __drop-in replacement of "Data.Char"__, except for +-- string representations. +-- +-- This module re-exports several sub-modules under it. +-- The sub-module structure under `Unicode.Char` is largely based on the +-- [“Property Index by Scope of Use” in Unicode Standard Annex #44](https://www.unicode.org/reports/tr44/#Property_Index_Table). -- -- The @Unicode.Char.*@ modules in turn depend on @Unicode.Internal.Char.*@ --- modules which are programmatically generated from the Unicode standard's +-- modules which are programmatically generated from the Unicode standard’s -- Unicode character database files. The module structure under -- @Unicode.Internal.Char@ is largely based on the UCD text file names from -- which the properties are generated. -- -- For the original UCD files used in this code please refer to the @UCD@ --- section on the Unicode standard page. See --- https://www.unicode.org/reports/tr44/ to understand the contents and the --- format of the unicode database files. +-- section on the Unicode standard page. +-- See the [Unicode Standard Annex #44](https://www.unicode.org/reports/tr44/) +-- to understand the contents and the format of the Unicode database files. 
-- module Unicode.Char @@ -31,6 +34,7 @@ module Unicode.Char , module Unicode.Char.Case , module Unicode.Char.Case.Compat , module Unicode.Char.Numeric + , module Unicode.Char.Numeric.Compat , module Unicode.Char.Normalization , module Unicode.Char.Identifiers , unicodeVersion @@ -44,14 +48,15 @@ where import Data.Char (chr, ord) import Data.Version (Version, makeVersion) import Unicode.Char.Case hiding (Unfold(..), Step(..)) -import Unicode.Char.Case.Compat hiding (isLower, isUpper) +import Unicode.Char.Case.Compat import Unicode.Char.General -import Unicode.Char.General.Compat hiding (isLetter, isSpace) +import Unicode.Char.General.Compat import Unicode.Char.Identifiers import Unicode.Char.Numeric +import Unicode.Char.Numeric.Compat import Unicode.Char.Normalization --- | Version of Unicode standard used by @unicode-data@. +-- | Version of Unicode standard used by @unicode-data-core@. -- -- @since 0.3.0 unicodeVersion :: Version diff --git a/unicode-data/lib/Unicode/Char/Case.hs b/unicode-data-core/lib/Unicode/Char/Case.hs similarity index 86% rename from unicode-data/lib/Unicode/Char/Case.hs rename to unicode-data-core/lib/Unicode/Char/Case.hs index c23b3719..f079b3ec 100644 --- a/unicode-data/lib/Unicode/Char/Case.hs +++ b/unicode-data-core/lib/Unicode/Char/Case.hs @@ -18,9 +18,7 @@ module Unicode.Char.Case ( -- * Predicates isLowerCase - , isLower , isUpperCase - , isUpper -- * Case mappings -- $case @@ -64,17 +62,6 @@ import qualified Unicode.Internal.Char.SpecialCasing.UpperCaseMapping as C isLowerCase :: Char -> Bool isLowerCase = P.isLowercase --- | Returns 'True' for lower-case characters. --- --- It uses the character property --- . --- --- @since 0.1.0 -{-# INLINE isLower #-} -{-# DEPRECATED isLower "Use isLowerCase instead. Note that the behavior of this function does not match base:Data.Char.isLower. See Unicode.Char.Case.Compat for behavior compatible with base:Data.Char." 
#-} -isLower :: Char -> Bool -isLower = P.isLowercase - -- | Returns 'True' for upper-case characters. -- -- It uses the character property @@ -89,21 +76,6 @@ isLower = P.isLowercase isUpperCase :: Char -> Bool isUpperCase = P.isUppercase --- | Returns 'True' for upper-case characters. --- --- It uses the character property --- . --- --- Note: it does /not/ match title-cased letters. Those are matched using: --- @'Unicode.Char.General.generalCategory' c == --- 'Unicode.Char.General.TitlecaseLetter'@. --- --- @since 0.1.0 -{-# INLINE isUpper #-} -{-# DEPRECATED isUpper "Use isUpperCase instead. Note that the behavior of this function does not match base:Data.Char.isUpper. See Unicode.Char.Case.Compat for behavior compatible with base:Data.Char." #-} -isUpper :: Char -> Bool -isUpper = P.isUppercase - -- $case -- -- Correct case conversion rules may map one input character to two or three diff --git a/unicode-data/lib/Unicode/Char/Case/Compat.hs b/unicode-data-core/lib/Unicode/Char/Case/Compat.hs similarity index 100% rename from unicode-data/lib/Unicode/Char/Case/Compat.hs rename to unicode-data-core/lib/Unicode/Char/Case/Compat.hs diff --git a/unicode-data/lib/Unicode/Char/General.hs b/unicode-data-core/lib/Unicode/Char/General.hs similarity index 94% rename from unicode-data/lib/Unicode/Char/General.hs rename to unicode-data-core/lib/Unicode/Char/General.hs index 4130ec6a..bdaf7f18 100644 --- a/unicode-data/lib/Unicode/Char/General.hs +++ b/unicode-data-core/lib/Unicode/Char/General.hs @@ -26,8 +26,6 @@ module Unicode.Char.General , isSeparator , isSymbol , isWhiteSpace - , isLetter - , isSpace -- ** Re-export , isAscii , isLatin1 @@ -431,25 +429,6 @@ isSymbol c = case generalCategory c of OtherSymbol -> True _ -> False --- | Returns 'True' for alphabetic Unicode characters (lower-case, upper-case --- and title-case letters, plus letters of caseless scripts and modifiers --- letters). 
--- --- @since 0.1.0 -{-# INLINE isLetter #-} -{-# DEPRECATED isLetter "Use isAlphabetic instead. Note that the behavior of this function does not match base:Data.Char.isLetter. See Unicode.Char.General.Compat for behavior compatible with base:Data.Char." #-} -isLetter :: Char -> Bool -isLetter = P.isAlphabetic - --- | Returns 'True' for any whitespace characters, and the control --- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@. --- --- @since 0.1.0 -{-# INLINE isSpace #-} -{-# DEPRECATED isSpace "Use isWhiteSpace instead. Note that the behavior of this function does not match base:Data.Char.isSpace. See Unicode.Char.General.Compat for behavior compatible with base:Data.Char." #-} -isSpace :: Char -> Bool -isSpace = P.isWhite_Space - ------------------------------------------------------------------------------- -- Korean Hangul ------------------------------------------------------------------------------- diff --git a/unicode-data/lib/Unicode/Char/General/Blocks.hs b/unicode-data-core/lib/Unicode/Char/General/Blocks.hs similarity index 100% rename from unicode-data/lib/Unicode/Char/General/Blocks.hs rename to unicode-data-core/lib/Unicode/Char/General/Blocks.hs diff --git a/unicode-data/lib/Unicode/Char/General/Compat.hs b/unicode-data-core/lib/Unicode/Char/General/Compat.hs similarity index 100% rename from unicode-data/lib/Unicode/Char/General/Compat.hs rename to unicode-data-core/lib/Unicode/Char/General/Compat.hs diff --git a/unicode-data/lib/Unicode/Char/Identifiers.hs b/unicode-data-core/lib/Unicode/Char/Identifiers.hs similarity index 100% rename from unicode-data/lib/Unicode/Char/Identifiers.hs rename to unicode-data-core/lib/Unicode/Char/Identifiers.hs diff --git a/unicode-data/lib/Unicode/Char/Normalization.hs b/unicode-data-core/lib/Unicode/Char/Normalization.hs similarity index 100% rename from unicode-data/lib/Unicode/Char/Normalization.hs rename to unicode-data-core/lib/Unicode/Char/Normalization.hs diff --git 
a/unicode-data/lib/Unicode/Char/Numeric.hs b/unicode-data-core/lib/Unicode/Char/Numeric.hs similarity index 68% rename from unicode-data/lib/Unicode/Char/Numeric.hs rename to unicode-data-core/lib/Unicode/Char/Numeric.hs index 770337fa..2098a124 100644 --- a/unicode-data/lib/Unicode/Char/Numeric.hs +++ b/unicode-data-core/lib/Unicode/Char/Numeric.hs @@ -11,7 +11,6 @@ module Unicode.Char.Numeric ( -- * Predicates isNumeric - , isNumber -- * Numeric values , numericValue @@ -28,32 +27,8 @@ module Unicode.Char.Numeric import Data.Char (digitToInt, intToDigit, isDigit, isHexDigit, isOctDigit) import Data.Maybe (isJust) import Data.Ratio (numerator, denominator) -import qualified Unicode.Char.Numeric.Compat as Compat import qualified Unicode.Internal.Char.DerivedNumericValues as V --- | Selects Unicode numeric characters, including digits from various --- scripts, Roman numerals, et cetera. --- --- This function returns 'True' if its argument has one of the --- following 'Unicode.Char.General.GeneralCategory's, or 'False' otherwise: --- --- * 'Unicode.Char.General.DecimalNumber' --- * 'Unicode.Char.General.LetterNumber' --- * 'Unicode.Char.General.OtherNumber' --- --- __Note:__ a character may have a numeric value (see 'numericValue') but return --- 'False', because 'isNumber' only tests 'Unicode.Char.General.GeneralCategory': --- some CJK characters are 'Unicode.Char.General.OtherLetter' and do have a --- numeric value. Use 'isNumeric' to cover those cases as well. --- --- prop> isNumber c == Data.Char.isNumber c --- --- @since 0.3.0 -{-# DEPRECATED isNumber "Use Unicode.Char.Numeric.Compat.isNumber instead. This function will be a synonym for isNumeric in a future release. See Unicode.Char.Numeric.Compat for behavior compatible with base:Data.Char." #-} -{-# INLINE isNumber #-} -isNumber :: Char -> Bool -isNumber = Compat.isNumber - -- | Selects Unicode character with a numeric value. 
-- -- __Note:__ a character may have a numeric value but return 'False' with diff --git a/unicode-data/lib/Unicode/Char/Numeric/Compat.hs b/unicode-data-core/lib/Unicode/Char/Numeric/Compat.hs similarity index 100% rename from unicode-data/lib/Unicode/Char/Numeric/Compat.hs rename to unicode-data-core/lib/Unicode/Char/Numeric/Compat.hs diff --git a/unicode-data/lib/Unicode/Internal/Bits.hs b/unicode-data-core/lib/Unicode/Internal/Bits.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Bits.hs rename to unicode-data-core/lib/Unicode/Internal/Bits.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/Blocks.hs b/unicode-data-core/lib/Unicode/Internal/Char/Blocks.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/Blocks.hs rename to unicode-data-core/lib/Unicode/Internal/Char/Blocks.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/CaseFolding.hs b/unicode-data-core/lib/Unicode/Internal/Char/CaseFolding.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/CaseFolding.hs rename to unicode-data-core/lib/Unicode/Internal/Char/CaseFolding.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/DerivedCoreProperties.hs b/unicode-data-core/lib/Unicode/Internal/Char/DerivedCoreProperties.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/DerivedCoreProperties.hs rename to unicode-data-core/lib/Unicode/Internal/Char/DerivedCoreProperties.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/DerivedNumericValues.hs b/unicode-data-core/lib/Unicode/Internal/Char/DerivedNumericValues.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/DerivedNumericValues.hs rename to unicode-data-core/lib/Unicode/Internal/Char/DerivedNumericValues.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/PropList.hs b/unicode-data-core/lib/Unicode/Internal/Char/PropList.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/PropList.hs rename to 
unicode-data-core/lib/Unicode/Internal/Char/PropList.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/SpecialCasing/LowerCaseMapping.hs b/unicode-data-core/lib/Unicode/Internal/Char/SpecialCasing/LowerCaseMapping.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/SpecialCasing/LowerCaseMapping.hs rename to unicode-data-core/lib/Unicode/Internal/Char/SpecialCasing/LowerCaseMapping.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/SpecialCasing/TitleCaseMapping.hs b/unicode-data-core/lib/Unicode/Internal/Char/SpecialCasing/TitleCaseMapping.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/SpecialCasing/TitleCaseMapping.hs rename to unicode-data-core/lib/Unicode/Internal/Char/SpecialCasing/TitleCaseMapping.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/SpecialCasing/UpperCaseMapping.hs b/unicode-data-core/lib/Unicode/Internal/Char/SpecialCasing/UpperCaseMapping.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/SpecialCasing/UpperCaseMapping.hs rename to unicode-data-core/lib/Unicode/Internal/Char/SpecialCasing/UpperCaseMapping.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/CombiningClass.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/CombiningClass.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/CombiningClass.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/CombiningClass.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/Compositions.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/Compositions.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/Compositions.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/Compositions.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/Decomposable.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/Decomposable.hs similarity index 100% 
rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/Decomposable.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/Decomposable.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/DecomposableK.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/DecomposableK.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/DecomposableK.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/DecomposableK.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/Decompositions.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/Decompositions.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/Decompositions.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/Decompositions.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK2.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK2.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK2.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/DecompositionsK2.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/GeneralCategory.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/GeneralCategory.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/GeneralCategory.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/GeneralCategory.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/SimpleLowerCaseMapping.hs 
b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/SimpleLowerCaseMapping.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/SimpleLowerCaseMapping.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/SimpleLowerCaseMapping.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/SimpleTitleCaseMapping.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/SimpleTitleCaseMapping.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/SimpleTitleCaseMapping.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/SimpleTitleCaseMapping.hs diff --git a/unicode-data/lib/Unicode/Internal/Char/UnicodeData/SimpleUpperCaseMapping.hs b/unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/SimpleUpperCaseMapping.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Char/UnicodeData/SimpleUpperCaseMapping.hs rename to unicode-data-core/lib/Unicode/Internal/Char/UnicodeData/SimpleUpperCaseMapping.hs diff --git a/unicode-data/lib/Unicode/Internal/Division.hs b/unicode-data-core/lib/Unicode/Internal/Division.hs similarity index 97% rename from unicode-data/lib/Unicode/Internal/Division.hs rename to unicode-data-core/lib/Unicode/Internal/Division.hs index e8f036cd..65c54511 100644 --- a/unicode-data/lib/Unicode/Internal/Division.hs +++ b/unicode-data-core/lib/Unicode/Internal/Division.hs @@ -12,7 +12,7 @@ -- Division by a constant can be replaced by a double-word multiplication. -- Roughly speaking, instead of dividing by x, multiply by 2^64/x, -- obtaining 128-bit-long product, and take upper 64 bits. The peculiar --- details can be found in Hacker's Delight, Ch. 10. +-- details can be found in Hacker’s Delight, Ch. 10. 
-- -- Even GHC 8.10 does not provide a primitive for a signed double-word -- multiplication, but since our applications does not involve negative diff --git a/unicode-data/lib/Unicode/Internal/Unfold.hs b/unicode-data-core/lib/Unicode/Internal/Unfold.hs similarity index 100% rename from unicode-data/lib/Unicode/Internal/Unfold.hs rename to unicode-data-core/lib/Unicode/Internal/Unfold.hs diff --git a/unicode-data/test/Main.hs b/unicode-data-core/test/Main.hs similarity index 100% rename from unicode-data/test/Main.hs rename to unicode-data-core/test/Main.hs diff --git a/unicode-data/test/Unicode/CharSpec.hs b/unicode-data-core/test/Unicode/CharSpec.hs similarity index 93% rename from unicode-data/test/Unicode/CharSpec.hs rename to unicode-data-core/test/Unicode/CharSpec.hs index 2eb3f31b..f2a98148 100644 --- a/unicode-data/test/Unicode/CharSpec.hs +++ b/unicode-data-core/test/Unicode/CharSpec.hs @@ -10,12 +10,6 @@ import Data.Ix (Ix(..)) import Data.Maybe (isJust) import qualified Unicode.Char as UChar import qualified Unicode.Char.General.Blocks as UBlocks --- [TODO] Remove the following qualified imports once isLetter and isSpace --- are removed from Unicode.Char.General -import qualified Unicode.Char.General.Compat as UCharCompat --- [TODO] Remove the following qualified imports once isUpper and isLower --- are removed from Unicode.Char.Case -import qualified Unicode.Char.Case.Compat as UCharCompat import qualified Unicode.Char.Numeric as UNumeric import qualified Unicode.Char.Numeric.Compat as UNumericCompat import Data.Foldable (traverse_) @@ -89,7 +83,7 @@ spec = do it "isControl" do UChar.isControl `shouldBeEqualTo` Char.isControl it "isLetter" do - UCharCompat.isLetter `shouldBeEqualTo` Char.isLetter + UChar.isLetter `shouldBeEqualTo` Char.isLetter it "isMark" do UChar.isMark `shouldBeEqualTo` Char.isMark it "isPrint" do @@ -99,18 +93,18 @@ spec = do it "isSeparator" do UChar.isSeparator `shouldBeEqualTo` Char.isSeparator it "isSpace" do - 
UCharCompat.isSpace `shouldBeEqualTo` Char.isSpace + UChar.isSpace `shouldBeEqualTo` Char.isSpace it "isSymbol" do UChar.isSymbol `shouldBeEqualTo` Char.isSymbol describe "Case" do it' "isLower" do - UCharCompat.isLower `shouldBeEqualTo` Char.isLower + UChar.isLower `shouldBeEqualTo` Char.isLower #if MIN_VERSION_base(4,18,0) it' "isLowerCase" do UChar.isLowerCase `shouldBeEqualTo` Char.isLowerCase #endif it' "isUpper" do - UCharCompat.isUpper `shouldBeEqualTo` Char.isUpper + UChar.isUpper `shouldBeEqualTo` Char.isUpper #if MIN_VERSION_base(4,18,0) it' "isUpperCase" do UChar.isUpperCase `shouldBeEqualTo` Char.isUpperCase diff --git a/unicode-data-core/unicode-data-core.cabal b/unicode-data-core/unicode-data-core.cabal new file mode 100644 index 00000000..40c78a82 --- /dev/null +++ b/unicode-data-core/unicode-data-core.cabal @@ -0,0 +1,145 @@ +cabal-version: 2.2 +name: unicode-data-core +version: 15.0.0 +synopsis: Access Unicode Character Database (UCD) +description: + @unicode-data-core@ provides Haskell APIs to efficiently access the + (UCD). + Performance is the primary goal in the design of this package. + . + The Haskell data structures are generated programmatically from the UCD files. + The latest Unicode version supported by this library is + @@. 
+homepage: http://github.com/composewell/unicode-data +bug-reports: https://github.com/composewell/unicode-data/issues +license: Apache-2.0 +license-file: LICENSE +author: Composewell Technologies and Contributors +maintainer: streamly@composewell.com +copyright: 2020 Composewell Technologies and Contributors +category: Data,Text,Unicode +stability: Experimental +build-type: Simple +tested-with: GHC==8.0.2 + , GHC==8.2.2 + , GHC==8.4.4 + , GHC==8.6.5 + , GHC==8.8.4 + , GHC==8.10.7 + , GHC==9.0.2 + , GHC==9.2.5 + , GHC==9.4.4 + , GHC==9.6.0 + +extra-source-files: + Changelog.md + README.md + NOTICE + +source-repository head + type: git + location: https://github.com/composewell/unicode-data + +common default-extensions + default-extensions: + BangPatterns + DeriveGeneric + MagicHash + RecordWildCards + ScopedTypeVariables + TupleSections + FlexibleContexts + + -- Experimental, may lead to issues + DeriveAnyClass + TemplateHaskell + UnboxedTuples + +common compile-options + ghc-options: -Wall + -fwarn-identities + -fwarn-incomplete-record-updates + -fwarn-incomplete-uni-patterns + -fwarn-tabs + default-language: Haskell2010 + +library + import: default-extensions, compile-options + default-language: Haskell2010 + exposed-modules: + Unicode.Char + -- The module structure is derived from + -- https://www.unicode.org/reports/tr44/#Property_Index_Table + Unicode.Char.Case + Unicode.Char.Case.Compat + Unicode.Char.General + Unicode.Char.General.Blocks + Unicode.Char.General.Compat + Unicode.Char.Identifiers + Unicode.Char.Normalization + Unicode.Char.Numeric + Unicode.Char.Numeric.Compat + + -- Internal files + Unicode.Internal.Bits + Unicode.Internal.Division + Unicode.Internal.Unfold + + -- Generated files + -- This module structure is largely based on the UCD file names from which + -- the properties are generated. 
+ Unicode.Internal.Char.Blocks + Unicode.Internal.Char.CaseFolding + Unicode.Internal.Char.DerivedCoreProperties + Unicode.Internal.Char.DerivedNumericValues + Unicode.Internal.Char.PropList + Unicode.Internal.Char.SpecialCasing.LowerCaseMapping + Unicode.Internal.Char.SpecialCasing.TitleCaseMapping + Unicode.Internal.Char.SpecialCasing.UpperCaseMapping + Unicode.Internal.Char.UnicodeData.CombiningClass + Unicode.Internal.Char.UnicodeData.Compositions + Unicode.Internal.Char.UnicodeData.Decomposable + Unicode.Internal.Char.UnicodeData.DecomposableK + Unicode.Internal.Char.UnicodeData.Decompositions + Unicode.Internal.Char.UnicodeData.DecompositionsK + Unicode.Internal.Char.UnicodeData.DecompositionsK2 + Unicode.Internal.Char.UnicodeData.GeneralCategory + Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping + Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping + Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping + + hs-source-dirs: lib + ghc-options: -O2 + build-depends: + base >=4.7 && <4.19 + +test-suite test + import: default-extensions, compile-options + type: exitcode-stdio-1.0 + main-is: Main.hs + hs-source-dirs: + test + other-modules: + Unicode.CharSpec + build-depends: + base >= 4.7 && < 4.19 + , hspec >= 2.0 && < 2.11 + , unicode-data-core + -- We need to match a GHC version with the same Unicode version. + -- See: test/Unicode/CharSpec.hs for compatibility table. 
+ if impl(ghc >= 9.6.0) + cpp-options: -DCOMPATIBLE_GHC_UNICODE + default-language: Haskell2010 + +benchmark bench + import: default-extensions, compile-options + type: exitcode-stdio-1.0 + hs-source-dirs: bench + main-is: Main.hs + build-depends: + base >= 4.7 && < 4.19, + deepseq >= 1.1 && < 1.5, + tasty-bench >= 0.2.5 && < 0.4, + tasty >= 1.4.1, + unicode-data-core + ghc-options: -O2 -fdicts-strict -rtsopts diff --git a/unicode-data-names/Changelog.md b/unicode-data-names/Changelog.md index bc43a162..776838f1 100644 --- a/unicode-data-names/Changelog.md +++ b/unicode-data-names/Changelog.md @@ -1,6 +1,12 @@ # Changelog -## 0.2.0 (September 2022) +## 15.0.0 (February 2023) + +- New version scheme: `U.B.M`, where `U` is the Unicode standard major version + number, `B` marks a breaking change and `M` a non-breaking change per + [PVP](https://pvp.haskell.org/). + +## 0.2.0 (October 2022) - Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/). diff --git a/unicode-data-names/unicode-data-names.cabal b/unicode-data-names/unicode-data-names.cabal index fb4cf3a3..572db6a1 100644 --- a/unicode-data-names/unicode-data-names.cabal +++ b/unicode-data-names/unicode-data-names.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: unicode-data-names -version: 0.2.0 +version: 15.0.0 synopsis: Unicode characters names and aliases description: @unicode-data-names@ provides Haskell APIs to access the Unicode @@ -26,9 +26,9 @@ tested-with: GHC==8.0.2 , GHC==8.6.5 , GHC==8.8.4 , GHC==8.10.7 - , GHC==9.0.1 - , GHC==9.2.1 - , GHC==9.4.2 + , GHC==9.0.2 + , GHC==9.2.5 + , GHC==9.4.4 , GHC==9.6.0 extra-source-files: @@ -84,7 +84,7 @@ library hs-source-dirs: lib build-depends: - base >= 4.7 && < 4.19 + base >= 4.7 && < 4.19 test-suite test import: default-extensions, compile-options @@ -95,9 +95,9 @@ test-suite test other-modules: Unicode.Char.General.NamesSpec build-depends: - base >= 4.7 && < 4.19 - , hspec >= 2.0 && < 2.11 - , unicode-data >= 0.4 && < 0.5 + base >= 4.7 
&& < 4.19 + , hspec >= 2.0 && < 2.11 + , unicode-data-core >= 15.0 && < 15.1 , unicode-data-names default-language: Haskell2010 @@ -111,7 +111,7 @@ executable export-all-chars buildable: True build-depends: base - , unicode-data + , unicode-data-core , unicode-data-names else buildable: False @@ -122,9 +122,9 @@ benchmark bench hs-source-dirs: bench main-is: Main.hs build-depends: - base >= 4.7 && < 4.19, - deepseq >= 1.1 && < 1.5, - tasty-bench >= 0.2.5 && < 0.4, - tasty >= 1.4.1, + base >= 4.7 && < 4.19, + deepseq >= 1.1 && < 1.5, + tasty-bench >= 0.2.5 && < 0.4, + tasty >= 1.4.1, unicode-data-names ghc-options: -O2 -fdicts-strict -rtsopts diff --git a/unicode-data-scripts/Changelog.md b/unicode-data-scripts/Changelog.md index 22919687..488a3bb7 100644 --- a/unicode-data-scripts/Changelog.md +++ b/unicode-data-scripts/Changelog.md @@ -1,10 +1,16 @@ # Changelog +## 15.0.0 (February 2023) + +- New version scheme: `U.B.M`, where `U` is the Unicode standard major version + number, `B` marks a breaking change and `M` a non-breaking change per + [PVP](https://pvp.haskell.org/). + ## 0.2.0.1 (December 2022) - Fix [Unicode scripts handling on big-endian architectures](https://github.com/composewell/unicode-data/issues/97). -## 0.2.0 (September 2022) +## 0.2.0 (October 2022) - Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/). 
diff --git a/unicode-data-scripts/unicode-data-scripts.cabal b/unicode-data-scripts/unicode-data-scripts.cabal index d551e063..e7909485 100644 --- a/unicode-data-scripts/unicode-data-scripts.cabal +++ b/unicode-data-scripts/unicode-data-scripts.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: unicode-data-scripts -version: 0.2.0.1 +version: 15.0.0 synopsis: Unicode characters scripts description: @unicode-data-scripts@ provides Haskell APIs to access the Unicode @@ -26,9 +26,9 @@ tested-with: GHC==8.0.2 , GHC==8.6.5 , GHC==8.8.4 , GHC==8.10.7 - , GHC==9.0.1 - , GHC==9.2.1 - , GHC==9.4.2 + , GHC==9.0.2 + , GHC==9.2.5 + , GHC==9.4.4 , GHC==9.6.0 extra-source-files: @@ -79,8 +79,8 @@ library hs-source-dirs: lib build-depends: - base >= 4.7 && < 4.19 - , unicode-data >= 0.4 && < 0.5 + base >= 4.7 && < 4.19 + , unicode-data-core >= 15.0 && < 15.1 test-suite test import: default-extensions, compile-options @@ -91,9 +91,9 @@ test-suite test other-modules: Unicode.Char.General.ScriptsSpec build-depends: - base >= 4.7 && < 4.19 - , hspec >= 2.0 && < 2.11 - , unicode-data + base >= 4.7 && < 4.19 + , hspec >= 2.0 && < 2.11 + , unicode-data-core , unicode-data-scripts default-language: Haskell2010 @@ -103,10 +103,10 @@ benchmark bench hs-source-dirs: bench main-is: Main.hs build-depends: - base >= 4.7 && < 4.19, - deepseq >= 1.1 && < 1.5, - tasty-bench >= 0.2.5 && < 0.4, - tasty >= 1.4.1, - unicode-data, + base >= 4.7 && < 4.19, + deepseq >= 1.1 && < 1.5, + tasty-bench >= 0.2.5 && < 0.4, + tasty >= 1.4.1, + unicode-data-core, unicode-data-scripts ghc-options: -O2 -fdicts-strict -rtsopts diff --git a/unicode-data-security/Changelog.md b/unicode-data-security/Changelog.md index 95028d8f..6affb379 100644 --- a/unicode-data-security/Changelog.md +++ b/unicode-data-security/Changelog.md @@ -1,6 +1,12 @@ # Changelog -## 0.2.0 (September 2022) +## 15.0.0 (February 2023) + +- New version scheme: `U.B.M`, where `U` is the Unicode standard major version + number, `B` marks a breaking 
change and `M` a non-breaking change per + [PVP](https://pvp.haskell.org/). + +## 0.2.0 (October 2022) - Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/). diff --git a/unicode-data-security/lib/Unicode/Char/Identifiers/Security.hs b/unicode-data-security/lib/Unicode/Char/Identifiers/Security.hs index 4fd9041b..7a57d1df 100644 --- a/unicode-data-security/lib/Unicode/Char/Identifiers/Security.hs +++ b/unicode-data-security/lib/Unicode/Char/Identifiers/Security.hs @@ -61,7 +61,7 @@ isIdentifierTypeAllowed = \case T.Recommended -> True _ -> False --- | Returns the 'IdentifierType's corresponding to a character. +-- | Returns the 'T.IdentifierType's corresponding to a character. -- -- @since 0.1.0 {-# INLINE identifierTypes #-} diff --git a/unicode-data-security/unicode-data-security.cabal b/unicode-data-security/unicode-data-security.cabal index b6260114..58a09f7b 100644 --- a/unicode-data-security/unicode-data-security.cabal +++ b/unicode-data-security/unicode-data-security.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: unicode-data-security -version: 0.2.0 +version: 15.0.0 synopsis: Unicode security mechanisms database description: @unicode-data-security@ provides Haskell APIs to access the @@ -26,9 +26,9 @@ tested-with: GHC==8.0.2 , GHC==8.6.5 , GHC==8.8.4 , GHC==8.10.7 - , GHC==9.0.1 - , GHC==9.2.1 - , GHC==9.4.2 + , GHC==9.0.2 + , GHC==9.2.5 + , GHC==9.4.4 , GHC==9.6.0 extra-source-files: @@ -81,8 +81,8 @@ library hs-source-dirs: lib build-depends: - base >= 4.7 && < 4.19 - , unicode-data >= 0.4 && < 0.5 + base >= 4.7 && < 4.19 + , unicode-data-core >= 15.0 && < 15.1 test-suite test import: default-extensions, compile-options diff --git a/unicode-data/Changelog.md b/unicode-data/Changelog.md index 4f7d7372..d56c8d92 100644 --- a/unicode-data/Changelog.md +++ b/unicode-data/Changelog.md @@ -1,10 +1,30 @@ # Changelog -## 0.4.0.1 (December 2022) +## 15.0.0 (February 2023) -- Fix [Unicode blocks handling on big-endian 
architectures](https://github.com/composewell/unicode-data/issues/97). +### New version scheme -## 0.4.0 (September 2022) +`U.B.M`, where `U` is the Unicode standard major version +number, `B` marks a breaking change and `M` a non-breaking change per +[PVP](https://pvp.haskell.org/). + +### All-in-one package + +__WARNING:__ `unicode-data` is now an all-in-one package with heavier +dependencies. You should use +[`unicode-data-core`](https://hackage.haskell.org/package/unicode-data-core) +if you only need the modules from the previous version. + +- Previous modules are now re-exported from the package + [`unicode-data-core`](https://hackage.haskell.org/package/unicode-data-core). +- Re-export + [`unicode-data-names`](https://hackage.haskell.org/package/unicode-data-names). +- Re-export + [`unicode-data-scripts`](https://hackage.haskell.org/package/unicode-data-scripts). +- Re-export + [`unicode-data-security`](https://hackage.haskell.org/package/unicode-data-security). + +## 0.4.0 (October 2022) - Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/). diff --git a/unicode-data/README.md b/unicode-data/README.md index 1a6b78c9..4e9057a6 100644 --- a/unicode-data/README.md +++ b/unicode-data/README.md @@ -4,6 +4,11 @@ character database. [Performance](#performance) is the primary goal in the design of this package. +From version `15.0.0` `unicode-data` is an _all-in-one_ package that re-exports +the whole `unicode-data-*` package family. You should use +[`unicode-data-core`](https://hackage.haskell.org/package/unicode-data-core) +if you only need the core modules. + The Haskell data structures are generated programmatically from the [Unicode character database](https://www.unicode.org/ucd/) (UCD) files. The latest Unicode version supported by this library is @@ -13,144 +18,6 @@ Please see the [Haddock documentation](https://hackage.haskell.org/package/unicode-data) for reference documentation. 
-## Performance - -`unicode-data` is up to _5 times faster_ than `base` ≤ 4.17 (see -[partial integration to `base`](#partial-integration-of-unicode-data-into-base)). - -The following benchmark compares the time taken in milliseconds to process all -the Unicode code points for `base-4.16` (GHC 9.2.1) and this package (v0.3). -Machine: 8 × AMD Ryzen 5 2500U on Linux. - -``` -All - Unicode.Char.Case.Compat - isLower - base: OK (1.53s) - 24 ms ± 3.8 ms - unicode-data: OK (2.25s) - 4.4 ms ± 88 μs, 0.19x - isUpper - base: OK (1.50s) - 24 ms ± 450 μs - unicode-data: OK (2.37s) - 4.7 ms ± 200 μs, 0.19x - toLower - base: OK (1.40s) - 22 ms ± 1.8 ms - unicode-data: OK (1.89s) - 7.2 ms ± 297 μs, 0.32x - toTitle - base: OK (1.25s) - 20 ms ± 2.0 ms - unicode-data: OK (1.65s) - 6.4 ms ± 509 μs, 0.32x - toUpper - base: OK (1.26s) - 20 ms ± 2.5 ms - unicode-data: OK (1.72s) - 6.8 ms ± 335 μs, 0.34x - Unicode.Char.General - generalCategory - base: OK (2.02s) - 134 ms ± 1.6 ms - unicode-data: OK (1.75s) - 116 ms ± 1.6 ms, 0.87x - isAlphaNum - base: OK (1.53s) - 24 ms ± 1.7 ms - unicode-data: OK (2.16s) - 4.2 ms ± 29 μs, 0.18x - isControl - base: OK (1.47s) - 23 ms ± 2.6 ms - unicode-data: OK (2.23s) - 4.4 ms ± 22 μs, 0.19x - isMark - base: OK (1.47s) - 23 ms ± 624 μs - unicode-data: OK (2.28s) - 4.5 ms ± 48 μs, 0.19x - isPrint - base: OK (1.53s) - 25 ms ± 2.4 ms - unicode-data: OK (2.27s) - 4.4 ms ± 50 μs, 0.18x - isPunctuation - base: OK (1.51s) - 24 ms ± 459 μs - unicode-data: OK (2.24s) - 4.4 ms ± 25 μs, 0.18x - isSeparator - base: OK (1.52s) - 24 ms ± 407 μs - unicode-data: OK (2.43s) - 4.8 ms ± 94 μs, 0.20x - isSymbol - base: OK (1.49s) - 24 ms ± 863 μs - unicode-data: OK (1.34s) - 5.2 ms ± 92 μs, 0.22x - Unicode.Char.General.Compat - isAlpha - base: OK (1.46s) - 23 ms ± 322 μs - unicode-data: OK (2.14s) - 4.1 ms ± 36 μs, 0.18x - isLetter - base: OK (1.44s) - 22 ms ± 640 μs - unicode-data: OK (2.17s) - 4.3 ms ± 58 μs, 0.19x - isSpace - base: OK (1.44s) - 11 ms ± 1.2 ms - 
unicode-data: OK (1.36s) - 5.3 ms ± 243 μs, 0.49x - Unicode.Char.Numeric - isNumber - base: OK (1.52s) - 24 ms ± 368 μs - unicode-data: OK (2.41s) - 4.7 ms ± 41 μs, 0.19x -``` - -### Partial integration of `unicode-data` into `base` - -Since `base` 4.18, `unicode-data` has been -_partially_ [integrated to GHC](https://gitlab.haskell.org/ghc/ghc/-/merge_requests/8072), -so there should be no relevant difference. However, using `unicode-data` allows -to select the _exact_ version of Unicode to support, therefore not relying on -the version supported by GHC. - -## Unicode database version update - -To update the Unicode version please update the version number in -`ucd.sh`. - -To download the Unicode database, run `ucd.sh download` from the top -level directory of the repo to fetch the database in `./ucd`. - -``` -$ ./ucd.sh download -``` - -To generate the Haskell data structure files from the downloaded database -files, run `ucd.sh generate` from the top level directory of the repo. - -``` -$ ./ucd.sh generate -``` - -## Running property doctests - -Temporarily add `QuickCheck` to build depends of library. - -``` -$ cabal build -$ cabal-docspec --check-properties --property-variables c -``` - ## Licensing `unicode-data` is an [open source](https://github.com/composewell/unicode-data) diff --git a/unicode-data/unicode-data.cabal b/unicode-data/unicode-data.cabal index 54ac1ac8..426e1f48 100644 --- a/unicode-data/unicode-data.cabal +++ b/unicode-data/unicode-data.cabal @@ -1,22 +1,27 @@ cabal-version: 2.2 name: unicode-data -version: 0.4.0.1 +version: 15.0.0 synopsis: Access Unicode Character Database (UCD) description: - @unicode-data@ provides Haskell APIs to efficiently access the + @unicode-data-core@ provides Haskell APIs to efficiently access the <https://www.unicode.org/ucd/ Unicode character database> (UCD). Performance is the primary goal in the design of this package. . The Haskell data structures are generated programmatically from the UCD files. The latest Unicode version supported by this library is @<https://www.unicode.org/versions/Unicode15.0.0/ 15.0.0>@. + . 
+ __WARNING:__ from version @15.0.0@, @unicode-data@ is an all-in-one package + with heavier dependencies. You should use + <https://hackage.haskell.org/package/unicode-data-core unicode-data-core> + if you only need core modules. homepage: http://github.com/composewell/unicode-data bug-reports: https://github.com/composewell/unicode-data/issues license: Apache-2.0 license-file: LICENSE author: Composewell Technologies and Contributors -maintainer: streamly@composewell.com -copyright: 2020 Composewell Technologies and Contributors +maintainer: dev@wismill.eu +copyright: 2022 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple @@ -26,15 +31,14 @@ tested-with: GHC==8.0.2 , GHC==8.6.5 , GHC==8.8.4 , GHC==8.10.7 - , GHC==9.0.1 - , GHC==9.2.1 - , GHC==9.4.2 + , GHC==9.0.2 + , GHC==9.2.5 + , GHC==9.4.4 , GHC==9.6.0 extra-source-files: Changelog.md README.md - NOTICE source-repository head type: git @@ -49,6 +53,7 @@ common default-extensions ScopedTypeVariables TupleSections FlexibleContexts + LambdaCase -- Experimental, may lead to issues DeriveAnyClass @@ -66,80 +71,71 @@ common compile-options library import: default-extensions, compile-options default-language: Haskell2010 - exposed-modules: - Unicode.Char + -- [NOTE] The following list must be kept in sync with the other packages. + -- [TODO] The list is built manually. Could we automate it? 
+ reexported-modules: + Unicode.Char, + -- The module structure is derived from -- https://www.unicode.org/reports/tr44/#Property_Index_Table - Unicode.Char.Case - Unicode.Char.Case.Compat - Unicode.Char.General - Unicode.Char.General.Blocks - Unicode.Char.General.Compat - Unicode.Char.Identifiers - Unicode.Char.Normalization - Unicode.Char.Numeric - Unicode.Char.Numeric.Compat + Unicode.Char.Case, + Unicode.Char.Case.Compat, + Unicode.Char.General, + Unicode.Char.General.Blocks, + Unicode.Char.General.Compat, + Unicode.Char.General.Names, + Unicode.Char.General.Scripts, + Unicode.Char.Identifiers, + Unicode.Char.Identifiers.Security, + Unicode.Char.Normalization, + Unicode.Char.Numeric, + Unicode.Char.Numeric.Compat, -- Internal files - Unicode.Internal.Bits - Unicode.Internal.Division - Unicode.Internal.Unfold + Unicode.Internal.Bits, + Unicode.Internal.Division, + Unicode.Internal.Unfold, -- Generated files -- This module structure is largely based on the UCD file names from which -- the properties are generated. 
- Unicode.Internal.Char.Blocks - Unicode.Internal.Char.CaseFolding - Unicode.Internal.Char.DerivedCoreProperties - Unicode.Internal.Char.DerivedNumericValues - Unicode.Internal.Char.PropList - Unicode.Internal.Char.SpecialCasing.LowerCaseMapping - Unicode.Internal.Char.SpecialCasing.TitleCaseMapping - Unicode.Internal.Char.SpecialCasing.UpperCaseMapping - Unicode.Internal.Char.UnicodeData.CombiningClass - Unicode.Internal.Char.UnicodeData.Compositions - Unicode.Internal.Char.UnicodeData.Decomposable - Unicode.Internal.Char.UnicodeData.DecomposableK - Unicode.Internal.Char.UnicodeData.Decompositions - Unicode.Internal.Char.UnicodeData.DecompositionsK - Unicode.Internal.Char.UnicodeData.DecompositionsK2 - Unicode.Internal.Char.UnicodeData.GeneralCategory - Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping - Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping + Unicode.Internal.Char.Blocks, + Unicode.Internal.Char.CaseFolding, + Unicode.Internal.Char.DerivedCoreProperties, + Unicode.Internal.Char.DerivedNumericValues, + Unicode.Internal.Char.PropList, + Unicode.Internal.Char.Scripts, + Unicode.Internal.Char.ScriptExtensions, + Unicode.Internal.Char.Security.Confusables, + Unicode.Internal.Char.Security.IdentifierStatus, + Unicode.Internal.Char.Security.IdentifierType, + Unicode.Internal.Char.Security.IntentionalConfusables, + Unicode.Internal.Char.SpecialCasing.LowerCaseMapping, + Unicode.Internal.Char.SpecialCasing.TitleCaseMapping, + Unicode.Internal.Char.SpecialCasing.UpperCaseMapping, + Unicode.Internal.Char.UnicodeData.CombiningClass, + Unicode.Internal.Char.UnicodeData.Compositions, + Unicode.Internal.Char.UnicodeData.DerivedName, + Unicode.Internal.Char.UnicodeData.Decomposable, + Unicode.Internal.Char.UnicodeData.DecomposableK, + Unicode.Internal.Char.UnicodeData.Decompositions, + Unicode.Internal.Char.UnicodeData.DecompositionsK, + Unicode.Internal.Char.UnicodeData.DecompositionsK2, + Unicode.Internal.Char.UnicodeData.GeneralCategory, + 
Unicode.Internal.Char.UnicodeData.NameAliases, + Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping, + Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping, Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping - - hs-source-dirs: lib - ghc-options: -O2 - build-depends: - base >=4.7 && <4.19 - -test-suite test - import: default-extensions, compile-options - type: exitcode-stdio-1.0 - main-is: Main.hs - hs-source-dirs: - test + -- [NOTE] Without the following 2 sections, we cannot build the documentation. + -- [TODO] Remove these once haddock/cabal issues are solved. other-modules: - Unicode.CharSpec - build-depends: - base >= 4.7 && < 4.19 - , hspec >= 2.0 && < 2.11 - , unicode-data - -- We need to match a GHC version with the same Unicode version. - -- See: test/Unicode/CharSpec.hs for compatibility table. - if impl(ghc >= 9.6.0) - cpp-options: -DCOMPATIBLE_GHC_UNICODE - default-language: Haskell2010 + Paths_unicode_data + autogen-modules: + Paths_unicode_data -benchmark bench - import: default-extensions, compile-options - type: exitcode-stdio-1.0 - hs-source-dirs: bench - main-is: Main.hs build-depends: - base >= 4.7 && < 4.19, - deepseq >= 1.1 && < 1.5, - tasty-bench >= 0.2.5 && < 0.4, - tasty >= 1.4.1, - unicode-data - ghc-options: -O2 -fdicts-strict -rtsopts + base >= 4.7 && < 4.19 + , unicode-data-core >= 15.0.0 && < 15.1 + , unicode-data-names >= 15.0.0 && < 15.1 + , unicode-data-scripts >= 15.0.0 && < 15.1 + , unicode-data-security >= 15.0.0 && < 15.1 diff --git a/update-versions.sh b/update-versions.sh index 59399da5..cd227032 100755 --- a/update-versions.sh +++ b/update-versions.sh @@ -53,7 +53,7 @@ bump_packages_versions () { ############################################################################### bump_dependencies() { - version="$(find_package_version unicode-data/unicode-data.cabal)" + version="$(find_package_version unicode-data-core/unicode-data-core.cabal)" unicode_version="$(get_version_field 1 "$version")" 
unicode_data_major="$(get_version_field 2 "$version")" @@ -62,7 +62,7 @@ bump_dependencies() { unicode_data_min_bound="$unicode_version.$unicode_data_major" unicode_data_max_bound="$unicode_version.$((unicode_data_major+1))" - regex="s/(unicode-data\\s+>= )$VERSION_PATTERN" + regex="s/(unicode-data-core\\s+>= )$VERSION_PATTERN" regex+="(\\s+&&\\s+< )$VERSION_PATTERN/" regex+="\\1$unicode_data_min_bound\\3$unicode_data_max_bound/"