Skip to content

Commit

Permalink
Merge remote-tracking branch 'la-vache/main' into 170-C7
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Oct 13, 2023
2 parents 76127f9 + c597fd8 commit 790669a
Show file tree
Hide file tree
Showing 45 changed files with 1,036 additions and 266 deletions.
118 changes: 94 additions & 24 deletions .github/workflows/cli-build-instructions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,14 @@ jobs:
run: |
mkdir -p Generated/BIN
- name: Run command - Build and Test
run: MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml package -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
# Since these are just examples to smoke-test the in-source build process,
# let’s not run the whole build and test suite, which is quite slow (6 min
# 26 s as of this writing). Just run the invariant tests and smoke-test
# MakeUnicodeFiles. We don’t even check that MakeUnicodeFiles leaves the
# files unchanged, which makes little sense on its own; but that check is
# the responsibility of the other job.
- name: Run invariant tests
run: MAVEN_OPTS="-ea" mvn -s .github/workflows/mvn-settings.xml test -am -pl unicodetools -Dtest=TestTestUnicodeInvariants -DfailIfNoTests=false -DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

Expand All @@ -91,14 +97,15 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

out-of-source-build:
name: Out-of-source Instructions

# Out-of-source build.
ucd-and-smoke-tests:
name: Check UCD consistency, invariants, smoke-test generators
runs-on: ubuntu-latest
steps:
- name: Checkout Unicode Tools
uses: actions/checkout@v3
with:
repository: unicode-org/unicodetools
path: unicodetools/mine/src
- name: Get the CLDR_REF from pom.xml
id: cldr_ref
Expand Down Expand Up @@ -136,6 +143,30 @@ jobs:
run: |
mkdir -p unicodetools/mine/Generated/BIN
- name: Run command - Make Unicode Files
run: |
cd unicodetools/mine/src
mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCD.Main" -Dexec.args="version $CURRENT_UVERSION build MakeUnicodeFiles" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Check that UCD files are consistent
run: |
cd unicodetools/mine/src
./py/copygenerateducd.py --out-of-source -y
git diff --compact-summary --exit-code || {
git diff --compact-summary |
awk '{
if (previous) {
print "::error file="previous",title=File must be regenerated::Run org.unicode.text.UCD.Main build MakeUnicodeFiles and copy any changed files to unicodetools/data/ucd/dev."
}
previous=$1
}'
exit 1
}
# Only test once we know the UCD is internally consistent.
# MakeUnicodeFiles is much faster than this anyway.
- name: Run command - Build and Test
run: |
cd unicodetools/mine/src
Expand All @@ -151,13 +182,6 @@ jobs:
path: |
unicodetools/mine/Generated/UnicodeTestResults.*
- name: Run command - Make Unicode Files
run: |
cd unicodetools/mine/src
mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCD.Main" -Dexec.args="version $CURRENT_UVERSION build MakeUnicodeFiles" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

# https://github.com/unicode-org/unicodetools/blob/main/docs/emoji/aac.md#aacorderjava
- name: Run command - AAC Order
run: |
Expand All @@ -166,18 +190,6 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

# https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md#tools--tests
# Note: Not running desuffixucd.py in UCA jobs because no version numbers detected in data file names
- name: Run command - UCA - collation validity log
run: |
cd unicodetools/mine/src
# invoke main() in class ...UCA.Main
mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCA.Main" -Dexec.args="writeCollationValidityLog ICU" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
# check for output file
compgen -G "../Generated/UCA/*/CheckCollationValidity.html"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

# https://github.com/unicode-org/unicodetools/blob/main/docs/idna.md
- name: Run command - IDNA
run: |
Expand Down Expand Up @@ -252,3 +264,61 @@ jobs:
mvn -s .github/workflows/mvn-settings.xml -Dexec.mainClass="org.unicode.propstest.CheckProperties" -Dexec.classpathScope=test test-compile -Dexec.args="COMPARE ALL $PREVIOUS_UVERSION" compile exec:java -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

# Out-of-source build.
uca:
name: Check UCA data
runs-on: ubuntu-latest
steps:
- name: Checkout Unicode Tools
uses: actions/checkout@v3
with:
repository: unicode-org/unicodetools
path: unicodetools/mine/src
- name: Get the CLDR_REF from pom.xml
id: cldr_ref
run: echo "CLDR_REF="$(mvn --file unicodetools/mine/src/pom.xml help:evaluate -Dexpression=cldr.version -q -DforceStdout | cut -d- -f3) >> $GITHUB_OUTPUT && cat ${GITHUB_OUTPUT}
- name: Verify CLDR checkout ref
run: echo CLDR_REF="${{ steps.cldr_ref.outputs.CLDR_REF }}" && [ "${{ steps.cldr_ref.outputs.CLDR_REF }}x" != "x" ] # fail if empty
- name: Cache CLDR repository
uses: actions/cache@v3
with:
path: cldr/mine/src
key: cldr-${{ steps.cldr_ref.outputs.CLDR_REF }}
restore-keys: |
cldr
- name: Check out CLDR
uses: actions/checkout@v3
with:
repository: unicode-org/cldr
path: cldr/mine/src
ref: main
fetch-depth: 0
- name: Switch CLDR to CLDR_REF
run: cd cldr/mine/src && git fetch && git checkout ${{ steps.cldr_ref.outputs.CLDR_REF }}
- name: Set up JDK 11
uses: actions/setup-java@v1
with:
java-version: 11
- name: Cache local Maven repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-maven-
- name: Set up out-of-source output dir
run: |
mkdir -p unicodetools/mine/Generated/BIN
# https://github.com/unicode-org/unicodetools/blob/main/docs/uca/index.md#tools--tests
# Note: Not running desuffixucd.py in UCA jobs because no version numbers are detected in data file names.
- name: Run command - UCA - collation validity log
run: |
cd unicodetools/mine/src
# invoke main() in class ...UCA.Main
mvn -s .github/workflows/mvn-settings.xml compile exec:java -Dexec.mainClass="org.unicode.text.UCA.Main" -Dexec.args="writeCollationValidityLog ICU" -am -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION
# check for output file
compgen -G "../Generated/UCA/*/CheckCollationValidity.html"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 2 additions & 0 deletions UnicodeJsps/jetty.d/ROOT/robots.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
User-agent: *
Disallow: /UnicodeJsps
2 changes: 1 addition & 1 deletion UnicodeJsps/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>29.0-jre</version>
<version>32.0.0-jre</version>
</dependency>

<!-- test -->
Expand Down
2 changes: 1 addition & 1 deletion UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import org.unicode.props.UnicodeProperty;

public class CachedProps {
public static final boolean IS_BETA = true;
public static final boolean IS_BETA = false;

public static final Splitter HASH_SPLITTER = Splitter.on('#').trimResults();
public static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();
Expand Down
38 changes: 24 additions & 14 deletions UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -637,16 +637,7 @@ private void showString(final String string, String separator, Appendable out)
if (UnicodeUtilities.RTL.containsSome(literal)) {
literal = '\u200E' + literal + '\u200E';
}
String name = UnicodeUtilities.getName(string, separator, false);
if (name == null || name.length() == 0) {
name = "<i>no name</i>";
} else {
boolean special = name.indexOf('<') >= 0;
name = UnicodeUtilities.toHTML.transliterate(name);
if (special) {
name = "<i>" + name + "</i>";
}
}
String name = UnicodeUtilities.getName(string, separator, false, false);
literal = UnicodeSetUtilities.addEmojiVariation(literal);
if (doTable) {
out.append(
Expand Down Expand Up @@ -801,7 +792,8 @@ String getPropString(List<UnicodeProperty> props, String codePoints, boolean sho
// }
}

private static String getName(String string, String separator, boolean andCode) {
private static String getName(
String string, String separator, boolean andCode, boolean plainText) {
StringBuilder result = new StringBuilder();
int cp;
for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
Expand All @@ -812,7 +804,25 @@ private static String getName(String string, String separator, boolean andCode)
if (andCode) {
result.append("U+").append(com.ibm.icu.impl.Utility.hex(cp, 4)).append(' ');
}
result.append(CachedProps.NAMES.getValue(cp));
final String name = CachedProps.NAMES.getValue(cp);
if (name != null) {
result.append(name);
} else {
// TODO(egg): We only have Name_Aliasβ during β, which is silly. This will probably
// solve itself as part of https://github.com/unicode-org/unicodetools/issues/432.
String alias =
getFactory()
.getProperty(CachedProps.IS_BETA ? "Name_Aliasβ" : "Name_Alias")
.getValue(cp);
if (alias == null) {
alias = "no name";
}
if (plainText) {
result.append("(" + alias + ")");
} else {
result.append("<i>" + alias + "</i>");
}
}
}
return result.toString();
}
Expand Down Expand Up @@ -1931,7 +1941,7 @@ private static void showBidiLine(
writer.println("</tr><tr><th>Character</th>");
for (int i = 0; i < str.length(); ++i) {
final String s = str.substring(i, i + 1);
String title = toHTML.transform(getName(s, "", true));
String title = toHTML.transform(getName(s, "", true, true));
writer.println(
"<td class='bccell' title='"
+ title
Expand Down Expand Up @@ -1982,7 +1992,7 @@ private static void showBidiLine(
String title =
bidiChar.length() == 0
? "deleted"
: toHTML.transform(getName(bidiChar, "", true));
: toHTML.transform(getName(bidiChar, "", true, true));
String td = bidiChar.length() == 0 ? "bxcell" : "bccell";
writer.println(
"<td class='"
Expand Down
27 changes: 20 additions & 7 deletions docs/unicodejsps/index.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
# Building UnicodeJsp

- Note: you can run the latest UnicodeJsp locally with docker using:

```
docker run --rm -p 8080:8080 unicode/unicode-jsp
```

- Note 2: there are some notes on updated processes for using GCP at [gcp-run.md](./gcp-run.md) - at present, automated deployment is TODO.

## Compiling
Expand Down Expand Up @@ -113,7 +107,26 @@ Look at <http://localhost:8080/UnicodeJsps/properties.jsp>, and make sure that
there aren't any Z-Other props at the bottom (you'll need to update via Adding
New Properties if there are).

(:construction: **TODO**: explain how to do a Docker-based build here.)
### Running a Docker-based build

First, compile the Java code:

- `mvn -B package -am -pl UnicodeJsps -DskipTests=true`

Next, make a “backup” copy of CLDR and UnicodeTools. (`~/src/cldr` is an optional existing CLDR directory, used as a reference to save a few packets.)

- `git clone --reference-if-able ~/src/cldr https://github.com/unicode-org/cldr.git || (cd cldr && git pull)`
- `mkdir -p UnicodeJsps/target && tar -cpz --exclude=.git --exclude=unicodetools/target/ -f UnicodeJsps/target/cldr-unicodetools.tgz ./cldr/ ./unicodetools/`

Now, finally build.

- `docker build -t unicode/unicode-jsp:latest UnicodeJsps/`

… and run it. Press Control-C to stop it; otherwise, visit <http://127.0.0.1:8080>.

```
docker run --rm -p 8080:8080 unicode/unicode-jsp:latest
```

## Commit/PR

Expand Down
5 changes: 3 additions & 2 deletions py/copygenerateducd.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@


def main():
out_of_source = '--out-of-source' in sys.argv[1:]
cwd = Path().cwd()
uversion = os.getenv("CURRENT_UVERSION")
genucddir = cwd / "Generated" / "UCD" / uversion
genucddir = (cwd / ".." if out_of_source else cwd) / "Generated" / "UCD" / uversion
if not genucddir.exists():
raise Exception(f"Generated directory not found at {genucddir.absolute()}")

Expand All @@ -34,7 +35,7 @@ def main():
print("THE FOLLOWING FILES WILL BE MOVED:\n")
print("\n".join([f"{str(p.name)} --> {devucddir / p.relative_to(genucddir)}" for p in to_move])) # noqa: E501

confirm = bool(sys.argv[-1] == "-y") # enable running this in automation
confirm = bool("-y" in sys.argv[1:]) # enable running this in automation
if not confirm:
confirm = input("\nProceed [y/N]?").lower() == "y"

Expand Down
5 changes: 5 additions & 0 deletions unicodetools/data/ucd/dev/ArabicShaping.txt
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,11 @@ A873; PHAGS-PA CANDRABINDU; U; No_Joining_Group
10D22; HANIFI ROHINGYA SAKIN; R; No_Joining_Group
10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA

# Arabic Extended-D Characters
10EC2; DAL WITH VERTICAL 2 DOTS BELOW; R; DAL
10EC3; TAH WITH VERTICAL 2 DOTS BELOW; D; TAH
10EC4; KAF WITH VERTICAL 2 DOTS BELOW; D; KAF

# Sogdian Characters

10F30; SOGDIAN ALEPH; D; No_Joining_Group
Expand Down
1 change: 1 addition & 0 deletions unicodetools/data/ucd/dev/Blocks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ FFF0..FFFF; Specials
11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A
11AC0..11AFF; Pau Cin Hau
11B00..11B5F; Devanagari Extended-A
11BC0..11BFF; Sunuwar
11C00..11C6F; Bhaiksuki
11C70..11CBF; Marchen
11D00..11D5F; Masaram Gondi
Expand Down
5 changes: 3 additions & 2 deletions unicodetools/data/ucd/dev/CaseFolding.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# CaseFolding-15.1.0.txt
# Date: 2023-05-12, 21:53:10 GMT
# CaseFolding-16.0.0.txt
# Date: 2023-10-03, 19:01:21 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -603,6 +603,7 @@
1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN
1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT
1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK
1C89; C; 1C8A; # CYRILLIC CAPITAL LETTER TJE
1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN
1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN
1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN
Expand Down
10 changes: 8 additions & 2 deletions unicodetools/data/ucd/dev/DerivedAge.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# DerivedAge-16.0.0.txt
# Date: 2023-10-02, 12:51:03 GMT
# Date: 2023-10-13, 15:52:11 GMT
# © 2023 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
Expand Down Expand Up @@ -2009,9 +2009,15 @@ FDFE..FDFF ; 14.0 # [2] ARABIC LIGATURE SUBHAANAHU WA TAAALAA..ARABIC LIGAT

# Newly assigned in Unicode 16.0.0 (September, 2024)

0897 ; 16.0 # ARABIC PEPET
0C5C ; 16.0 # TELUGU ARCHAIC SHRII
0CDC ; 16.0 # KANNADA ARCHAIC SHRII
1C89..1C8A ; 16.0 # [2] CYRILLIC CAPITAL LETTER TJE..CYRILLIC SMALL LETTER TJE
10EC2..10EC4 ; 16.0 # [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW
10EFC ; 16.0 # ARABIC COMBINING ALEF OVERLAY
11BC0..11BE1 ; 16.0 # [34] SUNUWAR LETTER DEVI..SUNUWAR SIGN PVO
11BF0..11BF9 ; 16.0 # [10] SUNUWAR DIGIT ZERO..SUNUWAR DIGIT NINE

# Total code points: 2
# Total code points: 53

# EOF
Loading

0 comments on commit 790669a

Please sign in to comment.