Skip to content

Commit

Permalink
Interim checkin: implemented groups
Browse files Browse the repository at this point in the history
  • Loading branch information
jowilco committed Jun 6, 2024
1 parent 5986dfa commit feeb0e2
Show file tree
Hide file tree
Showing 6 changed files with 974 additions and 708 deletions.
63 changes: 30 additions & 33 deletions unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.unicode.xml;

import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.util.VersionInfo;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.props.*;

Expand Down Expand Up @@ -99,11 +98,11 @@ public AttributeResolver(IndexUnicodeProperties iup) {
}

private enum AliasType {
ABBREVIATION ("abbreviation"),
ALTERNATE ("alternate"),
CONTROL ("control"),
CORRECTION ("correction"),
FIGMENT ("figment");
ABBREVIATION("abbreviation"),
ALTERNATE("alternate"),
CONTROL("control"),
CORRECTION("correction"),
FIGMENT("figment");

private final String aliasType;

Expand All @@ -129,6 +128,7 @@ private NameAlias(String alias, AliasType type) {
/**
 * Returns the alias string stored in this object's {@code alias} field.
 *
 * @return the current value of {@code alias}
 */
public String getAlias() {
    return this.alias;
}

/**
 * Returns the {@link AliasType} stored in this object's {@code type} field.
 *
 * @return the current value of {@code type}
 */
public AliasType getType() {
    return this.type;
}
Expand All @@ -154,15 +154,14 @@ private HashMap<Integer, LinkedList<NameAlias>> loadNameAliases() {
String[] parts = line.getParts();
int codepoint = Integer.parseInt(parts[0], 16);
NameAlias nameAlias = new NameAlias(
parts[1], AliasType.valueOf(parts[2].toUpperCase()));
parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT)));

if (nameAliasesByCodepoint.containsKey(codepoint)) {
LinkedList<NameAlias> nameAliases = new LinkedList<>(nameAliasesByCodepoint.get(codepoint));
nameAliases.add(nameAlias);
nameAliases.sort(nameAliasComparator);
nameAliasesByCodepoint.replace(codepoint, nameAliases);
}
else {
} else {
nameAliasesByCodepoint.put(codepoint, new LinkedList<>(List.of(nameAlias)));
}
}
Expand All @@ -171,9 +170,9 @@ private HashMap<Integer, LinkedList<NameAlias>> loadNameAliases() {

public String getAttributeValue(UcdProperty prop, int codepoint) {
String resolvedValue = indexUnicodeProperties.getResolvedValue(prop, codepoint);
switch(prop.getType()) {
switch (prop.getType()) {
case Numeric:
switch(prop) {
switch (prop) {
case kOtherNumeric:
case kPrimaryNumeric:
case kAccountingNumeric:
Expand All @@ -182,7 +181,7 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
return Optional.ofNullable(resolvedValue).orElse("NaN");
}
case String:
switch(prop) {
switch (prop) {
case Equivalent_Unified_Ideograph:
String EqUIdeo = getMappingValue(codepoint, resolvedValue, false, "");
return (EqUIdeo.equals("#")) ? null : EqUIdeo;
Expand All @@ -191,51 +190,51 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
return (kCompatibilityVariant.equals("#")) ? "" : kCompatibilityVariant;
case kSimplifiedVariant:
case kTraditionalVariant:
String kVariant = getMappingValue(codepoint, resolvedValue, isUnihanAttributeRange(codepoint), "U+");
String kVariant = getMappingValue(codepoint, resolvedValue, isUnihanAttributeRange(codepoint)
, "U+");
return (kVariant.equals("#")) ? "" : kVariant;
case Bidi_Mirroring_Glyph:
//TODO: Question for PAG - This is probably not the desired behavior, but adding this case to maintain consistent output.
// Check the spec. But otherwise keep consistent. Update this comment to indicate why.
//Returning empty string for bmg to maintain compatibility with older generated files.
String bmg = getMappingValue(codepoint, resolvedValue, false, "");
return (bmg.equals("#")) ? "" : bmg;
default:
return getMappingValue(codepoint, resolvedValue, false, "");
}
case Miscellaneous:
switch(prop) {
switch (prop) {
case Jamo_Short_Name:
//return map_jamo_short_name.get(codepoint).getShortName();
return Optional.ofNullable(resolvedValue).orElse("");
case Name:
if(resolvedValue != null && resolvedValue.startsWith("CJK UNIFIED IDEOGRAPH-")) {
if (resolvedValue != null && resolvedValue.startsWith("CJK UNIFIED IDEOGRAPH-")) {
return "CJK UNIFIED IDEOGRAPH-#";
}
if(resolvedValue != null && resolvedValue.startsWith("CJK COMPATIBILITY IDEOGRAPH-")) {
if (resolvedValue != null && resolvedValue.startsWith("CJK COMPATIBILITY IDEOGRAPH-")) {
return "CJK COMPATIBILITY IDEOGRAPH-#";
}
if(resolvedValue != null && resolvedValue.startsWith("TANGUT IDEOGRAPH-")) {
if (resolvedValue != null && resolvedValue.startsWith("TANGUT IDEOGRAPH-")) {
return "TANGUT IDEOGRAPH-#";
}
if(resolvedValue != null && resolvedValue.startsWith("KHITAN SMALL SCRIPT CHARACTER-")) {
if (resolvedValue != null && resolvedValue.startsWith("KHITAN SMALL SCRIPT CHARACTER-")) {
return "KHITAN SMALL SCRIPT CHARACTER-#";
}
if(resolvedValue != null && resolvedValue.startsWith("NUSHU CHARACTER-")) {
if (resolvedValue != null && resolvedValue.startsWith("NUSHU CHARACTER-")) {
return "NUSHU CHARACTER-#";
}
if(resolvedValue != null && resolvedValue.startsWith("EGYPTIAN HIEROGLYPH-")) {
if (resolvedValue != null && resolvedValue.startsWith("EGYPTIAN HIEROGLYPH-")) {
return "EGYPTIAN HIEROGLYPH-#";
}
return Optional.ofNullable(resolvedValue).orElse("");
case kDefinition:
return resolvedValue;
default:
if (resolvedValue!= null) {
if (resolvedValue != null) {
return resolvedValue.replaceAll("\\|", " ");
}
return "";
}
case Catalog:
switch(prop) {
switch (prop) {
case Age:
String age = map_age.get(codepoint).getShortName();
return (age.equals("NA")) ? "unassigned" : age;
Expand All @@ -245,7 +244,7 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
return map_script.get(codepoint).getShortName();
case Script_Extensions:
StringBuilder extensionBuilder = new StringBuilder();
String[] extensions = map_script_extensions.get(codepoint).split("\\|", 0);
String[] extensions = map_script_extensions.get(codepoint).split("\\|", 0);
for (String extension : extensions) {
extensionBuilder.append(UcdPropertyValues.Script_Values.valueOf(extension).getShortName());
extensionBuilder.append(" ");
Expand All @@ -255,17 +254,16 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
throw new RuntimeException("Missing Catalog case");
}
case Enumerated:
switch(prop) {
switch (prop) {
case Bidi_Class:
return map_bidi_class.get(codepoint).getShortName();
case Bidi_Paired_Bracket_Type:
return map_bidi_paired_bracket_type.get(codepoint).getShortName();
case Canonical_Combining_Class:
return map_canonical_combining_class.get(codepoint).getShortName();
case Decomposition_Type:
//TODO: Question for PAG - This is probably not the desired behavior, but specifying lower case to maintain consistent output.
// Check the spec. But otherwise keep consistent. Update this comment to indicate why.
return map_decomposition_type.get(codepoint).getShortName().toLowerCase();
//Returning lower case to maintain compatibility with older generated files.
return map_decomposition_type.get(codepoint).getShortName().toLowerCase(Locale.ROOT);
case Do_Not_Emit_Type:
return map_do_not_emit_type.get(codepoint).getShortName();
case East_Asian_Width:
Expand Down Expand Up @@ -317,9 +315,8 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
default:
throw new RuntimeException("Missing Enumerated case");
}
case Binary:
{
switch(resolvedValue) {
case Binary: {
switch (resolvedValue) {
// Seems overkill to get this from UcdPropertyValues.Binary
case "No":
return "N";
Expand Down Expand Up @@ -385,7 +382,7 @@ public boolean isDifferentRange(int codepointA, int codepointB) {
}

/**
 * Formats a code point as upper-case hexadecimal, zero-padded on the left to
 * a minimum of four digits (for example {@code 0x41} becomes {@code "0041"}
 * and {@code 0x1F600} becomes {@code "1F600"}).
 *
 * @param codepoint the integer value to format
 * @return the upper-case hexadecimal digits of {@code codepoint}, at least
 *     four characters long
 */
private static String getCPString(int codepoint) {
    // Locale.ROOT keeps the result independent of the JVM's default locale.
    // "%04X" zero-pads to width 4 and upper-cases in one step, which replaces
    // the earlier pad-with-spaces / replace / toUpperCase chain.
    return String.format(Locale.ROOT, "%04X", codepoint);
}

public String getHexString(int codepoint) {
Expand Down
23 changes: 15 additions & 8 deletions unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep
UcdSectionComponent[] ucdSectionComponents = ucdSection.getUcdSectionDetail().getUcdSectionComponents();

if (isCompatibleVersion(minVersion, maxVersion)) {
writer.startElement(tag); {
writer.startElement(tag);
{
for (UcdSectionComponent ucdSectionComponent : ucdSectionComponents) {
if (isCompatibleVersion(ucdSectionComponent.getMinVersion(), ucdSectionComponent.getMaxVersion())) {
final PropertyParsingInfo fileInfoEVS = PropertyParsingInfo.getPropertyInfo(ucdSectionComponent.getUcdProperty());
final PropertyParsingInfo fileInfoEVS =
PropertyParsingInfo.getPropertyInfo(ucdSectionComponent.getUcdProperty());
String fullFilename = fileInfoEVS.getFullFileName(indexUnicodeProperties.getUcdVersion());
UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename));
parser.withRange(parserWithRange);
Expand All @@ -46,7 +48,8 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep
for (UcdLineParser.UcdLine line : parser) {
if (!line.getOriginalLine().startsWith("#")) {
AttributesImpl attributes = getBlockAttributes(namespace, line);
writer.startElement(childTag, attributes); {
writer.startElement(childTag, attributes);
{
writer.endElement(childTag);
}
}
Expand All @@ -61,8 +64,10 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep
List<String> names = new ArrayList<>(namedSequences.keySet());
Collections.sort(names);
for (String name : names) {
AttributesImpl attributes = getNamedSequenceAttributes(namespace, name, namedSequences);
writer.startElement(childTag, attributes); {
AttributesImpl attributes = getNamedSequenceAttributes(namespace, name,
namedSequences);
writer.startElement(childTag, attributes);
{
writer.endElement(childTag);
}
}
Expand All @@ -83,8 +88,9 @@ public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXExcep
}
}

private AttributesImpl getAttributes(UcdSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) {
switch(ucdSection) {
private AttributesImpl getAttributes(UcdSectionDetail.UcdSection ucdSection, String namespace,
UcdLineParser.UcdLine line) {
switch (ucdSection) {
case CJKRADICALS:
return getCJKRadicalAttributes(namespace, line);
case DONOTEMIT:
Expand Down Expand Up @@ -151,7 +157,8 @@ private static AttributesImpl getEmojiSourceAttributes(String namespace, UcdLine
return attributes;
}

private static AttributesImpl getNamedSequenceAttributes(String namespace, String name, HashMap<String, String> namedSequences) {
private static AttributesImpl getNamedSequenceAttributes(String namespace, String name,
HashMap<String, String> namedSequences) {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute(
namespace, "name", "name", "CDATA", name);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public void startFile() throws SAXException {
transformerHandler.startDocument ();
char[] c = "\n".toCharArray ();
transformerHandler.characters (c, 0, c.length);
//TODO: JRW change hardcoded 2023 to current year.
c = " \u00A9 2023 Unicode\u00AE, Inc. ".toCharArray ();
transformerHandler.comment (c, 0, c.length);
c = "\n".toCharArray ();
Expand Down
Loading

0 comments on commit feeb0e2

Please sign in to comment.