Skip to content

Commit

Permalink
Initial checkin for UcdXML
Browse files Browse the repository at this point in the history
  • Loading branch information
jowilco committed Jun 6, 2024
1 parent 2f29705 commit 5986dfa
Show file tree
Hide file tree
Showing 8 changed files with 2,143 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ perf-*.xml
test-*.xml

# Directories
.idea/
.settings/
.vs/
.vscode/
Expand Down
404 changes: 404 additions & 0 deletions unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java

Large diffs are not rendered by default.

194 changes: 194 additions & 0 deletions unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package org.unicode.xml;

import com.ibm.icu.util.VersionInfo;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.props.IndexUnicodeProperties;
import org.unicode.props.PropertyParsingInfo;
import org.unicode.props.UcdLineParser;
import org.unicode.props.UcdProperty;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.util.*;

/**
 * Writes file-backed sections of a UCD XML document.
 *
 * <p>For each section, the UCD data file behind every version-compatible component is read with a
 * {@link UcdLineParser}, and one empty child element is written per record, carrying the record's
 * fields as XML attributes.
 */
public class UCDDataResolver {

    private final IndexUnicodeProperties indexUnicodeProperties;
    private final String namespace;
    private final UCDXMLWriter writer;

    /**
     * @param iup source of the UCD version used for file lookup and version filtering
     * @param namespace namespace URI recorded on every attribute that is written
     * @param writer destination for the generated elements
     */
    public UCDDataResolver(IndexUnicodeProperties iup, String namespace, UCDXMLWriter writer) {
        indexUnicodeProperties = iup;
        this.namespace = namespace;
        this.writer = writer;
    }

    /**
     * Writes the element for {@code ucdSection} together with all of its child elements.
     *
     * <p>Nothing is written when the UCD version in use lies outside the section's version range.
     * Components of the section are filtered by their own version range in the same way.
     *
     * @param ucdSection the section to emit; its {@code toString()} is used as the element name
     * @throws SAXException if the underlying writer rejects an event
     */
    public void buildSection(UcdSectionDetail.UcdSection ucdSection) throws SAXException {
        if (!isCompatibleVersion(ucdSection.getMinVersion(), ucdSection.getMaxVersion())) {
            return;
        }

        final String tag = ucdSection.toString();
        final UcdSectionComponent[] components =
                ucdSection.getUcdSectionDetail().getUcdSectionComponents();

        writer.startElement(tag);
        for (UcdSectionComponent component : components) {
            if (isCompatibleVersion(component.getMinVersion(), component.getMaxVersion())) {
                writeComponent(ucdSection, component);
            }
        }
        writer.endElement(tag);
    }

    /** Parses the data file behind one component and writes a child element per record. */
    private void writeComponent(
            UcdSectionDetail.UcdSection ucdSection, UcdSectionComponent component)
            throws SAXException {
        final PropertyParsingInfo fileInfo =
                PropertyParsingInfo.getPropertyInfo(component.getUcdProperty());
        final String fullFilename =
                fileInfo.getFullFileName(indexUnicodeProperties.getUcdVersion());
        final UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename));
        parser.withRange(ucdSection.getParserWithRange());
        parser.withMissing(ucdSection.getParserWithMissing());

        final String childTag = ucdSection.getChildTag();
        switch (ucdSection) {
            case BLOCKS:
                for (UcdLineParser.UcdLine line : parser) {
                    // Lines whose original text starts with '#' are not block records.
                    if (!line.getOriginalLine().startsWith("#")) {
                        writeEmptyElement(childTag, getBlockAttributes(namespace, line));
                    }
                }
                break;
            case NAMEDSEQUENCES:
                // A sorted map yields the sequences ordered by name; when a name occurs more
                // than once, the last record read wins.
                final Map<String, String> namedSequences = new TreeMap<>();
                for (UcdLineParser.UcdLine line : parser) {
                    final String[] parts = line.getParts();
                    namedSequences.put(parts[0], parts[1]);
                }
                for (Map.Entry<String, String> entry : namedSequences.entrySet()) {
                    writeEmptyElement(
                            childTag,
                            getNamedSequenceAttributes(
                                    namespace, entry.getKey(), entry.getValue()));
                }
                break;
            default:
                for (UcdLineParser.UcdLine line : parser) {
                    writeEmptyElement(childTag, getAttributes(ucdSection, namespace, line));
                }
        }
    }

    /** Writes an element that has attributes but no content. */
    private void writeEmptyElement(String tagName, AttributesImpl attributes)
            throws SAXException {
        writer.startElement(tagName, attributes);
        writer.endElement(tagName);
    }

    /**
     * Selects the attribute builder for sections that need no special record handling.
     *
     * @throws IllegalArgumentException if {@code ucdSection} has no attribute builder here
     */
    private AttributesImpl getAttributes(
            UcdSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) {
        switch (ucdSection) {
            case CJKRADICALS:
                return getCJKRadicalAttributes(namespace, line);
            case DONOTEMIT:
                return getDoNotEmitAttributes(namespace, line);
            case EMOJISOURCES:
                return getEmojiSourceAttributes(namespace, line);
            case NORMALIZATIONCORRECTIONS:
                return getNCAttributes(namespace, line);
            case STANDARDIZEDVARIANTS:
                return getSVAttributes(namespace, line);
            default:
                throw new IllegalArgumentException(
                        "getAttributes failed on an unexpected UcdSection");
        }
    }

    /**
     * Builds {@code first-cp}, {@code last-cp} and {@code name} from a block record whose first
     * field has the form {@code first..last}.
     */
    private static AttributesImpl getBlockAttributes(
            String namespace, UcdLineParser.UcdLine line) {
        final String[] parts = line.getParts();
        final String[] range = parts[0].split("\\.\\.");
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "first-cp", "first-cp", "CDATA", range[0]);
        attributes.addAttribute(namespace, "last-cp", "last-cp", "CDATA", range[1]);
        attributes.addAttribute(namespace, "name", "name", "CDATA", parts[1]);
        return attributes;
    }

    /** Builds {@code number}, {@code radical} and {@code ideograph} from fields 0 to 2. */
    private static AttributesImpl getCJKRadicalAttributes(
            String namespace, UcdLineParser.UcdLine line) {
        final String[] parts = line.getParts();
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "number", "number", "CDATA", parts[0]);
        attributes.addAttribute(namespace, "radical", "radical", "CDATA", parts[1]);
        attributes.addAttribute(namespace, "ideograph", "ideograph", "CDATA", parts[2]);
        return attributes;
    }

    /** Builds {@code of}, {@code use} and {@code because} from fields 0 to 2. */
    private static AttributesImpl getDoNotEmitAttributes(
            String namespace, UcdLineParser.UcdLine line) {
        final String[] parts = line.getParts();
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "of", "of", "CDATA", parts[0]);
        attributes.addAttribute(namespace, "use", "use", "CDATA", parts[1]);
        attributes.addAttribute(namespace, "because", "because", "CDATA", parts[2]);
        return attributes;
    }

    /** Builds {@code unicode}, {@code docomo}, {@code kddi} and {@code softbank} from fields 0 to 3. */
    private static AttributesImpl getEmojiSourceAttributes(
            String namespace, UcdLineParser.UcdLine line) {
        final String[] parts = line.getParts();
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "unicode", "unicode", "CDATA", parts[0]);
        attributes.addAttribute(namespace, "docomo", "docomo", "CDATA", parts[1]);
        attributes.addAttribute(namespace, "kddi", "kddi", "CDATA", parts[2]);
        attributes.addAttribute(namespace, "softbank", "softbank", "CDATA", parts[3]);
        return attributes;
    }

    /** Builds the {@code name} and {@code cps} attributes of one named sequence. */
    private static AttributesImpl getNamedSequenceAttributes(
            String namespace, String name, String cps) {
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "name", "name", "CDATA", name);
        attributes.addAttribute(namespace, "cps", "cps", "CDATA", cps);
        return attributes;
    }

    /** Builds {@code cp}, {@code old}, {@code new} and {@code version} from fields 0 to 3. */
    private static AttributesImpl getNCAttributes(String namespace, UcdLineParser.UcdLine line) {
        final String[] parts = line.getParts();
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "cp", "cp", "CDATA", parts[0]);
        attributes.addAttribute(namespace, "old", "old", "CDATA", parts[1]);
        attributes.addAttribute(namespace, "new", "new", "CDATA", parts[2]);
        attributes.addAttribute(namespace, "version", "version", "CDATA", parts[3]);
        return attributes;
    }

    /**
     * Builds {@code cps}, {@code desc} and {@code when} from fields 0 to 2.
     *
     * <p>{@code when} falls back to the empty string when the third field is absent.
     */
    private static AttributesImpl getSVAttributes(String namespace, UcdLineParser.UcdLine line) {
        final String[] parts = line.getParts();
        // NOTE(review): a null test alone does not cover a record with fewer than three fields;
        // whether the parser can return such a short array is not visible here, so guard both.
        final String when = (parts.length > 2 && parts[2] != null) ? parts[2] : "";
        final AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute(namespace, "cps", "cps", "CDATA", parts[0]);
        attributes.addAttribute(namespace, "desc", "desc", "CDATA", parts[1]);
        attributes.addAttribute(namespace, "when", "when", "CDATA", when);
        return attributes;
    }

    /**
     * Tells whether the UCD version in use lies within {@code [minVersion, maxVersion]}.
     *
     * @param minVersion inclusive lower bound
     * @param maxVersion inclusive upper bound, or {@code null} for no upper bound
     */
    private boolean isCompatibleVersion(VersionInfo minVersion, VersionInfo maxVersion) {
        final VersionInfo ucdVersion = indexUnicodeProperties.getUcdVersion();
        if (ucdVersion.compareTo(minVersion) < 0) {
            return false;
        }
        return maxVersion == null || ucdVersion.compareTo(maxVersion) <= 0;
    }
}
78 changes: 78 additions & 0 deletions unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package org.unicode.xml;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.FileOutputStream;

/**
 * Small convenience layer over a SAX {@link TransformerHandler} that serializes UCD XML to a
 * stream.
 *
 * <p>Every element is written in the {@link #NAMESPACE} namespace, using the tag name as both the
 * local name and the qualified name.
 */
public class UCDXMLWriter {

    /** Namespace URI applied to every element written through this class. */
    public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0";

    private final TransformerHandler transformerHandler;

    /** Returns the underlying handler, for callers that need to send SAX events directly. */
    public TransformerHandler getTransformerHandler() {
        return transformerHandler;
    }

    /**
     * Creates a writer that serializes to {@code f} as indented, standalone UTF-8 XML.
     *
     * <p>The stream is not closed by this class.
     *
     * @param f destination stream
     * @throws TransformerConfigurationException if the platform's {@link TransformerFactory}
     *     cannot provide a SAX {@link TransformerHandler}
     */
    public UCDXMLWriter(FileOutputStream f) throws TransformerConfigurationException {
        TransformerFactory tfactory = TransformerFactory.newInstance();
        // The cast below is only safe for factories that advertise SAX support; report an
        // unsupported factory through the declared exception rather than a ClassCastException.
        if (!tfactory.getFeature(SAXTransformerFactory.FEATURE)
                || !(tfactory instanceof SAXTransformerFactory)) {
            throw new TransformerConfigurationException(
                    "TransformerFactory "
                            + tfactory.getClass().getName()
                            + " does not support SAXTransformerFactory");
        }
        SAXTransformerFactory sfactory = (SAXTransformerFactory) tfactory;
        transformerHandler = sfactory.newTransformerHandler();

        Transformer transformer = transformerHandler.getTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.STANDALONE, "yes");
        // The indent width has no standard key, so it is set under two vendor-specific names.
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
        transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "3");
        transformerHandler.setResult(new StreamResult(f));
    }

    /**
     * Starts the document and writes the copyright and terms-of-use comments, separated by line
     * breaks.
     *
     * @throws SAXException if the handler rejects an event
     */
    public void startFile() throws SAXException {
        transformerHandler.startDocument();
        writeCharacters("\n");
        writeComment(" \u00A9 2023 Unicode\u00AE, Inc. ");
        writeCharacters("\n");
        writeComment(" For terms of use, see http://www.unicode.org/terms_of_use.html ");
        writeCharacters("\n\n\n");
    }

    /**
     * Ends the document.
     *
     * @throws SAXException if the handler rejects the event
     */
    public void endFile() throws SAXException {
        transformerHandler.endDocument();
    }

    /**
     * Opens an element without attributes.
     *
     * @param tagName element name
     * @throws SAXException if the handler rejects the event
     */
    public void startElement(String tagName) throws SAXException {
        startElement(tagName, new AttributesImpl());
    }

    /**
     * Opens an element with the given attributes.
     *
     * @param tagName element name
     * @param attributes attributes to attach to the element
     * @throws SAXException if the handler rejects the event
     */
    public void startElement(String tagName, AttributesImpl attributes) throws SAXException {
        transformerHandler.startElement(NAMESPACE, tagName, tagName, attributes);
    }

    /**
     * Writes character content at the current position.
     *
     * @param s text to write
     * @throws SAXException if the handler rejects the event
     */
    public void addContent(String s) throws SAXException {
        writeCharacters(s);
    }

    /**
     * Closes an element previously opened under the same name.
     *
     * @param tagName element name
     * @throws SAXException if the handler rejects the event
     */
    public void endElement(String tagName) throws SAXException {
        transformerHandler.endElement(NAMESPACE, tagName, tagName);
    }

    /** Sends {@code text} to the handler as a character-data event. */
    private void writeCharacters(String text) throws SAXException {
        char[] chars = text.toCharArray();
        transformerHandler.characters(chars, 0, chars.length);
    }

    /** Sends {@code text} to the handler as an XML comment. */
    private void writeComment(String text) throws SAXException {
        char[] chars = text.toCharArray();
        transformerHandler.comment(chars, 0, chars.length);
    }
}


Loading

0 comments on commit 5986dfa

Please sign in to comment.