Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLDR-17844 Modify the date report #3920

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package org.unicode.cldr.util;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import com.ibm.icu.text.Bidi;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;

/**
* A set of utilities for handling BIDI, especially in charts and examples but not restricted to
* that.
*/
public class BidiUtils {
public static final String ALERT = "⚠️";
static final String LRM = CodePointEscaper.LRM.getString();

// These are intended to be classes of characters that "stick together in order"
// The initial focus is dates, so this will probably need to be expanded for numbers; might need
// more syntax

/**
 * Character classes used to split text into fields. Each constant is built from a UnicodeSet
 * pattern; the static initializer then makes the sets mutually disjoint (each constant keeps
 * only the characters not already claimed by an earlier constant) and freezes them.
 */
private enum SpanClass {
    NUMBERS("\\p{N}"),
    LETTERS_MARKS("[\\p{L}\\p{M}]"),
    DATE_PUNCT("[+]"),
    SPACES("\\p{Z}"),
    OTHERS("\\p{any}"); // must stay last: it picks up whatever the earlier classes left over

    /** The characters belonging to this class; frozen once the enum is initialized. */
    final UnicodeSet uset;

    private SpanClass(String unicodeSetSource) {
        this.uset = new UnicodeSet(unicodeSetSource);
    }

    static {
        // Walk the constants in declaration order, subtracting everything claimed so far,
        // so that every character ends up in exactly one class.
        final UnicodeSet claimed = new UnicodeSet();
        final SpanClass[] all = values();
        for (int i = 0; i < all.length; ++i) {
            final UnicodeSet current = all[i].uset;
            current.removeAll(claimed);
            current.freeze();
            claimed.addAll(current);
        }
    }
}
/**
 * Checks whether the example is reordered identically under each of the specified
 * bidiDirectionOptions.
 *
 * @param example Source text, not HTMLified
 * @param outputReorderedResults if non-null, it is cleared and then receives one reordered
 *     string for each bidiDirectionOption, in the order given. If null, the results are not
 *     reported and processing stops at the first difference.
 * @param bidiDirectionOptions BIDI directions from com.ibm.icu.text.Bidi. If there are no
 *     items (or the array is null), the default is DIRECTION_DEFAULT_LEFT_TO_RIGHT
 *     (dir="auto"), DIRECTION_RIGHT_TO_LEFT (dir="rtl").
 * @return true unless one of the reordered strings differs from the first one.
 */
public static boolean isOrderingUnchanged(
        String example, List<String> outputReorderedResults, int... bidiDirectionOptions) {
    // Apply the documented default; previously an empty option list silently produced
    // no results and always returned true.
    if (bidiDirectionOptions == null || bidiDirectionOptions.length == 0) {
        bidiDirectionOptions =
                new int[] {Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT, Bidi.DIRECTION_RIGHT_TO_LEFT};
    }
    final boolean hasList = outputReorderedResults != null;
    if (hasList) {
        outputReorderedResults.clear();
    } else {
        outputReorderedResults = new ArrayList<>();
    }
    boolean unchanged = true;
    for (int option : bidiDirectionOptions) {
        final String reordered = new Bidi(example, option).writeReordered(0);
        outputReorderedResults.add(reordered);
        // The first result always equals itself, so only later results can flip the flag.
        if (unchanged && !reordered.equals(outputReorderedResults.get(0))) {
            unchanged = false;
            if (!hasList) {
                break; // the caller does not want the strings, so stop early
            }
        }
    }
    return unchanged;
}

/**
* Return a list of the spans, where each span is a sequence of:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

word missing before the comma? Actually this whole comment looks unattached to anything, obsoleted by the following comment Gets the fields

*
* @param orderedLTR
* @return
*/
/**
* Gets the 'fields' in a formatted string, used to test whether bidi reordering causes the
* original fields to merge when reordered. Each field is the longest contiguous span of
* characters belonging to the same SpanClass:
*
* <ul>
* <li>numbers (\p{N})
* <li>letters & marks ([\p{L}\p{M}])
* <li>date punctuation ([+])
* <li>spaces (\p{Z})
* <li>Other (everything not in an earlier class)
* </ul>
*
* @param reordred the text to split into fields (the parameter name is spelled this way in the
* signature)
* @param result the set that the fields are added to; it is modified by this call, and any
* elements it already contains are part of the returned set
* @return an immutable copy of {@code result} after the fields have been added. Duplicates are
* removed, and the iteration order is that of {@code result}: for example, order of first
* occurrence in the text for a LinkedHashSet, or sorted order for a TreeSet.
*/
public static Set<String> getFields(String reordred, Set<String> result) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reordred should be reordered and the comment above should have @param reordered and @param result

int start = 0;
while (start < reordred.length()) {
// Try the classes in declaration order; the first one that spans at least one character
// at 'start' defines the field. SpanClass.OTHERS is declared to pick up the remainder, so
// presumably some class always matches and 'start' always advances.
for (SpanClass sc : SpanClass.values()) {
int end = sc.uset.span(reordred, start, SpanCondition.CONTAINED);
if (end != start) {
result.add(reordred.substring(start, end));
start = end;
break;
}
}
}
return ImmutableSet.copyOf(result);
}

/**
* Builds an alert string when the given strings do not all split into the same set of fields
* (as computed by {@link #getFields}).
*
* @param bidiReordereds the strings to compare, for example the results of reordering one
* example under different bidi directions
* @return the empty string if fewer than two distinct field sets are found; otherwise ALERT,
* followed by the escaped fields found only in the first distinct set, then "; ", then the
* escaped fields found only in the second distinct set
*/
public static String getAlert(List<String> bidiReordereds) {
// Collect the distinct field sets, in order of first appearance. Set equality ignores
// order, so strings containing the same fields in a different order count as the same.
Set<Set<String>> results = new LinkedHashSet<>();
for (String bidiReordered : bidiReordereds) {
Set<String> fieldsLTR = BidiUtils.getFields(bidiReordered, new TreeSet<>());
results.add(fieldsLTR);
}
if (results.size() < 2) {
return "";
}
// there can still be differences within a field of OTHERS, that we ignore.
// EG ⚠️ 20,28,2B; 2B,28,20 " (+" vs "+( "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2B,28,20 should be " ( "?


// show just the difference in the first 2, for now.
Iterator<Set<String>> it = results.iterator();
Set<String> first = it.next();
Set<String> second = it.next();
SetView<String> uniqueFirst = Sets.difference(first, second);
SetView<String> uniqueSecond = Sets.difference(second, first);
return ALERT + " " + escape(uniqueFirst) + "; " + escape(uniqueSecond);
}

/**
 * Escapes each string with CodePointEscaper.toEscaped and joins the results with ", ". An LRM
 * is placed at the very start, at the very end, and on both sides of every separator.
 *
 * @param uniqueFirst the strings to escape and join, taken in the set's iteration order
 * @return the joined string; for an empty set this is just two LRMs
 */
public static String escape(Set<String> uniqueFirst) {
    final StringBuilder joined = new StringBuilder(LRM);
    boolean needsSeparator = false;
    for (String item : uniqueFirst) {
        if (needsSeparator) {
            joined.append(LRM).append(", ").append(LRM);
        }
        joined.append(CodePointEscaper.toEscaped(item));
        needsSeparator = true;
    }
    return joined.append(LRM).toString();
}

/**
* Returns a string made of the code points of the input, sorted into ascending code point
* order. Repeated code points are kept.
*
* @param string the source text
* @return a new string with the same code points as {@code string}, in sorted order
*/
public static String alphagram(String string) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alphagram seems unused?

return string.codePoints()
.sorted()
.collect(
StringBuilder::new, // Supplier<R> supplier
StringBuilder::appendCodePoint, // ObjIntConsumer<R> accumulator
StringBuilder::append // BiConsumer<R,​R> combiner
)
.toString();
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.unicode.cldr.util;

import com.ibm.icu.impl.UnicodeMap;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.Locale;

Expand All @@ -19,10 +21,13 @@ public enum CodePointEscaper {
LF(0xA, "line feed"),
CR(0xD, "carriage return"),
SP(0x20, "space", "ASCII space"),
NSP(0x2009, "narrow/thin space", "Also known as ‘thin space’"),
TSP(0x2009, "thin space", "Aka ‘narrow space’"),
NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."),

NNBSP(0x202F, "narrow/thin no-break space", "Same as narrow space, but doesn’t line wrap."),
NBTSP(
0x202F,
"no-break thin space",
"Same as thin space, but doesn’t line wrap. Aka 'narrow no-break space'"),

WNJ(
0x200B,
Expand Down Expand Up @@ -110,9 +115,7 @@ public enum CodePointEscaper {
private final String description;

private CodePointEscaper(int codePoint, String shortName) {
this.codePoint = codePoint;
this.shortName = shortName;
this.description = "";
this(codePoint, shortName, "");
}

private CodePointEscaper(int codePoint, String shortName, String description) {
Expand Down Expand Up @@ -147,6 +150,11 @@ public int getCodePoint() {
return codePoint;
}

/**
 * Returns this constant's character as a string.
 *
 * @return the result of UTF16.valueOf applied to this constant's code point
 */
public String getString() {
    final String asString = UTF16.valueOf(this.codePoint);
    return asString;
}

/** Returns the escaped form from the code point for this enum */
public String codePointToEscaped() {
return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
Expand Down Expand Up @@ -196,6 +204,15 @@ public static String toEscaped(String unescaped, UnicodeSet toEscape) {
});
return result.toString();
}

/**
 * Returns the string for a single code point, in escaped form when the code point is in
 * toEscape and as the plain character otherwise.
 *
 * @param cp the code point to convert
 * @param toEscape the set of code points that must be escaped
 * @return codePointToEscaped(cp) if toEscape contains cp, otherwise the character itself
 */
public static String getEscaped(int cp, UnicodeSet toEscape) {
    return toEscape.contains(cp) ? codePointToEscaped(cp) : UTF16.valueOf(cp);
}

/** Return unescaped string */
public static String toUnescaped(String escaped) {
if (escaped == null) {
Expand Down Expand Up @@ -273,4 +290,54 @@ public static String rawCodePointToEscaped(int codePoint) {
? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT)
: result.toString();
}

/**
* Builds an HTML table (from the opening table tag through the closing one) with one row per
* code point in escapesToShow. Code points that have a CodePointEscaper constant come first, in
* enum declaration order, showing the constant's name, short name and description. All other
* code points follow, showing a hex id from Utility.hex(cp, 2), an empty short name, and the
* lowercased extended character name.
*
* @param escapesToShow the code points to list; must not contain any strings
* @param tableOptions text inserted verbatim directly after "&lt;table"; since nothing is
* inserted in between, a non-empty value needs to start with a space
* @param cellOptions text inserted verbatim directly after "&lt;td" in every cell; a non-empty
* value needs to start with a space
* @return the HTML for the table
* @throws IllegalArgumentException if escapesToShow contains any strings
*/
public static final String getHtmlRows(
UnicodeSet escapesToShow, String tableOptions, String cellOptions) {
if (!escapesToShow.strings().isEmpty()) {
throw new IllegalArgumentException("No strings allowed in the unicode set.");
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe also confirm tableOptions and cellOptions start with spaces if not empty -- either throw IllegalArgumentException or insert the spaces if missing

StringBuilder result = new StringBuilder("<table" + tableOptions + ">");
// Working copy: what is left after the first loop are the code points without a constant.
UnicodeSet remaining = new UnicodeSet(escapesToShow);
String tdPlus = "<td" + cellOptions + ">";
for (CodePointEscaper cpe : CodePointEscaper.values()) {
int cp = cpe.getCodePoint();
remaining.remove(cp);
if (escapesToShow.contains(cpe.getCodePoint())) {
final String id = cpe.name();
final String shortName = cpe.getShortName();
final String description = cpe.getDescription();
addREsult(result, tdPlus, id, shortName, description);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is capital E in addREsult a typo?

}
}
// Rows for the requested code points that have no CodePointEscaper constant.
for (String cps : remaining) {
int cp = cps.codePointAt(0);
final String extendedName = UCharacter.getExtendedName(cp);
addREsult(
result,
tdPlus,
Utility.hex(cp, 2),
"",
extendedName == null ? "" : extendedName.toLowerCase());
}
return result.append("</table>").toString();
}

/**
 * Appends one three-cell HTML table row to result: the id wrapped in ESCAPE_START and
 * ESCAPE_END, then the short name, then the description. The values are appended verbatim.
 *
 * <p>The method name keeps its original capitalization so that existing callers still compile.
 *
 * @param result the builder that receives the row
 * @param tdPlus the complete opening tag to use for each cell, including any attributes
 * @param id the identifier shown in the first cell
 * @param shortName the text of the second cell
 * @param description the text of the third cell
 */
public static void addREsult(
        StringBuilder result,
        String tdPlus,
        final String id,
        final String shortName,
        final String description) {
    result.append("<tr>")
            .append(tdPlus)
            .append(ESCAPE_START)
            .append(id)
            .append(ESCAPE_END + "</td>")
            .append(tdPlus)
            .append(shortName)
            .append("</td>")
            .append(tdPlus)
            .append(description)
            // Close the row; this used to append "<tr>", which opened a new row instead.
            .append("</td></tr>");
}
}
Loading
Loading