Skip to content

Commit

Permalink
Deleting code points encoded as 2 characters in UTF-16
Browse files Browse the repository at this point in the history
  • Loading branch information
reboutli-crim committed Oct 31, 2017
1 parent 0b93671 commit f6df6c0
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

<groupId>ca.crim.nlp</groupId>
<artifactId>crim-heideltime</artifactId>
<version>3.0.7-SNAPSHOT</version>
<version>3.0.8-SNAPSHOT</version>

<name>HeidelTime</name>
<description> This version of HeidelTime extends the well-known multilingual cross-domain temporal tagger (com.github.heideltime) that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.</description>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -844,7 +844,25 @@ public static void main(String[] args) {
}

// Constructing the String twice (decode, re-encode as UTF-8, decode again) should not be necessary, but without it the tool does not run on Windows (?)
String input = new String(new String(inArr, encodingType).getBytes("UTF-8"), "UTF-8");
String input_raw = new String(new String(inArr, encodingType).getBytes("UTF-8"), "UTF-8");
String input ;

// Replace each code point encoded as two 16-bit code units (a UTF-16 surrogate pair) with a single '?', because such code points were otherwise counted as 2 characters
if (outputType.equals(OutputType.JSON)) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i< input_raw.length(); ++i) {
if (Character.isHighSurrogate(input_raw.charAt(i))) {
sb.append('?');
}
else if (Character.isLowSurrogate(input_raw.charAt(i)))
continue;
else
sb.append(input_raw.charAt(i));
}
input = sb.toString();
}
else
input = input_raw;

HeidelTimeStandalone standalone = new HeidelTimeStandalone(language, type, outputType, null, posTagger, doIntervalTagging);
String out = standalone.process(input, dct, pos_file, sentence_file);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public String format(JCas jcas) throws Exception {
try {
l.log(Level.FINEST, "Preparing to serialize the results in JSON");
JsonCasSerializer serializer = new JsonCasSerializer();

serializer.setPrettyPrint(true);
serializer.serialize(jcas.getCas(), output);
outText = output.toString();
l.log(Level.FINEST, "JSON-serialization finished.");
Expand Down

0 comments on commit f6df6c0

Please sign in to comment.