Skip to content

Commit

Permalink
OPENNLP-1512 Fix incorrect encoding used in Conll02NameSampleStream
Browse files Browse the repository at this point in the history
  • Loading branch information
mawiesne authored and rzo1 committed Sep 4, 2023
1 parent 886493a commit 8da0a97
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ public void run(String format, String[] args) {
}

try (ObjectStream<T> sampleStream = streamFactory.create(formatArgs)) {
Object sample;
T sample;
while ((sample = sampleStream.read()) != null) {
logger.info(sample.toString());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,20 @@ public Conll02NameSampleStream(LANGUAGE lang, ObjectStream<String> lineStream, i
* @throws IOException Thrown if IO errors occurred.
*/
public Conll02NameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
this (lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8), types);
/*
* NOTE: Keep this encoding here! The original CONLL 2002 data is provided in ISO-8859-1 (ISO_8859_1).
*/
this (lang, new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1), types);
/*
* If the related files are (incorrectly) interpreted as 'UTF-8' without prior conversion of
* the train/test files, characters such as á, é, ñ, ... will be misinterpreted during
* processing and in the resulting output, e.g. as produced via TokenNameFinderConverter.
*
* As a consequence, users of the related tooling (OpenNLP Doc: CONLL 2002) would end up
* with corrupted intermediate files as their out-of-the-box experience.
*
* For details, see: https://issues.apache.org/jira/browse/OPENNLP-1512
*/
}

static Span extract(int begin, int end, String beginTag) throws InvalidFormatException {
Expand Down

0 comments on commit 8da0a97

Please sign in to comment.