OPENNLP-1512 Fix incorrect encoding used in Conll02NameSampleStream

apache · Sep 4, 2023 · 8da0a97 · 8da0a97
1 parent 886493a
commit 8da0a97
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 2 deletions.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/AbstractConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/AbstractConverterTool.java
@@ -113,7 +113,7 @@ public void run(String format, String[] args) {
       }
 
       try (ObjectStream<T> sampleStream = streamFactory.create(formatArgs)) {
-        Object sample;
+        T sample;
         while ((sample = sampleStream.read()) != null) {
           logger.info(sample.toString());
         }

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStream.java
@@ -91,7 +91,20 @@ public Conll02NameSampleStream(LANGUAGE lang, ObjectStream<String> lineStream, i
    * @throws IOException Thrown if IO errors occurred.
    */
   public Conll02NameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
-    this (lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8), types);
+    /*
+     * NOTE: KEEP this encoding here! The original CONLL 2002 data is provided as: ISO_8859_1.
+     */
+    this (lang, new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1), types);
+    /*
+     * If related files are (incorrectly) interpreted as 'UTF-8' without prior conversion of
+     * the train/test files, then characters such as á, é, ñ, ... will be misinterpreted during
+     * processing and in the resulting output, e.g. produced via TokenNameFinderConverter.
+     *
+     * As a consequence, users of the related tooling (OpenNLP Doc: CONLL 2002) would suffer
+     * from corrupted intermediate files as an out-of-the-box experience.
+     *
+     * For details, see: https://issues.apache.org/jira/browse/OPENNLP-1512
+     */
   }
 
   static Span extract(int begin, int end, String beginTag) throws InvalidFormatException {