OPENNLP-1650 Update DownloadUtil to use Models release 1.2
- adapts DownloadUtil, related classes, and tests to the Models 1.2 release
- updates index.html in opennlp/tools/util to the latest Models 1.2 data for DownloadParserTest
- introduces DownloadUtil.ModelType#LEMMATIZER, as lemmatizer models are now available (a usage sketch follows the change summary below)
- adds LemmatizerModelLoaderIT
- extracts some copy-and-pasted strings into constants
- fixes broken JavaDoc in PerceptronTrainer along the way
mawiesne authored and rzo1 committed Nov 25, 2024
1 parent 7b38536 commit a238e18
Showing 12 changed files with 676 additions and 400 deletions.
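
As a quick orientation before the diffs, here is a hedged sketch of what the new lemmatizer support enables. It mirrors the calls used by the integration tests below and assumes, as the generic signature suggests, that downloadModel returns the downloaded model instance:

import java.io.IOException;

import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.DownloadUtil;

public class LemmatizerDownloadSketch {
    public static void main(String[] args) throws IOException {
        // Fetches the French UD lemmatizer model from the Models 1.2 release
        // (cached under ~/.opennlp/), just as LemmatizerModelLoaderIT does per language.
        LemmatizerModel model = DownloadUtil.downloadModel(
                "fr", DownloadUtil.ModelType.LEMMATIZER, LemmatizerModel.class);
        System.out.println("Lemmatizer model language: " + model.getLanguage());
    }
}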
PerceptronTrainer.java
@@ -108,7 +108,7 @@ public PerceptronTrainer(TrainingParameters parameters) {
* {@inheritDoc}
*
* @throws IllegalArgumentException Thrown if the algorithm name is not equal to
* {{@link #PERCEPTRON_VALUE}}.
* {@link #PERCEPTRON_VALUE}.
*/
@Override
public void validate() {
@@ -215,7 +215,7 @@ public void setSkippedAveraging(boolean averaging) {
*
* @param iterations The number of iterations to use for training.
* @param di The {@link DataIndexer} used as data input.
* @param cutoff The {{@link #CUTOFF_PARAM}} value to use for training.
* @param cutoff The {@link TrainingParameters#CUTOFF_PARAM} value to use for training.
*
* @return A valid, trained {@link AbstractModel perceptron model}.
*/
@@ -228,9 +228,9 @@ public AbstractModel trainModel(int iterations, DataIndexer di, int cutoff) {
*
* @param iterations The number of iterations to use for training.
* @param di The {@link DataIndexer} used as data input.
* @param cutoff The {{@link #CUTOFF_PARAM}} value to use for training.
* @param cutoff The {@link TrainingParameters#CUTOFF_PARAM} value to use for training.
* @param useAverage Whether to use 'averaging', or not.
* See {{@link #setSkippedAveraging(boolean)}} for details.
* See {@link #setSkippedAveraging(boolean)} for details.
*
* @return A valid, trained {@link AbstractModel perceptron model}.
*/
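For context, a minimal sketch exercising the constructor and the trainModel overloads whose JavaDoc is corrected above. The DataIndexer is assumed to be prepared by the caller, and the iteration/cutoff values are illustrative defaults, not prescribed settings:

import opennlp.tools.ml.model.AbstractModel;
import opennlp.tools.ml.model.DataIndexer;
import opennlp.tools.ml.perceptron.PerceptronTrainer;
import opennlp.tools.util.TrainingParameters;

final class PerceptronTrainingSketch {
    // Trains a perceptron model from an already-indexed data set.
    static AbstractModel train(DataIndexer di) {
        TrainingParameters params = TrainingParameters.defaultParams();
        params.put(TrainingParameters.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
        PerceptronTrainer trainer = new PerceptronTrainer(params);
        trainer.validate(); // throws IllegalArgumentException unless the algorithm is PERCEPTRON_VALUE
        // The last argument toggles 'averaging'; see setSkippedAveraging(boolean).
        return trainer.trainModel(100, di, 5, true);
    }
}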
25 changes: 23 additions & 2 deletions opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -57,6 +57,7 @@ public class DownloadUtil {
* The type of model.
*/
public enum ModelType {
LEMMATIZER("lemma"),
TOKENIZER("token"),
SENTENCE_DETECTOR("sent"),
POS("pos-perceptron"),
@@ -72,13 +73,13 @@ public enum ModelType {
}

private static final String BASE_URL = "https://dlcdn.apache.org/opennlp/";
private static final String MODELS_UD_MODELS_1_1 = "models/ud-models-1.1/";
private static final String MODELS_UD_MODELS_1_2 = "models/ud-models-1.2/";

public static final Map<String, Map<ModelType, String>> available_models;

static {
try {
available_models = new DownloadParser(new URL(BASE_URL + MODELS_UD_MODELS_1_1)).getAvailableModels();
available_models = new DownloadParser(new URL(BASE_URL + MODELS_UD_MODELS_1_2)).getAvailableModels();
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
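
The parsed index can then be queried per language code; a small hedged illustration of a lookup against the available_models map declared above (the lemmatizer type is used as the example):

import java.util.Map;

import opennlp.tools.util.DownloadUtil;

final class AvailableModelsSketch {
    // Resolves the lemmatizer download URL for a language code, or null if none is listed.
    static String lemmatizerUrl(String lang) {
        Map<DownloadUtil.ModelType, String> models = DownloadUtil.available_models.get(lang);
        return models == null ? null : models.get(DownloadUtil.ModelType.LEMMATIZER);
    }
}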
@@ -240,18 +241,34 @@ private Map<String, Map<ModelType, String>> toMap(List<String> links) {
addModel("fr", link, result);
} else if (link.contains("bg-ud")) { // Bulgarian
addModel("bg", link, result);
} else if (link.contains("ca-ud")) { // Catalan
addModel("ca", link, result);
} else if (link.contains("cs-ud")) { // Czech
addModel("cs", link, result);
} else if (link.contains("hr-ud")) { // Croatian
addModel("hr", link, result);
} else if (link.contains("da-ud")) { // Danish
addModel("da", link, result);
} else if (link.contains("el-ud")) { // Greek
addModel("el", link, result);
} else if (link.contains("es-ud")) { // Spanish
addModel("es", link, result);
} else if (link.contains("et-ud")) { // Estonian
addModel("et", link, result);
} else if (link.contains("eu-ud")) { // Basque
addModel("eu", link, result);
} else if (link.contains("fi-ud")) { // Finnish
addModel("fi", link, result);
} else if (link.contains("hy-ud")) { // Armenian
addModel("hy", link, result);
} else if (link.contains("is-ud")) { // Icelandic
addModel("is", link, result);
} else if (link.contains("ka-ud")) { // Georgian
addModel("ka", link, result);
} else if (link.contains("kk-ud")) { // Kazakh
addModel("kk", link, result);
} else if (link.contains("ko-ud")) { // Korean
addModel("ko", link, result);
} else if (link.contains("lv-ud")) { // Latvian
addModel("lv", link, result);
} else if (link.contains("no-ud")) { // Norwegian
@@ -272,6 +289,8 @@ private Map<String, Map<ModelType, String>> toMap(List<String> links) {
addModel("sl", link, result);
} else if (link.contains("sv-ud")) { // Swedish
addModel("sv", link, result);
} else if (link.contains("tr-ud")) { // Turkish
addModel("tr", link, result);
} else if (link.contains("uk-ud")) { // Ukrainian
addModel("uk", link, result);
}
@@ -288,6 +307,8 @@ private void addModel(String locale, String link, Map<String, Map<ModelType, Str
models.put(ModelType.SENTENCE_DETECTOR, url);
} else if (link.contains("tokens")) {
models.put(ModelType.TOKENIZER, url);
} else if (link.contains("lemma")) {
models.put(ModelType.LEMMATIZER, url);
} else if (link.contains("pos")) {
models.put(ModelType.POS, url);
}
AbstractModelLoaderTest.java
@@ -25,6 +25,7 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,6 +37,13 @@ public abstract class AbstractModelLoaderTest {
private static final String BASE_URL_MODELS_V15 = "https://opennlp.sourceforge.net/models-1.5/";
private static final String BASE_URL_MODELS_V183 = "https://dlcdn.apache.org/opennlp/models/langdetect/1.8.3/";
protected static final Path OPENNLP_DIR = Paths.get(System.getProperty("user.home") + "/.opennlp/");
protected static final String VER = "1.2-2.5.0";
protected static final String BIN = ".bin";
protected static final List<String> SUPPORTED_LANG_CODES = List.of(
"en", "fr", "de", "it", "nl", "bg", "ca", "cs", "da", "el",
"es", "et", "eu", "fi", "hr", "hy", "is", "ka", "kk", "ko",
"lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv",
"tr", "uk");

protected static void downloadVersion15Model(String modelName) throws IOException {
downloadModel(new URL(BASE_URL_MODELS_V15 + modelName));
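For orientation, the new VER and BIN constants combine with the language/treebank identifiers used by the ITs below into the Models 1.2 artifact names loaded from ~/.opennlp, for example:

"opennlp-" + "en-ud-ewt" + "-sentence-" + VER + BIN  ->  opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin
"opennlp-" + "en-ud-ewt" + "-tokens-"   + VER + BIN  ->  opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin
"opennlp-" + "en-ud-ewt" + "-lemmas-"   + VER + BIN  ->  opennlp-en-ud-ewt-lemmas-1.2-2.5.0.bin
"opennlp-" + "en-ud-ewt" + "-pos-"      + VER + BIN  ->  opennlp-en-ud-ewt-pos-1.2-2.5.0.bin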
LemmatizerModelLoaderIT.java (new file)
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.cmdline.lemmatizer;

import java.io.IOException;
import java.nio.file.Files;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import opennlp.tools.AbstractModelLoaderTest;
import opennlp.tools.EnabledWhenCDNAvailable;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.DownloadUtil;

@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
public class LemmatizerModelLoaderIT extends AbstractModelLoaderTest {

// SUT
private LemmatizerModelLoader loader;

@BeforeAll
public static void initResources() {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.LEMMATIZER, LemmatizerModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
}

@BeforeEach
public void setup() {
loader = new LemmatizerModelLoader();
}

@ParameterizedTest(name = "Verify \"{0}\" lemmatizer model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-lemmas-" + VER + BIN;
LemmatizerModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
}
}
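
Once such a model is on disk it can be used for lemmatization. A hedged sketch, assuming the UD models expect universal POS tags as the second input; the tokens and tags here are made-up examples:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;

public class LemmatizeSketch {
    public static void main(String[] args) throws IOException {
        // Loads the English UD EWT lemmatizer model downloaded to ~/.opennlp by DownloadUtil.
        Path modelPath = Path.of(System.getProperty("user.home"),
                ".opennlp", "opennlp-en-ud-ewt-lemmas-1.2-2.5.0.bin");
        LemmatizerModel model = new LemmatizerModel(Files.newInputStream(modelPath));
        LemmatizerME lemmatizer = new LemmatizerME(model);
        String[] tokens = {"The", "dogs", "were", "barking"};
        String[] tags = {"DET", "NOUN", "AUX", "VERB"};
        String[] lemmas = lemmatizer.lemmatize(tokens, tags);
        System.out.println(String.join(" ", lemmas));
    }
}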
POSModelLoaderIT.java
@@ -19,7 +19,6 @@

import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@@ -40,12 +39,9 @@ public class POSModelLoaderIT extends AbstractModelLoaderTest {

@BeforeAll
public static void initResources() {
List<String> langs = List.of("en", "fr", "de", "it", "nl", "bg", "cs", "da",
"es", "et", "fi", "hr", "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "uk");
langs.forEach(lang -> {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang,
DownloadUtil.ModelType.POS, POSModel.class);
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.POS, POSModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -59,11 +55,12 @@ public void setup() {

@ParameterizedTest(name = "Verify \"{0}\" POS model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "cs-ud-pdt", "da-ud-ddt", "es-ud-gsd", "et-ud-edt", "fi-ud-tdt", "hr-ud-set",
"lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "uk-ud-iu"})
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-pos-1.1-2.4.0.bin";
String modelName = "opennlp-" + langModel + "-pos-" + VER + BIN;
POSModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
SentenceModelLoaderIT.java
@@ -19,7 +19,6 @@

import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@@ -40,12 +39,9 @@ public class SentenceModelLoaderIT extends AbstractModelLoaderTest {

@BeforeAll
public static void initResources() {
List<String> langs = List.of("en", "fr", "de", "it", "nl", "bg", "cs", "da",
"es", "et", "fi", "hr", "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "uk");
langs.forEach(lang -> {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang,
DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -59,11 +55,12 @@ public void setup() {

@ParameterizedTest(name = "Verify \"{0}\" sentence model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "cs-ud-pdt", "da-ud-ddt", "es-ud-gsd", "et-ud-edt", "fi-ud-tdt", "hr-ud-set",
"lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "uk-ud-iu"})
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-sentence-1.1-2.4.0.bin";
String modelName = "opennlp-" + langModel + "-sentence-" + VER + BIN;
SentenceModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
TokenizerModelLoaderIT.java
@@ -19,7 +19,6 @@

import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@@ -40,12 +39,9 @@ public class TokenizerModelLoaderIT extends AbstractModelLoaderTest {

@BeforeAll
public static void initResources() {
List<String> langs = List.of("en", "fr", "de", "it", "nl", "bg", "cs", "da",
"es", "et", "fi", "hr", "lv", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "uk");
langs.forEach(lang -> {
SUPPORTED_LANG_CODES.forEach(lang -> {
try {
DownloadUtil.downloadModel(lang,
DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class);
DownloadUtil.downloadModel(lang, DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -59,11 +55,12 @@ public void setup() {

@ParameterizedTest(name = "Verify \"{0}\" tokenizer model loading")
@ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
"bg-ud-btb", "cs-ud-pdt", "da-ud-ddt", "es-ud-gsd", "et-ud-edt", "fi-ud-tdt", "hr-ud-set",
"lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "uk-ud-iu"})
"bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
"eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
"ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
"sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
public void testLoadModelByLanguage(String langModel) throws IOException {
String modelName = "opennlp-" + langModel + "-tokens-1.1-2.4.0.bin";
String modelName = "opennlp-" + langModel + "-tokens-" + VER + BIN;
TokenizerModel model = loader.loadModel(Files.newInputStream(OPENNLP_DIR.resolve(modelName)));
Assertions.assertNotNull(model);
Assertions.assertTrue(model.isLoadedFromSerialized());
(Diffs for the remaining changed files are not shown.)
