Skip to content

Commit

Permalink
OPENNLP-1659 - Enhancements for DownloadUtil
Browse files Browse the repository at this point in the history
- Make the BASE_URL and MODEL_URI_PATH configurable via System properties in order to be backwards compatible if models are removed from the release area of OpenNLP / ASF CDN
- Make the OPENNLP_DOWNLOAD_HOME configurable, i.e. for tests in CI environments. This can also be useful for OpenNLP devs to avoid cleaning models in their user home.
- Replace hard-coded file separators for multi-OS support
- Log a DEBUG message if the model file already exists and the download is skipped.
- Adds a test case to verify that models are not downloaded twice (if they exist) by testing for the related LOG output using LogCaptor.
- Replaces slf4j-simple with logback (from LogCaptor) for further testing of specific output
  • Loading branch information
rzo1 authored and Richard Zowalla committed Nov 28, 2024
1 parent fcf7e3d commit 99e0b5f
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 112 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,6 @@ jobs:
distribution: temurin
java-version: ${{ matrix.java }}
- name: Build with Maven
run: mvn -V clean test install --no-transfer-progress -Pjacoco
run: mvn -V clean test install --no-transfer-progress -Pjacoco -Pci
- name: Jacoco
run: mvn jacoco:report
34 changes: 27 additions & 7 deletions opennlp-tools/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,6 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<scope>test</scope>
</dependency>

<!-- JUnit5 extension used in CLITest to prevent System.exit(..) calls terminating test runs -->
<dependency>
<groupId>com.ginsberg</groupId>
Expand All @@ -71,6 +65,13 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.github.hakky54</groupId>
<artifactId>logcaptor</artifactId>
<version>${logcaptor.version}</version>
<scope>test</scope>
</dependency>

</dependencies>

<build>
Expand Down Expand Up @@ -121,7 +122,7 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<argLine>-Xmx2048m -Dorg.slf4j.simpleLogger.defaultLogLevel=off -javaagent:${settings.localRepository}/com/ginsberg/junit5-system-exit/${junit5-system-exit.version}/junit5-system-exit-${junit5-system-exit.version}.jar</argLine>
<argLine>-Xmx2048m -DOPENNLP_DOWNLOAD_HOME=${opennlp.download.home} -javaagent:${settings.localRepository}/com/ginsberg/junit5-system-exit/${junit5-system-exit.version}/junit5-system-exit-${junit5-system-exit.version}.jar</argLine>
<forkCount>${opennlp.forkCount}</forkCount>
<failIfNoSpecifiedTests>false</failIfNoSpecifiedTests>
<excludes>
Expand All @@ -131,10 +132,29 @@
</excludes>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
<version>${maven.failsafe.plugin}</version>
<configuration>
<argLine>-DOPENNLP_DOWNLOAD_HOME=${opennlp.download.home}</argLine>
</configuration>
</plugin>
</plugins>
</build>

<properties>
<opennlp.download.home>${user.home}</opennlp.download.home>
</properties>

<profiles>
<profile>
<id>ci</id>
<properties>
<opennlp.download.home>${project.build.directory}</opennlp.download.home>
</properties>
</profile>
<profile>
<id>jmh</id>
<dependencies>
Expand Down
48 changes: 31 additions & 17 deletions opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Formatter;
import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -72,18 +73,12 @@ public enum ModelType {
}
}

private static final String BASE_URL = "https://dlcdn.apache.org/opennlp/";
private static final String MODELS_UD_MODELS_1_2 = "models/ud-models-1.2/";
private static final String BASE_URL =
System.getProperty("OPENNLP_DOWNLOAD_BASE_URL", "https://dlcdn.apache.org/opennlp/");
private static final String MODEL_URI_PATH =
System.getProperty("OPENNLP_DOWNLOAD_MODEL_PATH", "models/ud-models-1.2/");

public static final Map<String, Map<ModelType, String>> available_models;

static {
try {
available_models = new DownloadParser(new URL(BASE_URL + MODELS_UD_MODELS_1_2)).getAvailableModels();
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
}
private static Map<String, Map<ModelType, String>> availableModels;

/**
* Triggers a download for the specified {@link DownloadUtil.ModelType}.
Expand All @@ -98,14 +93,14 @@ public enum ModelType {
public static <T extends BaseModel> T downloadModel(String language, ModelType modelType,
Class<T> type) throws IOException {

if (available_models.containsKey(language)) {
final String url = (available_models.get(language).get(modelType));
if (getAvailableModels().containsKey(language)) {
final String url = (getAvailableModels().get(language).get(modelType));
if (url != null) {
return downloadModel(new URL(url), type);
}
}

throw new IOException("Invalid model.");
throw new IOException("There is no model available: " + language + " " + modelType.name);
}

/**
Expand All @@ -124,9 +119,15 @@ public static <T extends BaseModel> T downloadModel(String language, ModelType m
*/
public static <T extends BaseModel> T downloadModel(URL url, Class<T> type) throws IOException {

final Path homeDirectory = Paths.get(System.getProperty("user.home") + "/.opennlp/");
final Path homeDirectory = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
System.getProperty("user.home"))).resolve(".opennlp");

if (!Files.isDirectory(homeDirectory)) {
homeDirectory.toFile().mkdir();
try {
Files.createDirectories(homeDirectory);
} catch (IOException e) {
throw new RuntimeException(e);
}
}

final String filename = url.toString().substring(url.toString().lastIndexOf("/") + 1);
Expand All @@ -141,8 +142,10 @@ public static <T extends BaseModel> T downloadModel(URL url, Class<T> type) thro

validateModel(new URL(url + ".sha512"), localFile);


logger.debug("Download complete.");
} else {
System.out.println("Model file already exists. Skipping download.");
logger.debug("Model file '{}' already exists. Skipping download.", filename);
}

try {
Expand All @@ -152,6 +155,17 @@ public static <T extends BaseModel> T downloadModel(URL url, Class<T> type) thro
}
}

/**
 * Returns the index of downloadable models, loading it on first use.
 * <p>
 * The index is obtained from a {@code DownloadParser} pointed at
 * {@code BASE_URL + MODEL_URI_PATH} and is cached in a static field afterwards.
 * The outer map is keyed by language; each value maps a {@link ModelType} to the
 * URL (as a string) of the corresponding model.
 * <p>
 * The method is {@code synchronized} so that concurrent first callers do not race on the
 * lazy initialization of the cache.
 *
 * @return An unmodifiable view of the language-to-models map. Only the outer map is
 *         wrapped; the nested per-language maps are returned as they were parsed.
 * @throws RuntimeException Thrown if the configured base URL and model path do not
 *         form a valid {@link URL}.
 */
public static synchronized Map<String, Map<ModelType, String>> getAvailableModels() {
  if (availableModels == null) {
    // Both parts may be overridden via system properties, so name the URL if it is broken.
    final String indexUrl = BASE_URL + MODEL_URI_PATH;
    try {
      availableModels = new DownloadParser(new URL(indexUrl)).getAvailableModels();
    } catch (MalformedURLException e) {
      throw new RuntimeException("Invalid model index URL: " + indexUrl, e);
    }
  }
  return Collections.unmodifiableMap(availableModels);
}

/**
* Validates the downloaded model.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.util;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.junit.jupiter.api.BeforeAll;

import opennlp.tools.EnabledWhenCDNAvailable;

import static org.junit.jupiter.api.Assertions.fail;

@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
public abstract class AbstractDownloadUtilTest {

  private static final String APACHE_CDN = "dlcdn.apache.org";

  /**
   * Deletes previously downloaded model files before the tests of a subclass run,
   * provided that the CDN host accepts a connection. When the host cannot be reached,
   * nothing is deleted.
   */
  @BeforeAll
  public static void cleanupWhenOnline() {
    if (!isCdnReachable()) {
      return;
    }
    // CDN is available -> clean up in preparation of the actual tests
    final String[] modelFragments = {"-tokens-", "-sentence-", "-pos-", "-lemma-"};
    for (String modelFragment : modelFragments) {
      wipeExistingModelFiles(modelFragment);
    }
  }

  /*
   * Probes the CDN host on port 80, using EnabledWhenCDNAvailable.TIMEOUT_MS as the
   * connect timeout. Any IOException (unreachable, unresolvable or timeout) counts
   * as "offline".
   */
  private static boolean isCdnReachable() {
    try (Socket probe = new Socket()) {
      probe.connect(new InetSocketAddress(APACHE_CDN, 80), EnabledWhenCDNAvailable.TIMEOUT_MS);
      return true;
    } catch (IOException e) {
      return false;
    }
  }

  /*
   * Helper method that wipes model files if they exist in the test execution environment.
   * The files are removed from the hidden '.opennlp' subdirectory of the directory given
   * by the 'OPENNLP_DOWNLOAD_HOME' system property, falling back to 'user.home'.
   *
   * Only entries whose name matches the glob "*opennlp-*<fragment>*" are deleted, so that
   * a fresh download happens when the tests are executed. An I/O problem fails the test.
   */
  private static void wipeExistingModelFiles(final String fragment) {
    final String downloadHome =
        System.getProperty("OPENNLP_DOWNLOAD_HOME", System.getProperty("user.home"));
    final Path modelDir = Paths.get(downloadHome).resolve(".opennlp");
    if (!Files.exists(modelDir)) {
      return;
    }

    final String glob = "*opennlp-*" + fragment + "*";
    try (DirectoryStream<Path> matches = Files.newDirectoryStream(modelDir, glob)) {
      for (Path match : matches) {
        Files.deleteIfExists(match);
      }
    } catch (IOException e) {
      fail(e.getLocalizedMessage());
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.util;

import java.io.IOException;
import java.util.List;

import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.Logger;
import ch.qos.logback.classic.LoggerContext;
import nl.altindag.log.LogCaptor;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.LoggerFactory;

import opennlp.tools.EnabledWhenCDNAvailable;
import opennlp.tools.sentdetect.SentenceModel;

import static org.junit.jupiter.api.Assertions.assertEquals;

@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
public class DownloadUtilDownloadTwiceTest extends AbstractDownloadUtilTest {

  /*
   * Raises the "opennlp" logger to DEBUG programmatically, so the log messages that
   * confirm no duplicate download is happening become visible to the test.
   */
  @BeforeAll
  public static void prepare() {
    setOpenNlpLogLevel(Level.DEBUG);
  }

  /*
   * Switches the "opennlp" logger to OFF again once all tests have run.
   */
  @AfterAll
  public static void cleanup() {
    setOpenNlpLogLevel(Level.OFF);
  }

  /*
   * Applies the given level to the logback logger named "opennlp".
   */
  private static void setOpenNlpLogLevel(Level level) {
    final LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
    final Logger openNlpLogger = loggerContext.getLogger("opennlp");
    openNlpLogger.setLevel(level);
  }

  /*
   * Requests the same model three times in a row. The first request is expected to emit
   * two debug messages, including "Download complete."; each later request is expected
   * to emit exactly one debug message reporting that the download was skipped.
   */
  @Test
  public void testDownloadModelTwice() throws IOException {
    try (LogCaptor captor = LogCaptor.forClass(DownloadUtil.class)) {

      // first request
      requestGermanSentenceModel();
      assertEquals(2, captor.getDebugLogs().size());
      checkDebugLogsContainMessageFragment(captor.getDebugLogs(), "Download complete.");
      captor.clearLogs();

      // try to download again, twice
      for (int attempt = 0; attempt < 2; attempt++) {
        requestGermanSentenceModel();
        assertEquals(1, captor.getDebugLogs().size());
        checkDebugLogsContainMessageFragment(captor.getDebugLogs(),
            "already exists. Skipping download.");
        captor.clearLogs();
      }
    }
  }

  /*
   * Asks DownloadUtil for the "de" sentence detector model; the returned model is not needed.
   */
  private static void requestGermanSentenceModel() throws IOException {
    DownloadUtil.downloadModel("de",
        DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
  }

  /*
   * Throws an AssertionError unless at least one captured log line contains the given fragment.
   */
  private void checkDebugLogsContainMessageFragment(List<String> debugLogs, String message) {
    final boolean found = debugLogs.stream().anyMatch(entry -> entry.contains(message));
    if (!found) {
      throw new AssertionError("Expected message fragment not found in logs: " + message);
    }
  }

}
Loading

0 comments on commit 99e0b5f

Please sign in to comment.