Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrated code lifecycle: Improve reinitialization when pausing build agents #9939

Merged
merged 33 commits into from
Dec 22, 2024
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
12c6041
improve re-init
krusche Nov 28, 2024
790dc64
re-init during pause
krusche Nov 28, 2024
ec2326f
split pause and resume
krusche Nov 28, 2024
bf3b58b
re init dockerClient
BBesrour Nov 28, 2024
5480b76
re init dockerClient
BBesrour Nov 28, 2024
cb409b7
re init dockerClient
BBesrour Nov 28, 2024
0b54c3d
re init dockerClient
BBesrour Nov 29, 2024
0b2ebf4
re init dockerClient
BBesrour Nov 29, 2024
815083a
logging
BBesrour Nov 29, 2024
fb8a1f6
reinit
BBesrour Nov 29, 2024
74d8477
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
BBesrour Dec 3, 2024
c83645a
increase timeout
BBesrour Dec 4, 2024
7fa6a0e
cancel task when timeout
BBesrour Dec 4, 2024
a4d612d
logging
BBesrour Dec 4, 2024
f82a9b3
logging
BBesrour Dec 4, 2024
8816eee
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
BBesrour Dec 5, 2024
c38e88b
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
BBesrour Dec 11, 2024
6b8d2a6
dont cancel future (doesnt do anything)
BBesrour Dec 11, 2024
585d696
parse results before stopping container
BBesrour Dec 11, 2024
5cc18e4
rename methods
BBesrour Dec 11, 2024
e54bebf
format
BBesrour Dec 11, 2024
df6077f
close input stream
BBesrour Dec 11, 2024
20654d7
fix tests
BBesrour Dec 11, 2024
98342c5
fix tests
BBesrour Dec 11, 2024
bbd7591
fix init
BBesrour Dec 11, 2024
58ab1b5
Revert "fix init"
BBesrour Dec 11, 2024
d62194b
Revert "close input stream"
BBesrour Dec 11, 2024
2b6114b
Revert "format"
BBesrour Dec 11, 2024
06ab517
Revert "parse results before stopping container"
BBesrour Dec 11, 2024
3593492
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
BBesrour Dec 16, 2024
4a44991
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
BBesrour Dec 19, 2024
7a8e4bf
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
BBesrour Dec 20, 2024
ff7fd8c
Merge branch 'develop' into bugfix/integrated-code-lifecycle/re-init
az108 Dec 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import static de.tum.cit.aet.artemis.core.config.Constants.PROFILE_BUILDAGENT;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.RejectedExecutionHandler;
Expand All @@ -14,9 +14,11 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Profile;
import org.springframework.context.event.EventListener;

import com.github.dockerjava.api.DockerClient;
import com.github.dockerjava.api.model.HostConfig;
Expand All @@ -40,6 +42,12 @@ public class BuildAgentConfiguration {

private final ProgrammingLanguageConfiguration programmingLanguageConfiguration;

private ThreadPoolExecutor buildExecutor;

private int threadPoolSize = 0;

private DockerClient dockerClient;

private static final Logger log = LoggerFactory.getLogger(BuildAgentConfiguration.class);

@Value("${artemis.continuous-integration.docker-connection-uri}")
Expand All @@ -55,6 +63,24 @@ public BuildAgentConfiguration(ProgrammingLanguageConfiguration programmingLangu
this.programmingLanguageConfiguration = programmingLanguageConfiguration;
}

@EventListener(ApplicationReadyEvent.class)
BBesrour marked this conversation as resolved.
Show resolved Hide resolved
public void onApplicationReady() {
buildExecutor = createBuildExecutor();
dockerClient = createDockerClient();
}

public ThreadPoolExecutor getBuildExecutor() {
return buildExecutor;
}

public int getThreadPoolSize() {
return threadPoolSize;
}

public DockerClient getDockerClient() {
return dockerClient;
}

/**
* Creates a HostConfig object that is used to configure the Docker container for build jobs.
* The configuration is based on the default Docker flags for build jobs as specified in artemis.continuous-integration.build.
Expand Down Expand Up @@ -114,12 +140,11 @@ else if (bytes < 1024 * 1024 * 1024) {
}

/**
* Creates an executor service that manages the queue of build jobs.
* Creates an thread pool executor that manages the queue of build jobs.
*
* @return The executor service bean.
* @return The executor service.
*/
@Bean
public ExecutorService localCIBuildExecutorService() {
private ThreadPoolExecutor createBuildExecutor() {
int threadPoolSize;

if (specifyConcurrentBuilds) {
Expand All @@ -129,6 +154,7 @@ public ExecutorService localCIBuildExecutorService() {
int availableProcessors = Runtime.getRuntime().availableProcessors();
threadPoolSize = Math.max(1, (availableProcessors - 2) / 2);
}
this.threadPoolSize = threadPoolSize;

ThreadFactory customThreadFactory = new ThreadFactoryBuilder().setNameFormat("local-ci-build-%d")
.setUncaughtExceptionHandler((thread, exception) -> log.error("Uncaught exception in thread {}", thread.getName(), exception)).build();
Expand All @@ -144,11 +170,9 @@ public ExecutorService localCIBuildExecutorService() {
/**
* Creates a Docker client that is used to communicate with the Docker daemon.
*
* @return The DockerClient bean.
* @return The DockerClient.
*/
@Bean
// TODO: reconsider if a bean is necessary here, this could also be created after application startup with @EventListener(ApplicationReadyEvent.class) to speed up the startup
public DockerClient dockerClient() {
public DockerClient createDockerClient() {
log.debug("Create bean dockerClient");
DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder().withDockerHost(dockerConnectionUri).build();
DockerHttpClient httpClient = new ZerodepDockerHttpClient.Builder().dockerHost(config.getDockerHost()).sslConfig(config.getSSLConfig()).build();
Expand All @@ -175,4 +199,40 @@ else if (memoryString.endsWith("k\"")) {
return Long.parseLong(memoryString);
}
}

private void shutdownBuildExecutor() {
// Shut down the current executor gracefully
if (buildExecutor != null && !buildExecutor.isShutdown()) {
buildExecutor.shutdown();
try {
buildExecutor.awaitTermination(5, TimeUnit.SECONDS);
}
catch (InterruptedException e) {
log.warn("Executor termination interrupted", e);
}
}
buildExecutor = null;
}
BBesrour marked this conversation as resolved.
Show resolved Hide resolved

private void closeDockerClient() {
if (dockerClient != null) {
try {
dockerClient.close();
}
catch (IOException e) {
log.error("Error closing Docker client", e);
}
dockerClient = null;
}
}

public void closeBuildAgentServices() {
shutdownBuildExecutor();
closeDockerClient();
}

public void openBuildAgentServices() {
this.buildExecutor = createBuildExecutor();
this.dockerClient = createDockerClient();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.map.IMap;

import de.tum.cit.aet.artemis.buildagent.BuildAgentConfiguration;
import de.tum.cit.aet.artemis.buildagent.dto.BuildJobQueueItem;
import de.tum.cit.aet.artemis.core.exception.LocalCIException;
import de.tum.cit.aet.artemis.core.util.TimeLogUtil;
Expand All @@ -53,7 +54,7 @@ public class BuildAgentDockerService {

private static final Logger log = LoggerFactory.getLogger(BuildAgentDockerService.class);

private final DockerClient dockerClient;
private final BuildAgentConfiguration buildAgentConfiguration;

private final HazelcastInstance hazelcastInstance;

Expand Down Expand Up @@ -88,9 +89,9 @@ public class BuildAgentDockerService {
@Value("${artemis.continuous-integration.image-architecture:amd64}")
private String imageArchitecture;

public BuildAgentDockerService(DockerClient dockerClient, @Qualifier("hazelcastInstance") HazelcastInstance hazelcastInstance,
public BuildAgentDockerService(BuildAgentConfiguration buildAgentConfiguration, @Qualifier("hazelcastInstance") HazelcastInstance hazelcastInstance,
BuildJobContainerService buildJobContainerService, @Qualifier("taskScheduler") TaskScheduler taskScheduler) {
this.dockerClient = dockerClient;
this.buildAgentConfiguration = buildAgentConfiguration;
this.hazelcastInstance = hazelcastInstance;
this.buildJobContainerService = buildJobContainerService;
this.taskScheduler = taskScheduler;
Expand Down Expand Up @@ -123,6 +124,7 @@ public void applicationReady() {
public void cleanUpContainers() {
List<Container> danglingBuildContainers;
log.info("Start cleanup dangling build containers");
DockerClient dockerClient = buildAgentConfiguration.getDockerClient();
if (isFirstCleanup) {
// Cleanup all dangling build containers after the application has started
try {
Expand Down Expand Up @@ -213,6 +215,7 @@ public void onComplete() {
* @throws LocalCIException if the image pull is interrupted or fails due to other exceptions.
*/
public void pullDockerImage(BuildJobQueueItem buildJob, BuildLogsMap buildLogsMap) {
DockerClient dockerClient = buildAgentConfiguration.getDockerClient();
final String imageName = buildJob.buildConfig().dockerImage();
try {
// First check if the image is already available
Expand Down Expand Up @@ -319,7 +322,7 @@ public void deleteOldDockerImages() {
if (dockerImageCleanupInfo.get(dockerImage).isBefore(ZonedDateTime.now().minusDays(imageExpiryDays))) {
log.info("Deleting docker image {}", dockerImage);
try {
dockerClient.removeImageCmd(dockerImage).exec();
buildAgentConfiguration.getDockerClient().removeImageCmd(dockerImage).exec();
}
catch (NotFoundException e) {
log.warn("Docker image {} not found during cleanup", dockerImage);
Expand All @@ -342,6 +345,9 @@ public void checkUsableDiskSpaceThenCleanUp() {
if (!imageCleanupEnabled) {
return;
}

DockerClient dockerClient = buildAgentConfiguration.getDockerClient();

try {
// Get the Docker root directory to check disk space.
File dockerRootDirectory = new File(Objects.requireNonNullElse(dockerClient.infoCmd().exec().getDockerRootDir(), "/"));
Expand Down Expand Up @@ -399,6 +405,8 @@ public void checkUsableDiskSpaceThenCleanUp() {
* @return a set of image names that are not associated with any running containers.
*/
private Set<String> getUnusedDockerImages() {
DockerClient dockerClient = buildAgentConfiguration.getDockerClient();

// Get list of all running containers
List<Container> containers = dockerClient.listContainersCmd().exec();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import com.github.dockerjava.api.model.Frame;
import com.github.dockerjava.api.model.HostConfig;

import de.tum.cit.aet.artemis.buildagent.BuildAgentConfiguration;
import de.tum.cit.aet.artemis.buildagent.dto.BuildLogDTO;
import de.tum.cit.aet.artemis.core.exception.LocalCIException;
import de.tum.cit.aet.artemis.programming.domain.ProgrammingLanguage;
Expand All @@ -58,7 +59,7 @@ public class BuildJobContainerService {

private static final Logger log = LoggerFactory.getLogger(BuildJobContainerService.class);

private final DockerClient dockerClient;
private final BuildAgentConfiguration buildAgentConfiguration;

private final HostConfig hostConfig;

Expand All @@ -76,8 +77,8 @@ public class BuildJobContainerService {
@Value("${artemis.continuous-integration.proxies.default.no-proxy:}")
private String noProxy;

public BuildJobContainerService(DockerClient dockerClient, HostConfig hostConfig, BuildLogsMap buildLogsMap) {
this.dockerClient = dockerClient;
public BuildJobContainerService(BuildAgentConfiguration buildAgentConfiguration, HostConfig hostConfig, BuildLogsMap buildLogsMap) {
this.buildAgentConfiguration = buildAgentConfiguration;
this.hostConfig = hostConfig;
this.buildLogsMap = buildLogsMap;
}
Expand All @@ -102,7 +103,7 @@ public CreateContainerResponse configureContainer(String containerName, String i
if (exerciseEnvVars != null && !exerciseEnvVars.isEmpty()) {
envVars.addAll(exerciseEnvVars);
}
return dockerClient.createContainerCmd(image).withName(containerName).withHostConfig(hostConfig).withEnv(envVars)
return buildAgentConfiguration.getDockerClient().createContainerCmd(image).withName(containerName).withHostConfig(hostConfig).withEnv(envVars)
// Command to run when the container starts. This is the command that will be executed in the container's main process, which runs in the foreground and blocks the
// container from exiting until it finishes.
// It waits until the script that is running the tests (see below execCreateCmdResponse) is completed, and until the result files are extracted which is indicated
Expand All @@ -119,7 +120,7 @@ public CreateContainerResponse configureContainer(String containerName, String i
* @param containerId the ID of the container to be started
*/
public void startContainer(String containerId) {
dockerClient.startContainerCmd(containerId).exec();
buildAgentConfiguration.getDockerClient().startContainerCmd(containerId).exec();
}

/**
Expand All @@ -133,7 +134,7 @@ public void runScriptInContainer(String containerId, String buildJobId, boolean
if (isNetworkDisabled) {
log.info("disconnecting container with id {} from network", containerId);
try {
dockerClient.disconnectFromNetworkCmd().withContainerId(containerId).withNetworkId("bridge").exec();
buildAgentConfiguration.getDockerClient().disconnectFromNetworkCmd().withContainerId(containerId).withNetworkId("bridge").exec();
}
catch (Exception e) {
log.error("Failed to disconnect container with id {} from network: {}", containerId, e.getMessage());
Expand Down Expand Up @@ -176,7 +177,7 @@ public void moveResultsToSpecifiedDirectory(String containerId, List<String> sou
* @return a {@link TarArchiveInputStream} that can be used to read the archive.
*/
public TarArchiveInputStream getArchiveFromContainer(String containerId, String path) throws NotFoundException {
return new TarArchiveInputStream(dockerClient.copyArchiveFromContainerCmd(containerId, path).exec());
return new TarArchiveInputStream(buildAgentConfiguration.getDockerClient().copyArchiveFromContainerCmd(containerId, path).exec());
}

/**
Expand Down Expand Up @@ -222,43 +223,44 @@ public void stopContainer(String containerName) {
* @param containerId The ID of the container to stop or kill.
*/
public void stopUnresponsiveContainer(String containerId) {
ExecutorService executor = Executors.newSingleThreadExecutor();
try {
// Attempt to stop the container. It should stop the container and auto-remove it.
// {@link DockerClient#stopContainerCmd(String)} first sends a SIGTERM command to the container to gracefully stop it,
// and if it does not stop within the timeout, it sends a SIGKILL command to kill the container.
log.info("Stopping container with id {}", containerId);

// Submit Docker stop command to executor service
Future<Void> future = executor.submit(() -> {
dockerClient.stopContainerCmd(containerId).withTimeout(5).exec();
return null; // Return type to match Future<Void>
});

// Await the future with a timeout
future.get(10, TimeUnit.SECONDS); // Wait for the stop command to complete with a timeout
}
catch (NotFoundException | NotModifiedException e) {
log.warn("Container with id {} is already stopped.", containerId, e);
}
catch (Exception e) {
log.error("Failed to stop container with id {}. Attempting to kill container.", containerId, e);

// Attempt to kill the container if stop fails
try (ExecutorService executor = Executors.newSingleThreadExecutor()) {
try {
Future<Void> killFuture = executor.submit(() -> {
dockerClient.killContainerCmd(containerId).exec();
return null;
// Attempt to stop the container. It should stop the container and auto-remove it.
// {@link DockerClient#stopContainerCmd(String)} first sends a SIGTERM command to the container to gracefully stop it,
// and if it does not stop within the timeout, it sends a SIGKILL command to kill the container.
log.info("Stopping container with id {}", containerId);

// Submit Docker stop command to executor service
Future<Void> future = executor.submit(() -> {
buildAgentConfiguration.getDockerClient().stopContainerCmd(containerId).withTimeout(15).exec();
return null; // Return type to match Future<Void>
});

killFuture.get(5, TimeUnit.SECONDS); // Wait for the kill command to complete with a timeout
// Await the future with a timeout
future.get(20, TimeUnit.SECONDS); // Wait for the stop command to complete with a timeout
}
catch (Exception killException) {
log.error("Failed to kill container with id {}.", containerId, killException);
catch (NotFoundException | NotModifiedException e) {
log.warn("Container with id {} is already stopped.", containerId, e);
}
catch (Exception e) {
log.error("Failed to stop container with id {}. Attempting to kill container.", containerId, e);

// Attempt to kill the container if stop fails
try {
Future<Void> killFuture = executor.submit(() -> {
buildAgentConfiguration.getDockerClient().killContainerCmd(containerId).exec();
return null;
});

killFuture.get(10, TimeUnit.SECONDS); // Wait for the kill command to complete with a timeout
}
catch (Exception killException) {
log.error("Failed to kill container with id {}.", containerId, killException);
}
}
finally {
executor.shutdown();
}
}
finally {
executor.shutdown();
}
}

Expand Down Expand Up @@ -356,7 +358,7 @@ private void addDirectory(String containerId, String directoryName, boolean crea

private void copyToContainer(String sourcePath, String containerId) {
try (InputStream uploadStream = new ByteArrayInputStream(createTarArchive(sourcePath).toByteArray())) {
dockerClient.copyArchiveToContainerCmd(containerId).withRemotePath(LOCALCI_WORKING_DIRECTORY).withTarInputStream(uploadStream).exec();
buildAgentConfiguration.getDockerClient().copyArchiveToContainerCmd(containerId).withRemotePath(LOCALCI_WORKING_DIRECTORY).withTarInputStream(uploadStream).exec();
}
catch (IOException e) {
throw new LocalCIException("Could not copy to container " + containerId, e);
Expand Down Expand Up @@ -408,11 +410,13 @@ private void addFileToTar(TarArchiveOutputStream tarArchiveOutputStream, File fi
}

private void executeDockerCommandWithoutAwaitingResponse(String containerId, String... command) {
DockerClient dockerClient = buildAgentConfiguration.getDockerClient();
ExecCreateCmdResponse createCmdResponse = dockerClient.execCreateCmd(containerId).withCmd(command).exec();
dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(true).exec(new ResultCallback.Adapter<>());
}

private void executeDockerCommand(String containerId, String buildJobId, boolean attachStdout, boolean attachStderr, boolean forceRoot, String... command) {
DockerClient dockerClient = buildAgentConfiguration.getDockerClient();
boolean detach = !attachStdout && !attachStderr;

ExecCreateCmd execCreateCmd = dockerClient.execCreateCmd(containerId).withAttachStdout(attachStdout).withAttachStderr(attachStderr).withCmd(command);
Expand Down Expand Up @@ -458,7 +462,7 @@ private void checkPath(String path) {
}

private Container getContainerForName(String containerName) {
List<Container> containers = dockerClient.listContainersCmd().withShowAll(true).exec();
List<Container> containers = buildAgentConfiguration.getDockerClient().listContainersCmd().withShowAll(true).exec();
return containers.stream().filter(container -> container.getNames()[0].equals("/" + containerName)).findFirst().orElse(null);
}
}
Loading
Loading