From 4d055b85ab48dcb52ec48477fad44ccf06ce2194 Mon Sep 17 00:00:00 2001 From: Gaurav Bafna <85113518+gbbafna@users.noreply.github.com> Date: Mon, 5 Feb 2024 12:18:28 +0530 Subject: [PATCH] [Remote Store] Waiting for remote store upload in snapshot/local recovery (#11720) * Giving time for snapshot recovery/local time to upload all the data to remote Signed-off-by: Gaurav Bafna --- .../indices/create/RemoteCloneIndexIT.java | 59 +++++++ .../RemoteStoreBaseIntegTestCase.java | 21 ++- .../remotestore/RemoteStoreRestoreIT.java | 19 ++- .../snapshots/SystemRepositoryIT.java | 11 +- .../common/settings/ClusterSettings.java | 1 + .../opensearch/index/shard/IndexShard.java | 28 ++- .../shard/RemoteStoreRefreshListener.java | 22 ++- .../opensearch/index/shard/StoreRecovery.java | 38 ++++- .../indices/recovery/RecoverySettings.java | 21 +++ .../repositories/RepositoriesModule.java | 6 + .../fs/ReloadableFsRepository.java | 161 +++++++++++++++++- .../RemoteStoreRefreshListenerTests.java | 42 +++++ 12 files changed, 404 insertions(+), 25 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java index a081110e6c5a1..f50e8fd0a38cf 100644 --- a/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/action/admin/indices/create/RemoteCloneIndexIT.java @@ -50,6 +50,8 @@ import org.opensearch.remotestore.RemoteStoreBaseIntegTestCase; import org.opensearch.test.VersionUtils; +import java.util.concurrent.ExecutionException; + import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertHitCount; import static org.hamcrest.Matchers.equalTo; @@ -130,4 +132,61 @@ public void testCreateCloneIndex() { } + public void testCreateCloneIndexFailure() throws ExecutionException, InterruptedException { + Version version = VersionUtils.randomIndexCompatibleVersion(random()); + int numPrimaryShards = 1; + prepareCreate("source").setSettings( + Settings.builder().put(indexSettings()).put("number_of_shards", numPrimaryShards).put("index.version.created", version) + ).get(); + final int docs = 2; + for (int i = 0; i < docs; i++) { + client().prepareIndex("source").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", MediaTypeRegistry.JSON).get(); + } + internalCluster().ensureAtLeastNumDataNodes(2); + // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node + // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due + // to the require._name below. + ensureGreen(); + // relocate all shards to one node such that we can merge it. + client().admin().indices().prepareUpdateSettings("source").setSettings(Settings.builder().put("index.blocks.write", true)).get(); + ensureGreen(); + + // disable rebalancing to be able to capture the right stats. balancing can move the target primary + // making it hard to pin point the source shards. + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none")) + .get(); + try { + setFailRate(REPOSITORY_NAME, 100); + + client().admin() + .indices() + .prepareResizeIndex("source", "target") + .setResizeType(ResizeType.CLONE) + .setWaitForActiveShards(0) + .setSettings(Settings.builder().put("index.number_of_replicas", 0).putNull("index.blocks.write").build()) + .get(); + + Thread.sleep(2000); + ensureYellow("target"); + + } catch (ExecutionException | InterruptedException e) { + throw new RuntimeException(e); + } finally { + setFailRate(REPOSITORY_NAME, 0); + ensureGreen(); + // clean up + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings( + Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), (String) null) + ) + .get(); + } + + } + } diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java index 751de66a97806..e43ff9a412784 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java @@ -9,6 +9,8 @@ package org.opensearch.remotestore; import org.opensearch.action.admin.cluster.remotestore.restore.RestoreRemoteStoreRequest; +import org.opensearch.action.admin.cluster.repositories.get.GetRepositoriesRequest; +import org.opensearch.action.admin.cluster.repositories.get.GetRepositoriesResponse; import org.opensearch.action.admin.indices.get.GetIndexRequest; import org.opensearch.action.admin.indices.get.GetIndexResponse; import org.opensearch.action.bulk.BulkItemResponse; @@ -37,7 +39,7 @@ import org.opensearch.indices.replication.common.ReplicationType; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.blobstore.BlobStoreRepository; -import org.opensearch.repositories.fs.FsRepository; +import org.opensearch.repositories.fs.ReloadableFsRepository; import org.opensearch.test.OpenSearchIntegTestCase; import org.junit.After; @@ -60,6 +62,7 @@ import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_REPOSITORY_TYPE_ATTRIBUTE_KEY_FORMAT; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_SEGMENT_REPOSITORY_NAME_ATTRIBUTE_KEY; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_TRANSLOG_REPOSITORY_NAME_ATTRIBUTE_KEY; +import static org.opensearch.repositories.fs.ReloadableFsRepository.REPOSITORIES_FAILRATE_SETTING; import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; public class RemoteStoreBaseIntegTestCase extends OpenSearchIntegTestCase { @@ -146,6 +149,18 @@ protected Settings nodeSettings(int nodeOrdinal) { } } + protected void setFailRate(String repoName, int value) throws ExecutionException, InterruptedException { + GetRepositoriesRequest gr = new GetRepositoriesRequest(new String[] { repoName }); + GetRepositoriesResponse res = client().admin().cluster().getRepositories(gr).get(); + RepositoryMetadata rmd = res.repositories().get(0); + Settings.Builder settings = Settings.builder() + .put("location", rmd.settings().get("location")) + .put(REPOSITORIES_FAILRATE_SETTING.getKey(), value); + assertAcked( + client().admin().cluster().preparePutRepository(repoName).setType(ReloadableFsRepository.TYPE).setSettings(settings).get() + ); + } + public Settings indexSettings() { return defaultIndexSettings(); } @@ -224,10 +239,10 @@ public static Settings buildRemoteStoreNodeAttributes( return buildRemoteStoreNodeAttributes( segmentRepoName, segmentRepoPath, - FsRepository.TYPE, + ReloadableFsRepository.TYPE, translogRepoName, translogRepoPath, - FsRepository.TYPE, + ReloadableFsRepository.TYPE, withRateLimiterAttributes ); } diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreRestoreIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreRestoreIT.java index dfa5528eafcf2..94acf2b1dbb27 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreRestoreIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreRestoreIT.java @@ -24,6 +24,7 @@ import org.opensearch.indices.IndicesService; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; +import org.opensearch.repositories.fs.ReloadableFsRepository; import org.opensearch.test.InternalTestCluster; import org.opensearch.test.OpenSearchIntegTestCase; @@ -479,7 +480,14 @@ public void testRateLimitedRemoteDownloads() throws Exception { settingsMap.entrySet().forEach(entry -> settings.put(entry.getKey(), entry.getValue())); settings.put("location", segmentRepoPath).put("max_remote_download_bytes_per_sec", 4, ByteSizeUnit.KB); - assertAcked(client().admin().cluster().preparePutRepository(REPOSITORY_NAME).setType("fs").setSettings(settings).get()); + assertAcked( + client().admin() + .cluster() + .preparePutRepository(REPOSITORY_NAME) + .setType(ReloadableFsRepository.TYPE) + .setSettings(settings) + .get() + ); for (RepositoriesService repositoriesService : internalCluster().getDataNodeInstances(RepositoriesService.class)) { Repository segmentRepo = repositoriesService.repository(REPOSITORY_NAME); @@ -508,7 +516,14 @@ public void testRateLimitedRemoteDownloads() throws Exception { // revert repo metadata to pass asserts on repo metadata vs. node attrs during teardown // https://github.com/opensearch-project/OpenSearch/pull/9569#discussion_r1345668700 settings.remove("max_remote_download_bytes_per_sec"); - assertAcked(client().admin().cluster().preparePutRepository(REPOSITORY_NAME).setType("fs").setSettings(settings).get()); + assertAcked( + client().admin() + .cluster() + .preparePutRepository(REPOSITORY_NAME) + .setType(ReloadableFsRepository.TYPE) + .setSettings(settings) + .get() + ); for (RepositoriesService repositoriesService : internalCluster().getDataNodeInstances(RepositoriesService.class)) { Repository segmentRepo = repositoriesService.repository(REPOSITORY_NAME); assertNull(segmentRepo.getMetadata().settings().get("max_remote_download_bytes_per_sec")); diff --git a/server/src/internalClusterTest/java/org/opensearch/snapshots/SystemRepositoryIT.java b/server/src/internalClusterTest/java/org/opensearch/snapshots/SystemRepositoryIT.java index f50fc691fb232..28b84655a2cc7 100644 --- a/server/src/internalClusterTest/java/org/opensearch/snapshots/SystemRepositoryIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/snapshots/SystemRepositoryIT.java @@ -12,7 +12,7 @@ import org.opensearch.common.settings.Settings; import org.opensearch.core.common.unit.ByteSizeValue; import org.opensearch.repositories.RepositoryException; -import org.opensearch.repositories.fs.FsRepository; +import org.opensearch.repositories.fs.ReloadableFsRepository; import org.opensearch.test.OpenSearchIntegTestCase; import org.junit.Before; @@ -53,7 +53,7 @@ public void testRestrictedSettingsCantBeUpdated() { assertEquals( e.getMessage(), "[system-repo-name] trying to modify an unmodifiable attribute type of system " - + "repository from current value [fs] to new value [mock]" + + "repository from current value [reloadable-fs] to new value [mock]" ); } @@ -65,7 +65,12 @@ public void testSystemRepositoryNonRestrictedSettingsCanBeUpdated() { final Settings.Builder repoSettings = Settings.builder().put("location", absolutePath).put("chunk_size", new ByteSizeValue(20)); assertAcked( - client.admin().cluster().preparePutRepository(systemRepoName).setType(FsRepository.TYPE).setSettings(repoSettings).get() + client.admin() + .cluster() + .preparePutRepository(systemRepoName) + .setType(ReloadableFsRepository.TYPE) + .setSettings(repoSettings) + .get() ); } } diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index 14b915935062c..0c97d62c44a5e 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -293,6 +293,7 @@ public void apply(Settings value, Settings current, Settings previous) { RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING, RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING, RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_REMOTE_STORE_STREAMS_SETTING, + RecoverySettings.INDICES_INTERNAL_REMOTE_UPLOAD_TIMEOUT, RecoverySettings.CLUSTER_REMOTE_INDEX_SEGMENT_METADATA_RETENTION_MAX_COUNT_SETTING, ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING, ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING, diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 5834eabfa9af0..977155a1cbb72 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -2025,23 +2025,35 @@ public RemoteSegmentStoreDirectory getRemoteDirectory() { } /** - Returns true iff it is able to verify that remote segment store - is in sync with local + * Returns true iff it is able to verify that remote segment store + * is in sync with local */ boolean isRemoteSegmentStoreInSync() { assert indexSettings.isRemoteStoreEnabled(); try { RemoteSegmentStoreDirectory directory = getRemoteDirectory(); if (directory.readLatestMetadataFile() != null) { - // verifying that all files except EXCLUDE_FILES are uploaded to the remote Collection uploadFiles = directory.getSegmentsUploadedToRemoteStore().keySet(); - SegmentInfos segmentInfos = store.readLastCommittedSegmentsInfo(); - Collection localFiles = segmentInfos.files(true); - if (uploadFiles.containsAll(localFiles)) { - return true; + try (GatedCloseable segmentInfosGatedCloseable = getSegmentInfosSnapshot()) { + Collection localSegmentInfosFiles = segmentInfosGatedCloseable.get().files(true); + Set localFiles = new HashSet<>(localSegmentInfosFiles); + // verifying that all files except EXCLUDE_FILES are uploaded to the remote + localFiles.removeAll(RemoteStoreRefreshListener.EXCLUDE_FILES); + if (uploadFiles.containsAll(localFiles)) { + return true; + } + logger.debug( + () -> new ParameterizedMessage( + "RemoteSegmentStoreSyncStatus localSize={} remoteSize={}", + localFiles.size(), + uploadFiles.size() + ) + ); } } - } catch (IOException e) { + } catch (AlreadyClosedException e) { + throw e; + } catch (Throwable e) { logger.error("Exception while reading latest metadata", e); } return false; diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index f75e86540ed9b..7bb80b736693f 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -172,13 +172,33 @@ private boolean shouldSync(boolean didRefresh, boolean skipPrimaryTermCheck) { // When the shouldSync is called the first time, then 1st condition on primary term is true. But after that // we update the primary term and the same condition would not evaluate to true again in syncSegments. // Below check ensures that if there is commit, then that gets picked up by both 1st and 2nd shouldSync call. - || isRefreshAfterCommitSafe(); + || isRefreshAfterCommitSafe() + || isRemoteSegmentStoreInSync() == false; if (shouldSync || skipPrimaryTermCheck) { return shouldSync; } return this.primaryTerm != indexShard.getOperationPrimaryTerm(); } + /** + * Checks if all files present in local store are uploaded to remote store or part of excluded files. + * + * Different from IndexShard#isRemoteSegmentStoreInSync as + * it uses files uploaded cache in RemoteDirector and it doesn't make a remote store call. + * Doesn't throw an exception on store getting closed as store will be open + * + * + * @return true iff all the local files are uploaded to remote store. + */ + boolean isRemoteSegmentStoreInSync() { + try (GatedCloseable segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) { + return segmentInfosGatedCloseable.get().files(true).stream().allMatch(this::skipUpload); + } catch (Throwable throwable) { + logger.error("Throwable thrown during isRemoteSegmentStoreInSync", throwable); + } + return false; + } + /* @return false if retry is needed */ diff --git a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java index 5b1940bb1d9a5..3faef2da05320 100644 --- a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java +++ b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java @@ -38,11 +38,13 @@ import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.search.Sort; +import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.opensearch.ExceptionsHelper; +import org.opensearch.OpenSearchException; import org.opensearch.action.StepListener; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.metadata.MappingMetadata; @@ -191,7 +193,8 @@ void recoverFromLocalShards( // just trigger a merge to do housekeeping on the // copied segments - we will also see them in stats etc. indexShard.getEngine().forceMerge(false, -1, false, false, false, UUIDs.randomBase64UUID()); - if (indexShard.isRemoteTranslogEnabled()) { + if (indexShard.isRemoteTranslogEnabled() && indexShard.shardRouting.primary()) { + waitForRemoteStoreSync(indexShard); if (indexShard.isRemoteSegmentStoreInSync() == false) { throw new IndexShardRecoveryException( indexShard.shardId(), @@ -432,7 +435,8 @@ void recoverFromSnapshotAndRemoteStore( } indexShard.getEngine().fillSeqNoGaps(indexShard.getPendingPrimaryTerm()); indexShard.finalizeRecovery(); - if (indexShard.isRemoteTranslogEnabled()) { + if (indexShard.isRemoteTranslogEnabled() && indexShard.shardRouting.primary()) { + waitForRemoteStoreSync(indexShard); if (indexShard.isRemoteSegmentStoreInSync() == false) { listener.onFailure(new IndexShardRestoreFailedException(shardId, "Failed to upload to remote segment store")); return; @@ -717,7 +721,8 @@ private void restore( } indexShard.getEngine().fillSeqNoGaps(indexShard.getPendingPrimaryTerm()); indexShard.finalizeRecovery(); - if (indexShard.isRemoteTranslogEnabled()) { + if (indexShard.isRemoteTranslogEnabled() && indexShard.shardRouting.primary()) { + waitForRemoteStoreSync(indexShard); if (indexShard.isRemoteSegmentStoreInSync() == false) { listener.onFailure(new IndexShardRestoreFailedException(shardId, "Failed to upload to remote segment store")); return; @@ -791,4 +796,31 @@ private void bootstrap(final IndexShard indexShard, final Store store) throws IO ); store.associateIndexWithNewTranslog(translogUUID); } + + /* + Blocks the calling thread, waiting for the remote store to get synced till internal Remote Upload Timeout + */ + private void waitForRemoteStoreSync(IndexShard indexShard) { + if (indexShard.shardRouting.primary() == false) { + return; + } + long startNanos = System.nanoTime(); + + while (System.nanoTime() - startNanos < indexShard.getRecoverySettings().internalRemoteUploadTimeout().nanos()) { + try { + if (indexShard.isRemoteSegmentStoreInSync()) { + break; + } else { + try { + Thread.sleep(TimeValue.timeValueMinutes(1).seconds()); + } catch (InterruptedException ie) { + throw new OpenSearchException("Interrupted waiting for completion of [{}]", ie); + } + } + } catch (AlreadyClosedException e) { + // There is no point in waiting as shard is now closed . + return; + } + } + } } diff --git a/server/src/main/java/org/opensearch/indices/recovery/RecoverySettings.java b/server/src/main/java/org/opensearch/indices/recovery/RecoverySettings.java index 5351ae7fe08dd..2b41eb125d808 100644 --- a/server/src/main/java/org/opensearch/indices/recovery/RecoverySettings.java +++ b/server/src/main/java/org/opensearch/indices/recovery/RecoverySettings.java @@ -46,6 +46,8 @@ import org.opensearch.core.common.unit.ByteSizeUnit; import org.opensearch.core.common.unit.ByteSizeValue; +import java.util.concurrent.TimeUnit; + /** * Settings for the recovery mechanism * @@ -176,6 +178,13 @@ public class RecoverySettings { Property.Dynamic ); + public static final Setting INDICES_INTERNAL_REMOTE_UPLOAD_TIMEOUT = Setting.timeSetting( + "indices.recovery.internal_remote_upload_timeout", + new TimeValue(1, TimeUnit.HOURS), + Property.Dynamic, + Property.NodeScope + ); + // choose 512KB-16B to ensure that the resulting byte[] is not a humongous allocation in G1. public static final ByteSizeValue DEFAULT_CHUNK_SIZE = new ByteSizeValue(512 * 1024 - 16, ByteSizeUnit.BYTES); @@ -193,6 +202,7 @@ public class RecoverySettings { private volatile int minRemoteSegmentMetadataFiles; private volatile ByteSizeValue chunkSize = DEFAULT_CHUNK_SIZE; + private volatile TimeValue internalRemoteUploadTimeout; public RecoverySettings(Settings settings, ClusterSettings clusterSettings) { this.retryDelayStateSync = INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.get(settings); @@ -216,6 +226,7 @@ public RecoverySettings(Settings settings, ClusterSettings clusterSettings) { } logger.debug("using max_bytes_per_sec[{}]", maxBytesPerSec); + this.internalRemoteUploadTimeout = INDICES_INTERNAL_REMOTE_UPLOAD_TIMEOUT.get(settings); clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING, this::setMaxBytesPerSec); clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING, this::setMaxConcurrentFileChunks); @@ -237,6 +248,8 @@ public RecoverySettings(Settings settings, ClusterSettings clusterSettings) { CLUSTER_REMOTE_INDEX_SEGMENT_METADATA_RETENTION_MAX_COUNT_SETTING, this::setMinRemoteSegmentMetadataFiles ); + clusterSettings.addSettingsUpdateConsumer(INDICES_INTERNAL_REMOTE_UPLOAD_TIMEOUT, this::setInternalRemoteUploadTimeout); + } public RateLimiter rateLimiter() { @@ -267,6 +280,10 @@ public TimeValue internalActionLongTimeout() { return internalActionLongTimeout; } + public TimeValue internalRemoteUploadTimeout() { + return internalRemoteUploadTimeout; + } + public ByteSizeValue getChunkSize() { return chunkSize; } @@ -298,6 +315,10 @@ public void setInternalActionLongTimeout(TimeValue internalActionLongTimeout) { this.internalActionLongTimeout = internalActionLongTimeout; } + public void setInternalRemoteUploadTimeout(TimeValue internalRemoteUploadTimeout) { + this.internalRemoteUploadTimeout = internalRemoteUploadTimeout; + } + private void setMaxBytesPerSec(ByteSizeValue maxBytesPerSec) { this.maxBytesPerSec = maxBytesPerSec; if (maxBytesPerSec.getBytes() <= 0) { diff --git a/server/src/main/java/org/opensearch/repositories/RepositoriesModule.java b/server/src/main/java/org/opensearch/repositories/RepositoriesModule.java index cc4d3c006d84c..afb6e530b0eec 100644 --- a/server/src/main/java/org/opensearch/repositories/RepositoriesModule.java +++ b/server/src/main/java/org/opensearch/repositories/RepositoriesModule.java @@ -39,6 +39,7 @@ import org.opensearch.indices.recovery.RecoverySettings; import org.opensearch.plugins.RepositoryPlugin; import org.opensearch.repositories.fs.FsRepository; +import org.opensearch.repositories.fs.ReloadableFsRepository; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportService; @@ -71,6 +72,11 @@ public RepositoriesModule( metadata -> new FsRepository(metadata, env, namedXContentRegistry, clusterService, recoverySettings) ); + factories.put( + ReloadableFsRepository.TYPE, + metadata -> new ReloadableFsRepository(metadata, env, namedXContentRegistry, clusterService, recoverySettings) + ); + for (RepositoryPlugin repoPlugin : repoPlugins) { Map newRepoTypes = repoPlugin.getRepositories( env, diff --git a/server/src/main/java/org/opensearch/repositories/fs/ReloadableFsRepository.java b/server/src/main/java/org/opensearch/repositories/fs/ReloadableFsRepository.java index c06c805a39396..e8020a432a58a 100644 --- a/server/src/main/java/org/opensearch/repositories/fs/ReloadableFsRepository.java +++ b/server/src/main/java/org/opensearch/repositories/fs/ReloadableFsRepository.java @@ -8,18 +8,52 @@ package org.opensearch.repositories.fs; +import org.opensearch.OpenSearchException; import org.opensearch.cluster.metadata.RepositoryMetadata; import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.Randomness; +import org.opensearch.common.blobstore.BlobContainer; +import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.blobstore.BlobStore; +import org.opensearch.common.blobstore.fs.FsBlobContainer; +import org.opensearch.common.blobstore.fs.FsBlobStore; +import org.opensearch.common.settings.Setting; import org.opensearch.core.xcontent.NamedXContentRegistry; import org.opensearch.env.Environment; import org.opensearch.indices.recovery.RecoverySettings; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import java.util.Random; + /** - * Extension of {@link FsRepository} that can be reloaded inplace + * Extension of {@link FsRepository} that can be reloaded inplace , supports failing operation and slowing it down * * @opensearch.internal */ public class ReloadableFsRepository extends FsRepository { + public static final String TYPE = "reloadable-fs"; + + private final FailSwitch fail; + private final SlowDownWriteSwitch slowDown; + + public static final Setting REPOSITORIES_FAILRATE_SETTING = Setting.intSetting( + "repositories.fail.rate", + 0, + 0, + 100, + Setting.Property.NodeScope + ); + + public static final Setting REPOSITORIES_SLOWDOWN_SETTING = Setting.intSetting( + "repositories.slowdown", + 0, + 0, + 100, + Setting.Property.NodeScope + ); + /** * Constructs a shared file system repository that is reloadable in-place. */ @@ -31,6 +65,11 @@ public ReloadableFsRepository( RecoverySettings recoverySettings ) { super(metadata, environment, namedXContentRegistry, clusterService, recoverySettings); + fail = new FailSwitch(); + fail.failRate(REPOSITORIES_FAILRATE_SETTING.get(metadata.settings())); + slowDown = new SlowDownWriteSwitch(); + slowDown.setSleepSeconds(REPOSITORIES_SLOWDOWN_SETTING.get(metadata.settings())); + readRepositoryMetadata(); } @Override @@ -40,12 +79,124 @@ public boolean isReloadable() { @Override public void reload(RepositoryMetadata repositoryMetadata) { - if (isReloadable() == false) { - return; - } - super.reload(repositoryMetadata); + readRepositoryMetadata(); validateLocation(); readMetadata(); } + + private void readRepositoryMetadata() { + fail.failRate(REPOSITORIES_FAILRATE_SETTING.get(metadata.settings())); + slowDown.setSleepSeconds(REPOSITORIES_SLOWDOWN_SETTING.get(metadata.settings())); + } + + protected BlobStore createBlobStore() throws Exception { + final String location = REPOSITORIES_LOCATION_SETTING.get(getMetadata().settings()); + final Path locationFile = environment.resolveRepoFile(location); + return new ThrowingBlobStore(bufferSize, locationFile, isReadOnly(), fail, slowDown); + } + + // A random integer from min-max (inclusive). + public static int randomIntBetween(int min, int max) { + Random random = Randomness.get(); + return random.nextInt(max - min + 1) + min; + } + + static class FailSwitch { + private volatile int failRate; + private volatile boolean onceFailedFailAlways = false; + + public boolean fail() { + final int rnd = randomIntBetween(1, 100); + boolean fail = rnd <= failRate; + if (fail && onceFailedFailAlways) { + failAlways(); + } + return fail; + } + + public void failAlways() { + failRate = 100; + } + + public void failRate(int rate) { + failRate = rate; + } + + public void onceFailedFailAlways() { + onceFailedFailAlways = true; + } + } + + static class SlowDownWriteSwitch { + private volatile int sleepSeconds; + + public void setSleepSeconds(int sleepSeconds) { + this.sleepSeconds = sleepSeconds; + } + + public int getSleepSeconds() { + return sleepSeconds; + } + } + + private static class ThrowingBlobStore extends FsBlobStore { + + private final FailSwitch fail; + private final SlowDownWriteSwitch slowDown; + + public ThrowingBlobStore(int bufferSizeInBytes, Path path, boolean readonly, FailSwitch fail, SlowDownWriteSwitch slowDown) + throws IOException { + super(bufferSizeInBytes, path, readonly); + this.fail = fail; + this.slowDown = slowDown; + } + + @Override + public BlobContainer blobContainer(BlobPath path) { + try { + return new ThrowingBlobContainer(this, path, buildAndCreate(path), fail, slowDown); + } catch (IOException ex) { + throw new OpenSearchException("failed to create blob container", ex); + } + } + } + + private static class ThrowingBlobContainer extends FsBlobContainer { + + private final FailSwitch fail; + private final SlowDownWriteSwitch slowDown; + + public ThrowingBlobContainer(FsBlobStore blobStore, BlobPath blobPath, Path path, FailSwitch fail, SlowDownWriteSwitch slowDown) { + super(blobStore, blobPath, path); + this.fail = fail; + this.slowDown = slowDown; + } + + @Override + public void writeBlobAtomic(final String blobName, final InputStream inputStream, final long blobSize, boolean failIfAlreadyExists) + throws IOException { + checkFailRateAndSleep(blobName); + super.writeBlobAtomic(blobName, inputStream, blobSize, failIfAlreadyExists); + } + + private void checkFailRateAndSleep(String blobName) throws IOException { + if (fail.fail() && blobName.contains(".dat") == false) { + throw new IOException("blob container throwing error"); + } + if (slowDown.getSleepSeconds() > 0) { + try { + Thread.sleep(slowDown.getSleepSeconds() * 1000L); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public void writeBlob(String blobName, InputStream inputStream, long blobSize, boolean failIfAlreadyExists) throws IOException { + checkFailRateAndSleep(blobName); + super.writeBlob(blobName, inputStream, blobSize, failIfAlreadyExists); + } + } } diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java index 6567cb03f3dc6..acec50f28860e 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java @@ -335,6 +335,7 @@ public void testRefreshSuccessOnFirstAttempt() throws Exception { RemoteStoreStatsTrackerFactory trackerFactory = tuple.v2(); RemoteSegmentTransferTracker segmentTracker = trackerFactory.getRemoteSegmentTransferTracker(indexShard.shardId()); assertNoLagAndTotalUploadsFailed(segmentTracker, 0); + assertTrue("remote store in sync", tuple.v1().isRemoteSegmentStoreInSync()); } public void testRefreshSuccessOnSecondAttempt() throws Exception { @@ -404,6 +405,20 @@ public void testRefreshSuccessOnThirdAttempt() throws Exception { assertNoLagAndTotalUploadsFailed(segmentTracker, 2); } + public void testRefreshPersistentFailure() throws Exception { + int succeedOnAttempt = 10; + CountDownLatch refreshCountLatch = new CountDownLatch(1); + CountDownLatch successLatch = new CountDownLatch(10); + Tuple tuple = mockIndexShardWithRetryAndScheduleRefresh( + succeedOnAttempt, + refreshCountLatch, + successLatch + ); + // Giving 10ms for some iterations of remote refresh upload + Thread.sleep(10); + assertFalse("remote store should not in sync", tuple.v1().isRemoteSegmentStoreInSync()); + } + private void assertNoLagAndTotalUploadsFailed(RemoteSegmentTransferTracker segmentTracker, long totalUploadsFailed) throws Exception { assertBusy(() -> { assertEquals(0, segmentTracker.getBytesLag()); @@ -568,6 +583,7 @@ private Tuple mockIn // Mock indexShard.getSegmentInfosSnapshot() doAnswer(invocation -> { if (counter.incrementAndGet() <= succeedOnAttempt) { + logger.error("Failing in get segment info {}", counter.get()); throw new RuntimeException("Inducing failure in upload"); } return indexShard.getSegmentInfosSnapshot(); @@ -583,6 +599,7 @@ private Tuple mockIn doAnswer(invocation -> { if (Objects.nonNull(successLatch)) { successLatch.countDown(); + logger.info("Value fo latch {}", successLatch.getCount()); } return indexShard.getEngine(); }).when(shard).getEngine(); @@ -642,6 +659,31 @@ private void verifyUploadedSegments(RemoteSegmentStoreDirectory remoteSegmentSto } } } + assertTrue(remoteStoreRefreshListener.isRemoteSegmentStoreInSync()); + } + + public void testRemoteSegmentStoreNotInSync() throws IOException { + setup(true, 3); + remoteStoreRefreshListener.afterRefresh(true); + try (Store remoteStore = indexShard.remoteStore()) { + RemoteSegmentStoreDirectory remoteSegmentStoreDirectory = + (RemoteSegmentStoreDirectory) ((FilterDirectory) ((FilterDirectory) remoteStore.directory()).getDelegate()).getDelegate(); + verifyUploadedSegments(remoteSegmentStoreDirectory); + remoteStoreRefreshListener.isRemoteSegmentStoreInSync(); + boolean oneFileDeleted = false; + // Delete any one file from remote store + try (GatedCloseable segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) { + SegmentInfos segmentInfos = segmentInfosGatedCloseable.get(); + for (String file : segmentInfos.files(true)) { + if (oneFileDeleted == false && RemoteStoreRefreshListener.EXCLUDE_FILES.contains(file) == false) { + remoteSegmentStoreDirectory.deleteFile(file); + oneFileDeleted = true; + break; + } + } + } + assertFalse(remoteStoreRefreshListener.isRemoteSegmentStoreInSync()); + } } }