Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Checks for problems in Accumulo #4957

Open
wants to merge 3 commits into
base: 3.1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions core/src/main/java/org/apache/accumulo/core/fate/AdminUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -424,18 +424,6 @@ public void print(ReadOnlyTStore<T> zs, ZooReader zk, ServiceLock.ServiceLockPat
txStatus.getWaitingLocks(), txStatus.getTop(), txStatus.getTimeCreatedFormatted());
}
fmt.format(" %s transactions", fateStatus.getTransactions().size());

if (!fateStatus.getDanglingHeldLocks().isEmpty()
|| !fateStatus.getDanglingWaitingLocks().isEmpty()) {
fmt.format("%nThe following locks did not have an associated FATE operation%n");
for (Entry<String,List<String>> entry : fateStatus.getDanglingHeldLocks().entrySet()) {
fmt.format("txid: %s locked: %s%n", entry.getKey(), entry.getValue());
}

for (Entry<String,List<String>> entry : fateStatus.getDanglingWaitingLocks().entrySet()) {
fmt.format("txid: %s locking: %s%n", entry.getKey(), entry.getValue());
}
}
}

public boolean prepDelete(TStore<T> zs, ZooReaderWriter zk, ServiceLockPath path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ private static byte[] readField(DataInputStream din) throws IOException {
return b;
}

// decode a bunch of key value pairs that have been encoded into a single value
/**
* decode a bunch of key value pairs that have been encoded into a single value
*/
public static final SortedMap<Key,Value> decodeRow(Key rowKey, Value rowValue)
throws IOException {
SortedMap<Key,Value> map = new TreeMap<>();
Expand All @@ -110,8 +112,10 @@ private static void encode(DataOutputStream dout, ByteSequence bs) throws IOExce
dout.write(bs.getBackingArray(), bs.offset(), bs.length());
}

// take a stream of keys and values and output a value that encodes everything but their row
// keys and values must be paired one for one
/**
* take a stream of keys and values and output a value that encodes everything but their row keys
* and values must be paired one for one
*/
public static final Value encodeRow(List<Key> keys, List<Value> values) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
DataOutputStream dout = new DataOutputStream(out);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
import org.apache.accumulo.server.util.checkCommand.RootTableCheckRunner;
import org.apache.accumulo.server.util.checkCommand.SystemConfigCheckRunner;
import org.apache.accumulo.server.util.checkCommand.SystemFilesCheckRunner;
import org.apache.accumulo.server.util.checkCommand.TableLocksCheckRunner;
import org.apache.accumulo.server.util.checkCommand.UserFilesCheckRunner;
import org.apache.accumulo.server.util.fateCommand.FateSummaryReport;
import org.apache.accumulo.start.spi.KeywordExecutable;
Expand Down Expand Up @@ -145,6 +146,10 @@ public static class CheckCommand {
@Parameter(description = "[<Check>...]")
List<String> checks;

@Parameter(names = "--fixFiles", description = "Removes dangling file pointers. Used by the "
+ "USER_FILES and SYSTEM_FILES checks.")
boolean fixFiles = false;

/**
* This should be used to get the check runner instead of {@link Check#getCheckRunner()}. This
* exists so that its functionality can be changed for testing.
Expand All @@ -159,6 +164,9 @@ public enum Check {
// Caution should be taken when changing or adding any new checks: order is important
SYSTEM_CONFIG(SystemConfigCheckRunner::new, "Validate the system config stored in ZooKeeper",
Collections.emptyList()),
TABLE_LOCKS(TableLocksCheckRunner::new,
"Ensures that table and namespace locks are valid and are associated with a FATE op",
Collections.singletonList(SYSTEM_CONFIG)),
ROOT_METADATA(RootMetadataCheckRunner::new,
"Checks integrity of the root tablet metadata stored in ZooKeeper",
Collections.singletonList(SYSTEM_CONFIG)),
Expand Down Expand Up @@ -214,16 +222,6 @@ public enum CheckStatus {
}
}

@Parameters(commandDescription = "print tablets that are offline in online tables")
static class CheckTabletsCommand {
@Parameter(names = "--fixFiles", description = "Remove dangling file pointers")
boolean fixFiles = false;

@Parameter(names = {"-t", "--table"},
description = "Table to check, if not set checks all tables")
String tableName = null;
}

@Parameters(commandDescription = "stop the manager")
static class StopManagerCommand {}

Expand Down Expand Up @@ -377,9 +375,6 @@ public void execute(final String[] args) {
CheckCommand checkCommand = new CheckCommand();
cl.addCommand("check", checkCommand);

CheckTabletsCommand checkTabletsCommand = new CheckTabletsCommand();
cl.addCommand("checkTablets", checkTabletsCommand);

DeleteZooInstanceCommand deleteZooInstOpts = new DeleteZooInstanceCommand();
cl.addCommand("deleteZooInstance", deleteZooInstOpts);

Expand Down Expand Up @@ -441,24 +436,6 @@ public void execute(final String[] args) {
if (ping(context, pingCommand.args) != 0) {
rc = 4;
}
} else if (cl.getParsedCommand().equals("checkTablets")) {
System.out.println("\n*** Looking for offline tablets ***\n");
if (FindOfflineTablets.findOffline(context, checkTabletsCommand.tableName) != 0) {
rc = 5;
}
System.out.println("\n*** Looking for missing files ***\n");
if (checkTabletsCommand.tableName == null) {
if (RemoveEntriesForMissingFiles.checkAllTables(context, checkTabletsCommand.fixFiles)
!= 0) {
rc = 6;
}
} else {
if (RemoveEntriesForMissingFiles.checkTable(context, checkTabletsCommand.tableName,
checkTabletsCommand.fixFiles) != 0) {
rc = 6;
}
}

} else if (cl.getParsedCommand().equals("stop")) {
stopTabletServer(context, stopOpts.args, opts.force);
} else if (cl.getParsedCommand().equals("dumpConfig")) {
Expand All @@ -482,7 +459,7 @@ public void execute(final String[] args) {
} else if (cl.getParsedCommand().equals("serviceStatus")) {
printServiceStatus(context, serviceStatusCommandOpts);
} else if (cl.getParsedCommand().equals("check")) {
executeCheckCommand(context, checkCommand);
executeCheckCommand(context, checkCommand, opts);
} else {
everything = cl.getParsedCommand().equals("stopAll");

Expand Down Expand Up @@ -1012,15 +989,16 @@ private EnumSet<ReadOnlyTStore.TStatus> getCmdLineStatusFilters(List<String> sta
}

@VisibleForTesting
public static void executeCheckCommand(ServerContext context, CheckCommand cmd) {
public static void executeCheckCommand(ServerContext context, CheckCommand cmd,
ServerUtilOpts opts) throws Exception {
validateAndTransformCheckCommand(cmd);

if (cmd.list) {
listChecks();
} else if (cmd.run) {
var givenChecks =
cmd.checks.stream().map(CheckCommand.Check::valueOf).collect(Collectors.toList());
executeRunCheckCommand(cmd, givenChecks);
var givenChecks = cmd.checks.stream()
.map(name -> CheckCommand.Check.valueOf(name.toUpperCase())).collect(Collectors.toList());
executeRunCheckCommand(cmd, givenChecks, context, opts);
}
}

Expand Down Expand Up @@ -1051,19 +1029,19 @@ private static void validateAndTransformCheckCommand(CheckCommand cmd) {

private static void listChecks() {
System.out.println();
System.out.printf("%-20s | %-80s | %-20s%n", "Check Name", "Description", "Depends on");
System.out.println("-".repeat(120));
System.out.printf("%-20s | %-90s | %-20s%n", "Check Name", "Description", "Depends on");
System.out.println("-".repeat(130));
for (CheckCommand.Check check : CheckCommand.Check.values()) {
System.out.printf("%-20s | %-80s | %-20s%n", check.name(), check.getDescription(),
System.out.printf("%-20s | %-90s | %-20s%n", check.name(), check.getDescription(),
check.getDependencies().stream().map(CheckCommand.Check::name)
.collect(Collectors.joining(", ")));
}
System.out.println("-".repeat(120));
System.out.println("-".repeat(130));
System.out.println();
}

private static void executeRunCheckCommand(CheckCommand cmd,
List<CheckCommand.Check> givenChecks) {
private static void executeRunCheckCommand(CheckCommand cmd, List<CheckCommand.Check> givenChecks,
ServerContext context, ServerUtilOpts opts) throws Exception {
// Get all the checks in the order they are declared in the enum
final var allChecks = CheckCommand.Check.values();
final TreeMap<CheckCommand.Check,CheckCommand.CheckStatus> checkStatus = new TreeMap<>();
Expand All @@ -1073,7 +1051,7 @@ private static void executeRunCheckCommand(CheckCommand cmd,
checkStatus.put(check, CheckCommand.CheckStatus.SKIPPED_DEPENDENCY_FAILED);
} else {
if (givenChecks.contains(check)) {
checkStatus.put(check, cmd.getCheckRunner(check).runCheck());
checkStatus.put(check, cmd.getCheckRunner(check).runCheck(context, opts, cmd.fixFiles));
} else {
checkStatus.put(check, CheckCommand.CheckStatus.FILTERED_OUT);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeSet;
import java.util.function.Consumer;

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;
Expand All @@ -45,75 +46,83 @@
import io.opentelemetry.context.Scope;

public class CheckForMetadataProblems {
private static boolean sawProblems = false;
private static ServerUtilOpts opts;

private static void checkTable(TableId tableId, TreeSet<KeyExtent> tablets) {
private static boolean checkTable(TableId tableId, TreeSet<KeyExtent> tablets,
ServerUtilOpts opts, Consumer<String> printInfoMethod, Consumer<String> printProblemMethod) {
// sanity check of metadata table entries
// make sure tablets has no holes, and that it starts and ends w/ null
// make sure tablets have no holes, and that it starts and ends w/ null
String tableName;
boolean sawProblems = false;

try {
tableName = opts.getServerContext().getTableName(tableId);
} catch (TableNotFoundException e) {
tableName = null;
}

printInfoMethod.accept(String.format("Ensuring tablets for table %s (%s) have: no holes, "
+ "valid (null) prev end row for first tablet, and valid (null) end row "
+ "for last tablet...\n", tableName, tableId));

if (tablets.isEmpty()) {
System.out.println(
"...No entries found in metadata table for table " + tableName + " (" + tableId + ")");
sawProblems = true;
return;
printProblemMethod.accept(String
.format("...No entries found in metadata table for table %s (%s)", tableName, tableId));
return true;
}

if (tablets.first().prevEndRow() != null) {
System.out.println("...First entry for table " + tableName + " (" + tableId + ") - "
+ tablets.first() + " - has non null prev end row");
sawProblems = true;
return;
printProblemMethod
.accept(String.format("...First entry for table %s (%s) - %s - has non-null prev end row",
tableName, tableId, tablets.first()));
return true;
}

if (tablets.last().endRow() != null) {
System.out.println("...Last entry for table " + tableName + " (" + tableId + ") - "
+ tablets.last() + " - has non null end row");
sawProblems = true;
return;
printProblemMethod
.accept(String.format("...Last entry for table %s (%s) - %s - has non-null end row",
tableName, tableId, tablets.last()));
return true;
}

Iterator<KeyExtent> tabIter = tablets.iterator();
Text lastEndRow = tabIter.next().endRow();
boolean everythingLooksGood = true;
while (tabIter.hasNext()) {
KeyExtent tabke = tabIter.next();
KeyExtent table = tabIter.next();
boolean broke = false;
if (tabke.prevEndRow() == null) {
System.out.println("...Table " + tableName + " (" + tableId
+ ") has null prev end row in middle of table " + tabke);
if (table.prevEndRow() == null) {
printProblemMethod
.accept(String.format("...Table %s (%s) has null prev end row in middle of table %s",
tableName, tableId, table));
broke = true;
} else if (!tabke.prevEndRow().equals(lastEndRow)) {
System.out.println("...Table " + tableName + " (" + tableId + ") has a hole "
+ tabke.prevEndRow() + " != " + lastEndRow);
} else if (!table.prevEndRow().equals(lastEndRow)) {
printProblemMethod.accept(String.format("...Table %s (%s) has a hole %s != %s", tableName,
tableId, table.prevEndRow(), lastEndRow));
broke = true;
}
if (broke) {
everythingLooksGood = false;
}

lastEndRow = tabke.endRow();
lastEndRow = table.endRow();
}
if (everythingLooksGood) {
System.out.println("...All is well for table " + tableName + " (" + tableId + ")");
printInfoMethod.accept(String.format("...All is well for table %s (%s)", tableName, tableId));
} else {
sawProblems = true;
}

return sawProblems;
}

private static void checkMetadataAndRootTableEntries(String tableNameToCheck, ServerUtilOpts opts)
public static boolean checkMetadataAndRootTableEntries(String tableNameToCheck,
ServerUtilOpts opts, Consumer<String> printInfoMethod, Consumer<String> printProblemMethod)
throws Exception {
TableId tableCheckId = opts.getServerContext().getTableId(tableNameToCheck);
System.out.println("Checking tables whose metadata is found in: " + tableNameToCheck + " ("
+ tableCheckId + ")");
printInfoMethod.accept(String.format("Checking tables whose metadata is found in: %s (%s)...\n",
tableNameToCheck, tableCheckId));
Map<TableId,TreeSet<KeyExtent>> tables = new HashMap<>();
boolean sawProblems = false;

try (AccumuloClient client = Accumulo.newClient().from(opts.getClientProps()).build();
Scanner scanner = client.createScanner(tableNameToCheck, Authorizations.EMPTY)) {
Expand All @@ -139,7 +148,10 @@ private static void checkMetadataAndRootTableEntries(String tableNameToCheck, Se
TreeSet<KeyExtent> tablets = tables.get(tableId);
if (tablets == null) {

tables.forEach(CheckForMetadataProblems::checkTable);
for (var e : tables.entrySet()) {
sawProblems = CheckForMetadataProblems.checkTable(e.getKey(), e.getValue(), opts,
printInfoMethod, printProblemMethod) || sawProblems;
}

tables.clear();

Expand All @@ -153,37 +165,45 @@ private static void checkMetadataAndRootTableEntries(String tableNameToCheck, Se
justLoc = false;
} else if (colf.equals(CurrentLocationColumnFamily.NAME)) {
if (justLoc) {
System.out.println("Problem at key " + entry.getKey());
printProblemMethod.accept("Problem at key " + entry.getKey());
sawProblems = true;
}
justLoc = true;
}
}

if (count == 0) {
System.err
.println("ERROR : table " + tableNameToCheck + " (" + tableCheckId + ") is empty");
printProblemMethod.accept(
String.format("ERROR : table %s (%s) is empty", tableNameToCheck, tableCheckId));
kevinrr888 marked this conversation as resolved.
Show resolved Hide resolved
sawProblems = true;
}
}

tables.forEach(CheckForMetadataProblems::checkTable);
for (var e : tables.entrySet()) {
sawProblems = CheckForMetadataProblems.checkTable(e.getKey(), e.getValue(), opts,
printInfoMethod, printProblemMethod) || sawProblems;
}

if (!sawProblems) {
System.out.println("No problems found in " + tableNameToCheck + " (" + tableCheckId + ")");
printInfoMethod.accept(
String.format("\n...No problems found in %s (%s)", tableNameToCheck, tableCheckId));
}
// end METADATA table sanity check
return sawProblems;
}

public static void main(String[] args) throws Exception {
opts = new ServerUtilOpts();
ServerUtilOpts opts = new ServerUtilOpts();
opts.parseArgs(CheckForMetadataProblems.class.getName(), args);
Span span = TraceUtil.startSpan(CheckForMetadataProblems.class, "main");
boolean sawProblems;
try (Scope scope = span.makeCurrent()) {

checkMetadataAndRootTableEntries(AccumuloTable.ROOT.tableName(), opts);
sawProblems = checkMetadataAndRootTableEntries(AccumuloTable.ROOT.tableName(), opts,
System.out::println, System.out::println);
System.out.println();
checkMetadataAndRootTableEntries(AccumuloTable.METADATA.tableName(), opts);
sawProblems = checkMetadataAndRootTableEntries(AccumuloTable.METADATA.tableName(), opts,
System.out::println, System.out::println) || sawProblems;
if (sawProblems) {
throw new IllegalStateException();
}
Expand Down
Loading