From 566f0bd5bbcd9350f4216f1506811d59f1d68fe8 Mon Sep 17 00:00:00 2001 From: mocchira Date: Thu, 22 Mar 2018 15:46:56 +0900 Subject: [PATCH 1/2] leofs: Add recover-disk command for #983 --- apps/leo_manager/include/leo_manager.hrl | 3 + apps/leo_manager/src/leo_manager_api.erl | 83 +++++++++++++++++++- apps/leo_manager/src/leo_manager_console.erl | 8 ++ apps/leo_storage/src/leo_storage_api.erl | 2 +- apps/leo_storage/src/leo_storage_mq.erl | 14 +++- docs/admin/system_operations/data.md | 23 +++++- leofs-adm | 22 ++++++ 7 files changed, 149 insertions(+), 6 deletions(-) diff --git a/apps/leo_manager/include/leo_manager.hrl b/apps/leo_manager/include/leo_manager.hrl index 602d1a76..f9b63b39 100644 --- a/apps/leo_manager/include/leo_manager.hrl +++ b/apps/leo_manager/include/leo_manager.hrl @@ -169,6 +169,7 @@ {?CMD_RECOVER, lists:append( ["recover file ", ?CRLF, "recover dir []", ?CRLF, + "recover disk ", ?CRLF, "recover node ", ?CRLF, "recover ring ", ?CRLF, "recover cluster " @@ -248,6 +249,7 @@ %% recover type -define(RECOVER_FILE, "file"). +-define(RECOVER_DISK, "disk"). -define(RECOVER_NODE, "node"). -define(RECOVER_RING, "ring"). -define(RECOVER_REMOTE_CLUSTER, "cluster"). @@ -330,6 +332,7 @@ -define(ERROR_FAILED_GET_VERSION, "Failed to get the version"). -define(ERROR_FAILED_REGISTERING_DEL_BUCKET_MSG, "Failed to register a del-bucket's message"). -define(ERROR_FAILED_REMOVING_DEL_BUCKET_MSG, "Failed to remove a del-bucket's message"). +-define(ERROR_FAILED_RECOVER_DISK_DUE_TO_DIFFERENT_AVS_CONF, "Failed to recover disk due to different AVS conf found"). %% type of console -define(CONSOLE_CUI, 'cui'). 
diff --git a/apps/leo_manager/src/leo_manager_api.erl b/apps/leo_manager/src/leo_manager_api.erl
index 9b1a5f8e..a443c22f 100644
--- a/apps/leo_manager/src/leo_manager_api.erl
+++ b/apps/leo_manager/src/leo_manager_api.erl
@@ -67,7 +67,7 @@
 
 -export([register/4, register/7, register/8,
          notify/3, notify/4, purge/1, remove/1,
-         whereis/2, recover/3, rebuild_dir_metadata/2,
+         whereis/2, recover/3, recover/4, rebuild_dir_metadata/2,
          compact/2, compact/4, diagnose_data/1,
          stats/2,
          mq_stats/1, mq_suspend/2, mq_resume/2,
@@ -1583,6 +1583,97 @@ recover(_,_,true) ->
 recover(_,_,false) ->
     {error, ?ERROR_COULD_NOT_GET_RING}.
 
+%% @doc Recover the objects on the specified disk of the specified storage node.
+%%      Node and Disk may be given as strings; they are converted first.
+recover(?RECOVER_DISK, Node, Disk, true) when is_list(Node) ->
+    recover(?RECOVER_DISK, list_to_atom(Node), Disk, true);
+recover(?RECOVER_DISK, Node, Disk, true) when is_list(Disk) ->
+    case catch list_to_integer(Disk) of
+        {'EXIT', _} ->
+            {error, ?ERROR_INVALID_ARGS};
+        Val ->
+            recover(?RECOVER_DISK, Node, Val, true)
+    end;
+recover(?RECOVER_DISK, Node, Disk, true) ->
+    %% Check the target node and system-state
+    case leo_misc:node_existence(Node) of
+        true ->
+            Ret = case leo_redundant_manager_api:get_member_by_node(Node) of
+                      {ok, #member{state = ?STATE_RUNNING}} ->
+                          true;
+                      _ ->
+                          false
+                  end,
+            recover_disk_1(Ret, Node, Disk);
+        false ->
+            {error, ?ERROR_COULD_NOT_CONNECT}
+    end;
+recover(_,_,_,true) ->
+    {error, ?ERROR_INVALID_ARGS};
+recover(_,_,_,false) ->
+    {error, ?ERROR_COULD_NOT_GET_RING}.
+
+%% @doc Execute recovery of the target disk
+%%      Check conditions
+%% @private
+recover_disk_1(true, Node, Disk) ->
+    {Ret, Members} = is_allow_to_distribute_command(Node),
+    recover_disk_2(Ret, Members, Node, Disk);
+recover_disk_1(false, _, _) ->
+    {error, ?ERROR_TARGET_NODE_NOT_RUNNING}.
+
+%% @doc Execute recovery of the target disk
+%% @private
+recover_disk_2(true, Members, Node, Disk) ->
+    case rpc:multicall(Members, ?API_STORAGE, get_node_status,
+                       [], ?DEF_TIMEOUT) of
+        {RetL, []} ->
+            case has_same_avs_conf(RetL) of
+                {true, Size} ->
+                    case 0 < Disk andalso Disk =< Size of
+                        true ->
+                            case rpc:multicall(Members, ?API_STORAGE, synchronize,
+                                               [{Node, Disk}], ?DEF_TIMEOUT) of
+                                {_RetL, []} ->
+                                    ok;
+                                {_, BadNodes} ->
+                                    ?warn("recover_disk_2/4",
+                                          [{bad_nodes, BadNodes}]),
+                                    {error, BadNodes}
+                            end;
+                        false ->
+                            {error, ?ERROR_INVALID_ARGS}
+                    end;
+                _ ->
+                    {error, ?ERROR_FAILED_RECOVER_DISK_DUE_TO_DIFFERENT_AVS_CONF}
+            end;
+        {_, BadNodes} ->
+            ?warn("recover_disk_2/4",
+                  [{bad_nodes, BadNodes}]),
+            {error, BadNodes}
+    end;
+recover_disk_2(false,_,_,_) ->
+    {error, ?ERROR_NOT_SATISFY_CONDITION}.
+
+%% @doc Check if every storage node has a same AVS config
+%%      Returns {IsSame, Size}: Size is the length of the first reported 'avs'
+%%      list, which the caller uses as the upper bound of the disk number.
+%% @private
+has_same_avs_conf(RetL) ->
+    ObjContainerConfList = [leo_misc:get_value('avs', Status, []) || {ok, Status} <- RetL],
+    NormalizedList = [lists:map(fun(Items) ->
+                                        leo_misc:get_value('num_of_containers', Items, 0)
+                                end, ObjContainer) || ObjContainer <- ObjContainerConfList],
+    case NormalizedList of
+        [] ->
+            %% No node answered with {ok, Status}; nothing can be compared,
+            %% so report "not the same" instead of crashing with badmatch
+            {false, 0};
+        [H|Rest] ->
+            {lists:all(fun(Elem) ->
+                               Elem == H
+                       end, Rest), length(H)}
+    end.
 
 %% @doc Execute recovery of the target node
 %%      Check conditions
diff --git a/apps/leo_manager/src/leo_manager_console.erl b/apps/leo_manager/src/leo_manager_console.erl
index e6acb936..d2ed92a7 100644
--- a/apps/leo_manager/src/leo_manager_console.erl
+++ b/apps/leo_manager/src/leo_manager_console.erl
@@ -2067,6 +2067,14 @@ recover(Socket, Option) ->
                 {_, Cause} ->
                     {error, Cause}
             end;
+        {ok, [?RECOVER_DISK, Node, Disk |Rest]} when Rest == [] ->
+            HasRoutingTable = (leo_redundant_manager_api:checksum(ring) >= 0),
+            case catch leo_manager_api:recover(?RECOVER_DISK, Node, Disk, HasRoutingTable) of
+                ok ->
+                    ok;
+                {_, Cause} ->
+                    {error, Cause}
+            end;
         _ ->
             {error, ?ERROR_INVALID_ARGS}
     end.
diff --git a/apps/leo_storage/src/leo_storage_api.erl b/apps/leo_storage/src/leo_storage_api.erl
index 71fca79c..1bceb805 100644
--- a/apps/leo_storage/src/leo_storage_api.erl
+++ b/apps/leo_storage/src/leo_storage_api.erl
@@ -246,7 +246,7 @@ attach(SystemConf) ->
 %% @doc synchronize a data.
 %%
 -spec(synchronize(Node) ->
-             ok | {error, any()} when Node::atom()).
+             ok | {error, any()} when Node::atom()|{atom(),pos_integer()}).
 synchronize(Node) ->
     leo_storage_mq:publish(?QUEUE_ID_RECOVERY_NODE, Node).
 
diff --git a/apps/leo_storage/src/leo_storage_mq.erl b/apps/leo_storage/src/leo_storage_mq.erl
index ae309c09..53684c3f 100644
--- a/apps/leo_storage/src/leo_storage_mq.erl
+++ b/apps/leo_storage/src/leo_storage_mq.erl
@@ -107,7 +107,7 @@ start_1([{Id, Path}|Rest], Sup, Root) ->
 
 
 %% @doc Input a message into the queue.
--spec(publish(mq_id(), atom()|binary()) ->
+-spec(publish(mq_id(), atom()|binary()|{atom(),pos_integer()}) ->
              ok | {error, any()}).
 publish(?QUEUE_ID_RECOVERY_NODE = Id, Node) ->
     KeyBin = term_to_binary(Node),
@@ -412,6 +412,9 @@ handle_call({consume, ?QUEUE_ID_ASYNC_DELETION, MessageBin}) ->
 
 handle_call({consume, ?QUEUE_ID_RECOVERY_NODE, MessageBin}) ->
     case catch binary_to_term(MessageBin) of
+        %% A {Node, Disk} pair requests the recovery of a single disk
+        #recovery_node_message{node = {Node, Disk}} ->
+            recover_disk(Node, Disk);
         #recovery_node_message{node = Node} ->
             recover_node(Node);
         _ ->
@@ -492,6 +495,17 @@ handle_call(_,_,_) ->
 %% INNTERNAL FUNCTIONS-1
 %%--------------------------------------------------------------------
+%% @doc Run leo_object_storage_api:fetch_by_addr_id_and_disk/3 over the given disk
+%%      with the recover-node callback built for the given node; its result is discarded.
+%% @private
+-spec(recover_disk(Node, Disk) ->
+             ok when Node::node(),
+                     Disk::pos_integer()).
+recover_disk(Node, Disk) ->
+    Callback = recover_node_callback(Node),
+    _ = leo_object_storage_api:fetch_by_addr_id_and_disk(0, Disk, Callback),
+    ok.
+
 %% @doc synchronize by vnode-id.
 %% @private
 -spec(recover_node(Node) ->
              ok when Node::node()).
diff --git a/docs/admin/system_operations/data.md b/docs/admin/system_operations/data.md
index 2832da4c..0655e023 100644
--- a/docs/admin/system_operations/data.md
+++ b/docs/admin/system_operations/data.md
@@ -224,6 +224,7 @@ This section provides information about the recovery commands that can be used i
 | Shell | Description |
 |--- |--- |
 |leofs-adm recover-file \<file-path\>|Recover an inconsistent object specified by the file-path.|
+|leofs-adm recover-disk \<storage-node\> \<disk-id\>|Recover all inconsistent objects on the specified disk in the specified storage-node. Note that this command can be used ONLY when all LeoStorage nodes have the same obj_containers configuration.|
 |leofs-adm recover-node \<storage-node\>|Recover all inconsistent objects in the specified storage-node.|
 |leofs-adm recover-cluster \<cluster-id\>|Recover all inconsistent objects in the specified cluster-id.|
 
@@ -235,6 +236,22 @@ This section provides information about the recovery commands that can be used i
 $ leofs-adm recover-file leo/fast/storage.key
 OK
 ```
+
+#### recover-disk
+
+```bash
+## Example:
+## If you have the following configuration in leo_storage.conf
+## obj_containers.path = [./avs1,./avs2]
+## then the below command will recover files stored under ./avs1
+$ leofs-adm recover-disk storage_0@127.0.0.1 1
+OK
+
+## If you want to recover files stored under ./avs2 then issue the below one.
+$ leofs-adm recover-disk storage_0@127.0.0.1 2
+OK
+```
+
 #### recover-node
 
 ```bash
@@ -243,7 +260,7 @@ $ leofs-adm recover-node storage_0@127.0.0.1
 OK
 ```
 
-- recover-cluster
+#### recover-cluster
 
 ```bash
 ## Example:
@@ -257,7 +274,7 @@ OK
 When/How to use recover commands.
 
 - AVS/KVS Broken
-    - Invoke `recover-node` with a node having broken AVS/KVS files.
+    - Invoke `recover-node` with a node having broken AVS/KVS files or `recover-disk` with a disk having broken AVS/KVS files if you have multiple container directories.
 - Queue Broken
     - Invoke `recover-node` with every node except which having broken Queue files.
     - The procedure might be improved in future when [issue#618](https://github.com/leo-project/leofs/issues/618) solved.
@@ -265,7 +282,7 @@ When/How to use recover commands.
     - Invoke `suspend` with a node having broken Disk arrays and subsequently run `leo_storage stop`.
     - Exchange broken Disk arrays.
     - Run `leo_storage start` and subsequently Invoke `resume` with the node.
-    - Invoke `recover-node` with the node.
+    - Invoke `recover-node` with the node or `recover-disk` with the broken disk if you have multiple container directories.
 - Node Broken
     - Invoke `detach` with a broken node.
     - Prepare a new node that will take over all objects assigned to a detached node.
diff --git a/leofs-adm b/leofs-adm
index 3cda294d..09d04c03 100755
--- a/leofs-adm
+++ b/leofs-adm
@@ -236,6 +236,21 @@ usage_recover_file() {
 # }
 # <<
 
+# Show the usage of recover-disk; "min" as $1 selects the one-line form
+usage_recover_disk() {
+    USAGE="recover-disk $STORAGE_NODE <disk-id>"
+    case "$1" in
+        min)
+            output "$WHITESPACE $USAGE "
+            ;;
+        *)
+            output "Usage:"
+            output " $SCRIPT $USAGE"
+            output "Description:"
+            output " - Recover all inconsistent objects on the specified disk in the specified node"
+    esac
+}
+
 usage_recover_node() {
     USAGE="recover-node $STORAGE_NODE"
     case "$1" in
@@ -932,6 +947,7 @@ usage() {
     output ""
     output " ${UNDERLINE}Recover Commands:${NONE}"
     usage_recover_file min
+    usage_recover_disk min
     usage_recover_node min
     usage_recover_ring min
     usage_recover_cluster min
@@ -1194,6 +1210,13 @@ case "$1" in
 # fi
 # ;;
 # <<
+    recover-disk)
+        if [ $# -eq 3 ]; then
+            send_command "recover disk $2 $3"
+        else
+            usage_recover_disk
+        fi
+        ;;
     recover-node)
         if [ $# -eq 2 ]; then
             case "$2" in
From a21d487dc17fdc2976f37accac2e0e7e6d1df525 Mon Sep 17 00:00:00 2001
From: mocchira
Date: Fri, 23 Mar 2018 08:59:24 +0900
Subject: [PATCH 2/2] leofs: Bump leo_object_storage to 1.3.31

---
 apps/leo_gateway/rebar.config | 2 +-
 apps/leo_manager/rebar.config | 2 +-
 apps/leo_storage/rebar.config | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git
a/apps/leo_gateway/rebar.config b/apps/leo_gateway/rebar.config index 541ceba7..174a7eb6 100644 --- a/apps/leo_gateway/rebar.config +++ b/apps/leo_gateway/rebar.config @@ -26,7 +26,7 @@ {leo_cache, ".*", {git, "https://github.com/leo-project/leo_cache.git", {tag, "0.8.9"}}}, {leo_commons, ".*", {git, "https://github.com/leo-project/leo_commons.git", {tag, "1.2.0"}}}, {leo_logger, ".*", {git, "https://github.com/leo-project/leo_logger.git", {tag, "1.3.7"}}}, - {leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.30"}}}, + {leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.31"}}}, {leo_pod, ".*", {git, "https://github.com/leo-project/leo_pod.git", {tag, "0.6.9"}}}, {leo_redundant_manager, ".*", {git, "https://github.com/leo-project/leo_redundant_manager.git", {tag, "1.9.59"}}}, {leo_statistics, ".*", {git, "https://github.com/leo-project/leo_statistics.git", {tag, "1.1.22"}}}, diff --git a/apps/leo_manager/rebar.config b/apps/leo_manager/rebar.config index c1014b9b..0a14f4ea 100644 --- a/apps/leo_manager/rebar.config +++ b/apps/leo_manager/rebar.config @@ -25,7 +25,7 @@ {deps, [ {leo_commons, ".*", {git, "https://github.com/leo-project/leo_commons.git", {tag, "1.2.0"}}}, {leo_logger, ".*", {git, "https://github.com/leo-project/leo_logger.git", {tag, "1.3.7"}}}, - {leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.30"}}}, + {leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.31"}}}, {leo_redundant_manager, ".*", {git, "https://github.com/leo-project/leo_redundant_manager.git", {tag, "1.9.59"}}}, {leo_rpc, ".*", {git, "https://github.com/leo-project/leo_rpc.git", {tag, "0.10.17"}}}, {leo_statistics, ".*", {git, "https://github.com/leo-project/leo_statistics.git", {tag, "1.1.22"}}}, diff --git a/apps/leo_storage/rebar.config b/apps/leo_storage/rebar.config index 
ed622217..a10f6507 100644 --- a/apps/leo_storage/rebar.config +++ b/apps/leo_storage/rebar.config @@ -26,7 +26,7 @@ {leo_commons, ".*", {git, "https://github.com/leo-project/leo_commons.git", {tag, "1.2.0"}}}, {leo_logger, ".*", {git, "https://github.com/leo-project/leo_logger.git", {tag, "1.3.7"}}}, {leo_mq, ".*", {git, "https://github.com/leo-project/leo_mq.git", {tag, "1.5.16"}}}, - {leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.30"}}}, + {leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.31"}}}, {leo_ordning_reda, ".*", {git, "https://github.com/leo-project/leo_ordning_reda.git", {tag, "1.2.10"}}}, {leo_redundant_manager, ".*", {git, "https://github.com/leo-project/leo_redundant_manager.git", {tag, "1.9.59"}}}, {leo_rpc, ".*", {git, "https://github.com/leo-project/leo_rpc.git", {tag, "0.10.17"}}},