Skip to content

Commit

Permalink
Merge branch 'mocchira-fix/issue983' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
yosukehara committed Mar 25, 2018
2 parents 71b25bf + a21d487 commit 4e017ee
Show file tree
Hide file tree
Showing 10 changed files with 152 additions and 9 deletions.
2 changes: 1 addition & 1 deletion apps/leo_gateway/rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
{leo_cache, ".*", {git, "https://github.com/leo-project/leo_cache.git", {tag, "0.8.9"}}},
{leo_commons, ".*", {git, "https://github.com/leo-project/leo_commons.git", {tag, "1.2.0"}}},
{leo_logger, ".*", {git, "https://github.com/leo-project/leo_logger.git", {tag, "1.3.7"}}},
{leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.30"}}},
{leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.31"}}},
{leo_pod, ".*", {git, "https://github.com/leo-project/leo_pod.git", {tag, "0.6.9"}}},
{leo_redundant_manager, ".*", {git, "https://github.com/leo-project/leo_redundant_manager.git", {tag, "1.9.59"}}},
{leo_statistics, ".*", {git, "https://github.com/leo-project/leo_statistics.git", {tag, "1.1.22"}}},
Expand Down
3 changes: 3 additions & 0 deletions apps/leo_manager/include/leo_manager.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@
{?CMD_RECOVER, lists:append(
["recover file <path>", ?CRLF,
"recover dir [<path>]", ?CRLF,
"recover disk <storage-node> <disk-id>", ?CRLF,
"recover node <storage-node>", ?CRLF,
"recover ring <storage-node>", ?CRLF,
"recover cluster <cluster-id>"
Expand Down Expand Up @@ -248,6 +249,7 @@

%% recover type
-define(RECOVER_FILE, "file").
-define(RECOVER_DISK, "disk").
-define(RECOVER_NODE, "node").
-define(RECOVER_RING, "ring").
-define(RECOVER_REMOTE_CLUSTER, "cluster").
Expand Down Expand Up @@ -330,6 +332,7 @@
-define(ERROR_FAILED_GET_VERSION, "Failed to get the version").
-define(ERROR_FAILED_REGISTERING_DEL_BUCKET_MSG, "Failed to register a del-bucket's message").
-define(ERROR_FAILED_REMOVING_DEL_BUCKET_MSG, "Failed to remove a del-bucket's message").
-define(ERROR_FAILED_RECOVER_DISK_DUE_TO_DIFFERENT_AVS_CONF, "Failed to recover disk due to different AVS conf found").

%% type of console
-define(CONSOLE_CUI, 'cui').
Expand Down
2 changes: 1 addition & 1 deletion apps/leo_manager/rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
{deps, [
{leo_commons, ".*", {git, "https://github.com/leo-project/leo_commons.git", {tag, "1.2.0"}}},
{leo_logger, ".*", {git, "https://github.com/leo-project/leo_logger.git", {tag, "1.3.7"}}},
{leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.30"}}},
{leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.31"}}},
{leo_redundant_manager, ".*", {git, "https://github.com/leo-project/leo_redundant_manager.git", {tag, "1.9.59"}}},
{leo_rpc, ".*", {git, "https://github.com/leo-project/leo_rpc.git", {tag, "0.10.17"}}},
{leo_statistics, ".*", {git, "https://github.com/leo-project/leo_statistics.git", {tag, "1.1.22"}}},
Expand Down
83 changes: 82 additions & 1 deletion apps/leo_manager/src/leo_manager_api.erl
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@

-export([register/4, register/7, register/8,
notify/3, notify/4, purge/1, remove/1,
whereis/2, recover/3, rebuild_dir_metadata/2,
whereis/2, recover/3, recover/4, rebuild_dir_metadata/2,
compact/2, compact/4, diagnose_data/1,
stats/2,
mq_stats/1, mq_suspend/2, mq_resume/2,
Expand Down Expand Up @@ -1583,6 +1583,87 @@ recover(_,_,true) ->
recover(_,_,false) ->
{error, ?ERROR_COULD_NOT_GET_RING}.

%% @doc Recover the objects stored on one disk of a storage node.
%%      Arguments arriving from the console are strings, so the node name
%%      and the disk-id are normalized first (node -> atom, disk -> integer);
%%      after that the target node must be reachable and in the running state
%%      before the request is handed over to recover_disk_1/3.
%%      The 4th argument tells whether the routing table (ring) is available.
%%
-spec(recover(string(), atom()|string(), integer()|string(), boolean()) ->
             ok | {error, any()}).
recover(?RECOVER_DISK, Node, Disk, true) when is_list(Node) ->
    %% The node name is operator input. Atoms are never garbage-collected,
    %% so do not create a new atom for every (possibly mistyped) name;
    %% a name that is not an existing atom is reported as not connectable.
    try list_to_existing_atom(Node) of
        NodeAtom ->
            recover(?RECOVER_DISK, NodeAtom, Disk, true)
    catch
        error:badarg ->
            {error, ?ERROR_COULD_NOT_CONNECT}
    end;
recover(?RECOVER_DISK, Node, Disk, true) when is_list(Disk) ->
    %% Only the conversion is protected; the recursive call is not.
    try list_to_integer(Disk) of
        DiskId ->
            recover(?RECOVER_DISK, Node, DiskId, true)
    catch
        error:badarg ->
            {error, ?ERROR_INVALID_ARGS}
    end;
recover(?RECOVER_DISK, Node, Disk, true) ->
    %% Check the target node and system-state
    case leo_misc:node_existence(Node) of
        true ->
            IsRunning = case leo_redundant_manager_api:get_member_by_node(Node) of
                            {ok, #member{state = ?STATE_RUNNING}} ->
                                true;
                            _ ->
                                false
                        end,
            recover_disk_1(IsRunning, Node, Disk);
        false ->
            {error, ?ERROR_COULD_NOT_CONNECT}
    end;
recover(_,_,_,true) ->
    %% Unknown recover type
    {error, ?ERROR_INVALID_ARGS};
recover(_,_,_,false) ->
    %% The routing table is not available
    {error, ?ERROR_COULD_NOT_GET_RING}.

%% @doc Disk recovery, step 1.
%%      The 1st argument is 'true' when the target node is running.
%%      For a running node, is_allow_to_distribute_command/1 is consulted and
%%      its verdict and member list are passed on to recover_disk_2/4;
%%      otherwise the request is rejected.
%% @private
recover_disk_1(false, _Node, _Disk) ->
    {error, ?ERROR_TARGET_NODE_NOT_RUNNING};
recover_disk_1(true, Node, Disk) ->
    {IsAllowed, Members} = is_allow_to_distribute_command(Node),
    recover_disk_2(IsAllowed, Members, Node, Disk).

%% @doc Disk recovery, step 2.
%%      When distribution is allowed (1st argument is 'true'), retrieve the
%%      node status from every member, validate the AVS configuration and the
%%      requested disk-id, then ask every member to synchronize {Node, Disk}.
%% @private
recover_disk_2(false,_,_,_) ->
    {error, ?ERROR_NOT_SATISFY_CONDITION};
recover_disk_2(true, Members, Node, Disk) ->
    case rpc:multicall(Members, ?API_STORAGE, get_node_status,
                       [], ?DEF_TIMEOUT) of
        {StatusL, []} ->
            recover_disk_check_range(has_same_avs_conf(StatusL),
                                     Members, Node, Disk);
        {_, BadNodes} ->
            ?warn("recover_disk_2/4",
                  [{bad_nodes, BadNodes}]),
            {error, BadNodes}
    end.

%% @doc Validate the result of has_same_avs_conf/1 and make sure that
%%      the disk-id is within 1..NumOfDisks before synchronizing.
%% @private
recover_disk_check_range({true, NumOfDisks}, Members, Node, Disk)
  when 0 < Disk, Disk =< NumOfDisks ->
    recover_disk_synchronize(Members, Node, Disk);
recover_disk_check_range({true,_NumOfDisks},_Members,_Node,_Disk) ->
    {error, ?ERROR_INVALID_ARGS};
recover_disk_check_range(_,_Members,_Node,_Disk) ->
    {error, ?ERROR_FAILED_RECOVER_DISK_DUE_TO_DIFFERENT_AVS_CONF}.

%% @doc Send the synchronize request for {Node, Disk} to every member.
%%      Only unreachable nodes are treated as a failure here.
%% @private
recover_disk_synchronize(Members, Node, Disk) ->
    case rpc:multicall(Members, ?API_STORAGE, synchronize,
                       [{Node, Disk}], ?DEF_TIMEOUT) of
        {_RetL, []} ->
            ok;
        {_, BadNodes} ->
            ?warn("recover_disk_2/4",
                  [{bad_nodes, BadNodes}]),
            {error, BadNodes}
    end.

%% @doc Check if every storage node has a same AVS config.
%%      RetL is a list of per-node results; only the `{ok, Status}' entries
%%      are inspected, every other entry is ignored.
%%      For each inspected node, the 'avs' entry of its status is turned into
%%      a list holding the 'num_of_containers' value of each item.
%%      Returns `{AllSame, Size}' where AllSame tells whether those lists are
%%      equal across the nodes, and Size is the length of the first list
%%      (the caller uses it as the upper bound of a valid disk-id).
%%      When no node returned `{ok, Status}' there is nothing to compare,
%%      so `{false, 0}' is returned instead of failing with a badmatch.
%% @private
-spec(has_same_avs_conf([any()]) ->
             {boolean(), non_neg_integer()}).
has_same_avs_conf(RetL) ->
    ContainerCountsL =
        [[leo_misc:get_value('num_of_containers', Items, 0)
          || Items <- leo_misc:get_value('avs', Status, [])]
         || {ok, Status} <- RetL],
    case ContainerCountsL of
        [] ->
            {false, 0};
        [H|Rest] ->
            {lists:all(fun(Elem) ->
                               Elem == H
                       end, Rest), length(H)}
    end.

%% @doc Execute recovery of the target node
%% Check conditions
Expand Down
8 changes: 8 additions & 0 deletions apps/leo_manager/src/leo_manager_console.erl
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,14 @@ recover(Socket, Option) ->
{_, Cause} ->
{error, Cause}
end;
{ok, [?RECOVER_DISK, Node, Disk |Rest]} when Rest == [] ->
HasRoutingTable = (leo_redundant_manager_api:checksum(ring) >= 0),
case catch leo_manager_api:recover(?RECOVER_DISK, Node, Disk, HasRoutingTable) of
ok ->
ok;
{_, Cause} ->
{error, Cause}
end;
_ ->
{error, ?ERROR_INVALID_ARGS}
end.
Expand Down
2 changes: 1 addition & 1 deletion apps/leo_storage/rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
{leo_commons, ".*", {git, "https://github.com/leo-project/leo_commons.git", {tag, "1.2.0"}}},
{leo_logger, ".*", {git, "https://github.com/leo-project/leo_logger.git", {tag, "1.3.7"}}},
{leo_mq, ".*", {git, "https://github.com/leo-project/leo_mq.git", {tag, "1.5.16"}}},
{leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.30"}}},
{leo_object_storage, ".*", {git, "https://github.com/leo-project/leo_object_storage.git", {tag, "1.3.31"}}},
{leo_ordning_reda, ".*", {git, "https://github.com/leo-project/leo_ordning_reda.git", {tag, "1.2.10"}}},
{leo_redundant_manager, ".*", {git, "https://github.com/leo-project/leo_redundant_manager.git", {tag, "1.9.59"}}},
{leo_rpc, ".*", {git, "https://github.com/leo-project/leo_rpc.git", {tag, "0.10.17"}}},
Expand Down
2 changes: 1 addition & 1 deletion apps/leo_storage/src/leo_storage_api.erl
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ attach(SystemConf) ->
%% @doc Request synchronization (recovery) of the given target by publishing
%%      it to the node-recovery queue (?QUEUE_ID_RECOVERY_NODE).
%%      According to the spec, the target is either a node name or
%%      a `{Node, DiskId}' tuple.
%%
-spec(synchronize(Node) ->
ok | {error, any()} when Node::atom()).
ok | {error, any()} when Node::atom()|{atom(),pos_integer()}).
synchronize(Node) ->
leo_storage_mq:publish(?QUEUE_ID_RECOVERY_NODE, Node).

Expand Down
14 changes: 13 additions & 1 deletion apps/leo_storage/src/leo_storage_mq.erl
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ start_1([{Id, Path}|Rest], Sup, Root) ->


%% @doc Input a message into the queue.
-spec(publish(mq_id(), atom()|binary()) ->
-spec(publish(mq_id(), atom()|binary()|{atom(),pos_integer()}) ->
ok | {error, any()}).
publish(?QUEUE_ID_RECOVERY_NODE = Id, Node) ->
KeyBin = term_to_binary(Node),
Expand Down Expand Up @@ -412,6 +412,9 @@ handle_call({consume, ?QUEUE_ID_ASYNC_DELETION, MessageBin}) ->

handle_call({consume, ?QUEUE_ID_RECOVERY_NODE, MessageBin}) ->
case catch binary_to_term(MessageBin) of
#recovery_node_message{node = NodeAndDisk} when is_tuple(NodeAndDisk) ->
{Node, Disk} = NodeAndDisk,
recover_disk(Node, Disk);
#recovery_node_message{node = Node} ->
recover_node(Node);
_ ->
Expand Down Expand Up @@ -492,6 +495,15 @@ handle_call(_,_,_) ->
%% INTERNAL FUNCTIONS-1
%%--------------------------------------------------------------------
%% @doc Recover the objects related to the given disk of the given node.
%%      The callback built by recover_node_callback/1 for Node is passed to
%%      leo_object_storage_api:fetch_by_addr_id_and_disk/3 together with
%%      the address-id 0 and the disk-id.
%%      The result of the fetch is ignored; 'ok' is always returned.
%% @private
-spec(recover_disk(Node, Disk) ->
             ok when Node::node(),
                     Disk::pos_integer()).
recover_disk(Node, Disk) ->
    _ = leo_object_storage_api:fetch_by_addr_id_and_disk(
          0, Disk, recover_node_callback(Node)),
    ok.

%% @private
-spec(recover_node(Node) ->
ok when Node::node()).
Expand Down
23 changes: 20 additions & 3 deletions docs/admin/system_operations/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ This section provides information about the recovery commands that can be used i
| Shell | Description |
|--- |--- |
|leofs-adm recover-file \<file-path\>|Recover an inconsistent object specified by the file-path.|
|leofs-adm recover-disk \<storage-node\> \<disk-id\>|Recover all inconsistent objects on the specified disk of the specified storage-node. Note that this command can be used ONLY when all LeoStorage nodes have the same obj_containers configuration.|
|leofs-adm recover-node \<storage-node\>|Recover all inconsistent objects in the specified storage-node.|
|leofs-adm recover-cluster \<cluster-id\>|Recover all inconsistent objects in the specified cluster-id.|

Expand All @@ -235,6 +236,22 @@ This section provides information about the recovery commands that can be used i
$ leofs-adm recover-file leo/fast/storage.key
OK
```

#### recover-disk

```bash
## Example:
## If you have the following configuration in leo_storage.conf
## obj_containers.path = [./avs1,./avs2]
## then the below command will recover files stored under ./avs1
$ leofs-adm recover-disk storage_0@127.0.0.1 1
OK

## If you want to recover files stored under ./avs2 then issue the below one.
$ leofs-adm recover-disk storage_0@127.0.0.1 2
OK
```

#### recover-node

```bash
Expand All @@ -243,7 +260,7 @@ $ leofs-adm recover-node [email protected]
OK
```

- recover-cluster
#### recover-cluster

```bash
## Example:
Expand All @@ -257,15 +274,15 @@ OK
When/How to use recover commands.

- AVS/KVS Broken
- Invoke `recover-node` with a node having broken AVS/KVS files.
- Invoke `recover-node` with a node having broken AVS/KVS files or `recover-disk` with a disk having broken AVS/KVS files if you have multiple container directories.
- Queue Broken
- Invoke `recover-node` with every node except the one having broken Queue files.
- The procedure might be improved in the future once [issue#618](https://github.com/leo-project/leofs/issues/618) is solved.
- Disk Broken
- Invoke `suspend` with a node having broken Disk arrays and subsequently run `leo_storage stop`.
- Exchange broken Disk arrays.
- Run `leo_storage start` and subsequently Invoke `resume` with the node.
- Invoke `recover-node` with the node.
- Invoke `recover-node` with the node or `recover-disk` with the broken disk if you have multiple container directories.
- Node Broken
- Invoke `detach` with a broken node.
- Prepare a new node that will take over all objects assigned to a detached node.
Expand Down
22 changes: 22 additions & 0 deletions leofs-adm
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,20 @@ usage_recover_file() {
# }
# <<

# Print the usage of the "recover-disk" command.
#   $1 = "min" : print the one-line form used in the command overview
#   otherwise  : print the full usage together with the description
usage_recover_disk() {
    USAGE="recover-disk $STORAGE_NODE <disk-id>"
    if [ "$1" = "min" ]; then
        output "$WHITESPACE $USAGE "
    else
        output "Usage:"
        output " $SCRIPT $USAGE"
        output "Description:"
        output " - Recover all inconsistent objects on the specified disk in the specified node"
    fi
}

usage_recover_node() {
USAGE="recover-node $STORAGE_NODE"
case "$1" in
Expand Down Expand Up @@ -932,6 +946,7 @@ usage() {
output ""
output " ${UNDERLINE}Recover Commands:${NONE}"
usage_recover_file min
usage_recover_disk min
usage_recover_node min
usage_recover_ring min
usage_recover_cluster min
Expand Down Expand Up @@ -1194,6 +1209,13 @@ case "$1" in
# fi
# ;;
# <<
recover-disk)
if [ $# -eq 3 ]; then
send_command "recover disk $2 $3"
else
usage_recover_disk
fi
;;
recover-node)
if [ $# -eq 2 ]; then
case "$2" in
Expand Down

0 comments on commit 4e017ee

Please sign in to comment.