From 499113ebac4e0bbb3d917787e1b2cfac7c7a4b5f Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 22 Mar 2024 17:07:21 -0300 Subject: [PATCH 1/4] perf: add concurrency Previously, with single-process concurrency: ```sh bash-5.1# time _build/default/bin/emqx_schema_validate -j 1 /schema.json real 2m44.926s user 0m3.740s sys 0m1.366s ``` With 11 concurrent processes: ```sh bash-5.1# time _build/default/bin/emqx_schema_validate -j 11 /schema.json real 0m14.165s user 0m4.292s sys 0m0.861s ``` --- src/emqx_schema_validate.erl | 82 +++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/src/emqx_schema_validate.erl b/src/emqx_schema_validate.erl index 1e75525..9139dfb 100644 --- a/src/emqx_schema_validate.erl +++ b/src/emqx_schema_validate.erl @@ -1,21 +1,27 @@ -module(emqx_schema_validate). %% API exports --export([main/1]). +-export([main/1, do_spellcheck_schema/1]). --define(TAB, stats_tab). +-define(DEFAULT_MAX_CONCURRENCY, 10). %%==================================================================== %% API functions %%==================================================================== -main(["-"]) -> +main(Args) -> + main(Args, ?DEFAULT_MAX_CONCURRENCY). + +main(["-j", N0 | Rest], _MaxConcurrency) -> + N = list_to_integer(N0), + main(Rest, N); +main(["-"], MaxConcurrency) -> Binary = iolist_to_binary(read_stdio()), - process_data(Binary); -main([JsonFile]) -> + process_data(Binary, MaxConcurrency); +main([JsonFile], MaxConcurrency) -> {ok, Binary} = file:read_file(JsonFile), - process_data(Binary); -main(_) -> + process_data(Binary, MaxConcurrency); +main(_, _) -> io:format("Usage: emqx_schema_validate ~n", []), erlang:halt(1). 
@@ -23,26 +29,53 @@ main(_) -> %% Internal functions %%==================================================================== -process_data(Binary) -> +process_data(Binary, MaxConcurrency) -> {ok, _} = application:ensure_all_started(emqx_schema_validate), langtool:start(), Data = jsone:decode(Binary, [{object_format, map}, {keys, atom}]), - spellcheck_schema(Data), + spellcheck_schema(Data, MaxConcurrency), case is_ok() of true -> halt(0); false -> halt(1) end. -spellcheck_schema(Data) -> +spellcheck_schema(Data, MaxConcurrency) -> + Chunk = lists:sublist(Data, MaxConcurrency), + Rest = lists:sublist(Data, MaxConcurrency, length(Data)), + Refs = + lists:foldl( + fun(Node, Acc) -> + {Pid, Ref} = spawn_monitor(?MODULE, do_spellcheck_schema, [Node]), + Acc#{{Pid, Ref} => true} + end, + #{}, + Chunk), + spellcheck_schema_loop(Rest, Refs). + +spellcheck_schema_loop([] = _Data, Refs) when map_size(Refs) == 0 -> + ok; +spellcheck_schema_loop([] = Data, Refs0) -> + receive + {'DOWN', Ref, process, Pid, _} when is_map_key({Pid, Ref}, Refs0) -> + Refs = maps:remove({Pid, Ref}, Refs0), + spellcheck_schema_loop(Data, Refs) + end; +spellcheck_schema_loop([Node | Rest], Refs0) -> + receive + {'DOWN', Ref, process, Pid, _} when is_map_key({Pid, Ref}, Refs0) -> + Refs1 = maps:remove({Pid, Ref}, Refs0), + {NPid, NRef} = spawn_monitor(?MODULE, do_spellcheck_schema, [Node]), + Refs = Refs1#{{NPid, NRef} => true}, + spellcheck_schema_loop(Rest, Refs) + end. + +do_spellcheck_schema(Node = #{full_name := FullName}) -> + do_spellcheck([FullName], Node), + Fields = maps:get(fields, Node, []), [begin - do_spellcheck([FullName], Node), - Fields = maps:get(fields, Node, []), - [begin - FieldName = [FullName, maps:get(name, Field)], - do_spellcheck(FieldName, Field) - end || Field <- Fields] - end || Node = #{full_name := FullName} <- Data], - ok. + FieldName = [FullName, maps:get(name, Field)], + do_spellcheck(FieldName, Field) + end || Field <- Fields]. 
%% Check spelling in any description: do_spellcheck(FullName, #{desc := Desc}) -> @@ -52,17 +85,18 @@ do_spellcheck(FullName, #{desc := Desc}) -> ok; L -> setfail(), - io:format(user, "!! '~s'~n~n", [format_name(FullName)]), - [io:format(user, "~s", [langtool:format_warning(I)]) || I <- L], + Header = io_lib:format("!! '~s'~n~n", [format_name(FullName)]), + Warnings = [io_lib:format("~s", [langtool:format_warning(I)]) || I <- L], + io:put_chars(user, [Header, Warnings]), ok end; %% Ignore references to structs, since the struct itself should have a description -do_spellcheck(FullName, #{type := #{kind := <<"struct">>}}) -> +do_spellcheck(_FullName, #{type := #{kind := <<"struct">>}}) -> ok; do_spellcheck(FullName, _) -> Record = hd(FullName), case binary:match(Record, [<<"Root Config Keys">>]) of - nomatch -> + nomatch -> io:format(user, "Error: '~s' doesn't have a description~n", [format_name(FullName)]), setfail(); _ -> ok @@ -79,10 +113,10 @@ read_stdio() -> end. setfail() -> - put(?MODULE, true). + persistent_term:put(?MODULE, true). is_ok() -> - get(?MODULE) =/= true. + persistent_term:get(?MODULE, false) =/= true. format_name(FullName) -> lists:join("::", FullName). From c6dcd96cb9c033d4c7ba337250a319f3f73b0d80 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 22 Mar 2024 17:58:48 -0300 Subject: [PATCH 2/4] fix: pin base docker image version Newer versions introduce more checks that fail with our current schema docs. 
--- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 949e82e..e557343 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM erikvl87/languagetool +FROM erikvl87/languagetool:5.8-dockerupdate-2 # Improving the spell checker # http://wiki.languagetool.org/hunspell-support From 10222a9f985da685652be17b10035dcd1790fad5 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 25 Mar 2024 09:23:10 -0300 Subject: [PATCH 3/4] style: change style of concurrency --- src/emqx_schema_validate.erl | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/src/emqx_schema_validate.erl b/src/emqx_schema_validate.erl index 9139dfb..ad77f78 100644 --- a/src/emqx_schema_validate.erl +++ b/src/emqx_schema_validate.erl @@ -39,35 +39,13 @@ process_data(Binary, MaxConcurrency) -> false -> halt(1) end. -spellcheck_schema(Data, MaxConcurrency) -> +spellcheck_schema(Data = [_ | _], MaxConcurrency) -> Chunk = lists:sublist(Data, MaxConcurrency), Rest = lists:sublist(Data, MaxConcurrency, length(Data)), - Refs = - lists:foldl( - fun(Node, Acc) -> - {Pid, Ref} = spawn_monitor(?MODULE, do_spellcheck_schema, [Node]), - Acc#{{Pid, Ref} => true} - end, - #{}, - Chunk), - spellcheck_schema_loop(Rest, Refs). - -spellcheck_schema_loop([] = _Data, Refs) when map_size(Refs) == 0 -> - ok; -spellcheck_schema_loop([] = Data, Refs0) -> - receive - {'DOWN', Ref, process, Pid, _} when is_map_key({Pid, Ref}, Refs0) -> - Refs = maps:remove({Pid, Ref}, Refs0), - spellcheck_schema_loop(Data, Refs) - end; -spellcheck_schema_loop([Node | Rest], Refs0) -> - receive - {'DOWN', Ref, process, Pid, _} when is_map_key({Pid, Ref}, Refs0) -> - Refs1 = maps:remove({Pid, Ref}, Refs0), - {NPid, NRef} = spawn_monitor(?MODULE, do_spellcheck_schema, [Node]), - Refs = Refs1#{{NPid, NRef} => true}, - spellcheck_schema_loop(Rest, Refs) - end. 
+ _ = rpc:pmap({?MODULE, do_spellcheck_schema}, _ExtraArgs = [], Chunk), + spellcheck_schema(Rest, MaxConcurrency); +spellcheck_schema(_Data = [], _MaxConcurrency) -> + ok. do_spellcheck_schema(Node = #{full_name := FullName}) -> do_spellcheck([FullName], Node), From db72dffb42a1e7bf57b697d3f5c0f099e518db63 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 25 Mar 2024 09:31:41 -0300 Subject: [PATCH 4/4] chore: fix run script --- run.sh | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/run.sh b/run.sh index bb8e7c9..0bda80e 100755 --- a/run.sh +++ b/run.sh @@ -7,9 +7,22 @@ for dict in /dicts/*.txt; do cat "$dict" >> org/languagetool/resource/en/hunspell/spelling.txt done -ARGS="$*" +SCHEMA_FILE="" +MAX_CONCURRENCY="10" +while [ "$#" -gt 0 ]; do + case "$1" in + -j) + MAX_CONCURRENCY="$2" + shift 2 + ;; + *) + SCHEMA_FILE="$1" + shift + ;; + esac +done -if [ -z "${ARGS}" ]; then +if [ -z "${SCHEMA_FILE}" ]; then echo "Missing schema file" exit 1 fi @@ -30,6 +43,6 @@ while ! curl --fail --data "language=en-US&text=a simple test" http://localhost: sleep 1 done -echo "Checking $ARGS..." +echo "Checking $SCHEMA_FILE with $MAX_CONCURRENCY processes..." -emqx_schema_validate "$ARGS" +emqx_schema_validate -j "$MAX_CONCURRENCY" "$SCHEMA_FILE"