Skip to content
This repository has been archived by the owner on Nov 5, 2023. It is now read-only.

Commit

Permalink
introduce metrics export framework for prometheus
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffgrunewald committed Aug 8, 2022
1 parent 90579f9 commit 58482f1
Show file tree
Hide file tree
Showing 9 changed files with 278 additions and 12 deletions.
4 changes: 4 additions & 0 deletions config/docker_node.config.src
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
[
{nat_map, #{ {"${NAT_INTERNAL_IP}", "${NAT_INTERNAL_PORT}"} => {"${NAT_EXTERNAL_IP}", "${NAT_EXTERNAL_PORT}"}}}
]},
{blockchain_node, [
{metrics, [block_metrics, txn_metrics, grpc_metrics]},
{metrics_port, ${METRICS_PORT:-9090}}
]},
{blockchain,
[
{base_dir, "/var/data"}
Expand Down
4 changes: 4 additions & 0 deletions config/prod.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
[
{log_root, "log"}
]},
{blockchain_node, [
{metrics, [block_metrics, txn_metrics, grpc_metrics]},
{metrics_port, 9090}
]},
{blockchain,
[
{base_dir, "data"}
Expand Down
2 changes: 2 additions & 0 deletions rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
jsone,
{jsonrpc2, {git, "https://github.com/novalabsxyz/jsonrpc2-erlang.git", {branch, "master"}}},
{observer_cli, "1.7.1"},
{telemetry, "1.1.0"},
{prometheus, "4.8.2"},
pbkdf2
]}.

Expand Down
20 changes: 10 additions & 10 deletions rebar.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
{<<"base64url">>,{pkg,<<"base64url">>,<<"1.0.1">>},1},
{<<"blockchain">>,
{git,"https://github.com/helium/blockchain-core.git",
{ref,"edc710e9893b272c31c68de00b29a2b2dac619dd"}},
{ref,"9da2cea76574c147a792aa5a3c2ed9d332d059a1"}},
0},
{<<"certifi">>,{pkg,<<"certifi">>,<<"2.9.0">>},2},
{<<"chatterbox">>,
Expand All @@ -19,15 +19,15 @@
{git,"https://github.com/helium/clique.git",
{ref,"e4be0dae150061bec080502ed530091672880867"}},
1},
{<<"cream">>,
{git,"https://github.com/helium/cream",
{ref,"9fd0ff78ab4d30f8ea16212b504c817290dfaf64"}},
1},
{<<"ctx">>,{pkg,<<"ctx">>,<<"0.6.0">>},2},
{<<"cuttlefish">>,
{git,"https://github.com/helium/cuttlefish.git",
{ref,"8672838e8f4ef61602aee6e4ff97ec9be54031dc"}},
2},
{<<"e2qc">>,
{git,"https://github.com/helium/e2qc",
{ref,"d77231471ab93ad38bb6763870917ff9499cdc73"}},
1},
{<<"ecc_compact">>,{pkg,<<"ecc_compact">>,<<"1.1.1">>},3},
{<<"elli">>,{pkg,<<"elli">>,<<"3.3.0">>},0},
{<<"enacl">>,
Expand Down Expand Up @@ -111,8 +111,8 @@
{<<"parse_trans">>,{pkg,<<"parse_trans">>,<<"3.3.1">>},2},
{<<"pbkdf2">>,{pkg,<<"pbkdf2">>,<<"2.0.0">>},0},
{<<"procket">>,{pkg,<<"procket">>,<<"0.9.6">>},3},
{<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.8.1">>},2},
{<<"quantile_estimator">>,{pkg,<<"quantile_estimator">>,<<"0.2.1">>},3},
{<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.8.2">>},0},
{<<"quantile_estimator">>,{pkg,<<"quantile_estimator">>,<<"0.2.1">>},1},
{<<"ranch">>,{pkg,<<"ranch">>,<<"1.5.0">>},2},
{<<"rand_compat">>,{pkg,<<"rand_compat">>,<<"0.0.3">>},3},
{<<"recon">>,{pkg,<<"recon">>,<<"2.5.2">>},1},
Expand All @@ -128,7 +128,7 @@
{<<"small_ints">>,{pkg,<<"small_ints">>,<<"0.1.0">>},4},
{<<"splicer">>,{pkg,<<"splicer">>,<<"0.5.5">>},2},
{<<"ssl_verify_fun">>,{pkg,<<"ssl_verify_fun">>,<<"1.1.6">>},2},
{<<"telemetry">>,{pkg,<<"telemetry">>,<<"1.1.0">>},1},
{<<"telemetry">>,{pkg,<<"telemetry">>,<<"1.1.0">>},0},
{<<"throttle">>,{pkg,<<"lambda_throttle">>,<<"0.2.0">>},2},
{<<"unicode_util_compat">>,{pkg,<<"unicode_util_compat">>,<<"0.7.0">>},2},
{<<"vincenty">>,
Expand Down Expand Up @@ -169,7 +169,7 @@
{<<"parse_trans">>, <<"16328AB840CC09919BD10DAB29E431DA3AF9E9E7E7E6F0089DD5A2D2820011D8">>},
{<<"pbkdf2">>, <<"11C23279FDED5C0027AB3996CFAE77805521D7EF4BABDE2BD7EC04A9086CF499">>},
{<<"procket">>, <<"467014637052CAB030DD32D5269591B82763420C9572F61CD6C5657FE4579777">>},
{<<"prometheus">>, <<"FA76B152555273739C14B06F09F485CF6D5D301FE4E9D31B7FF803D26025D7A0">>},
{<<"prometheus">>, <<"B88F24279DD7A1F512CB090595FF6C88B50AAD0A6B394A4C4983725723DCD834">>},
{<<"quantile_estimator">>, <<"EF50A361F11B5F26B5F16D0696E46A9E4661756492C981F7B2229EF42FF1CD15">>},
{<<"ranch">>, <<"F04166F456790FEE2AC1AA05A02745CC75783C2BFB26D39FAF6AEFC9A3D3A58A">>},
{<<"rand_compat">>, <<"011646BC1F0B0C432FE101B816F25B9BBB74A085713CEE1DAFD2D62E9415EAD3">>},
Expand Down Expand Up @@ -210,7 +210,7 @@
{<<"parse_trans">>, <<"07CD9577885F56362D414E8C4C4E6BDF10D43A8767ABB92D24CBE8B24C54888B">>},
{<<"pbkdf2">>, <<"1E793CE6FDB0576613115714DEAE9DFC1D1537EABA74F07EFB36DE139774488D">>},
{<<"procket">>, <<"037287B8D10A747E54A69BDCE56724ECC8BE12BCB8233AE4D6F885F0ED71E895">>},
{<<"prometheus">>, <<"6EDFBE928D271C7F657A6F2C46258738086584BD6CAE4A000B8B9A6009BA23A5">>},
{<<"prometheus">>, <<"C3ABD6521E52CEC4F0D8ECA603CF214DFC84D8A27AA85946639F1424B8554D98">>},
{<<"quantile_estimator">>, <<"282A8A323CA2A845C9E6F787D166348F776C1D4A41EDE63046D72D422E3DA946">>},
{<<"ranch">>, <<"86D40FC42AA47BCB6952DDF1DBFD3DA04B5BA69AFB65C322C99845913250B11F">>},
{<<"rand_compat">>, <<"CDF7BE2B17308EC245B912C45FE55741F93B6E4F1A24BA6074F7137B0CC09BF4">>},
Expand Down
5 changes: 4 additions & 1 deletion src/blockchain_node.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
observer_cli,
clique,
recon,
grpcbox
grpcbox,
elli,
telemetry,
prometheus
]
},
{included_applications, [blockchain]},
Expand Down
3 changes: 2 additions & 1 deletion src/bn_sup.erl
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ init([]) ->
[{follower_module, {bn_gateways, [{base_dir, BaseDir}]}}]
]),
?WORKER(bn_wallets, [[{base_dir, BaseDir}]]),
?WORKER(elli, [[{callback, bn_jsonrpc_handler}, {port, NodePort}]])
?WORKER(elli, [[{callback, bn_jsonrpc_handler}, {port, NodePort}]]),
?WORKER(bn_metrics_server, [])
]}}.

random_val_predicate(Peer) ->
Expand Down
19 changes: 19 additions & 0 deletions src/metrics/bn_metrics_exporter.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-module(bn_metrics_exporter).

-behaviour(elli_handler).

-include_lib("elli/include/elli.hrl").

-export([handle/2, handle_event/3]).

handle(Req, _Args) ->
handle(Req#req.method, elli_request:path(Req), Req).

%% Expose `/metrics` for Prometheus as a scrape target
handle('GET', [<<"metrics">>], _Req) ->
{ok, [], prometheus_text_format:format()};
handle(_Verb, _Path, _Req) ->
ignore.

handle_event(_Event, _Data, _Args) ->
ok.
175 changes: 175 additions & 0 deletions src/metrics/bn_metrics_server.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
-module(bn_metrics_server).

-behaviour(gen_server).

-include("metrics/metrics.hrl").

-export([
handle_metric/4,
start_link/0
]).

-export([
init/1,
handle_call/3,
handle_cast/2,
handle_info/2,
terminate/2,
code_change/3
]).

-define(SERVER, ?MODULE).

-type metric() :: {
Metric :: string(),
Event :: [atom()],
PrometheusHandler :: module(),
Labels :: [atom()],
Description :: string()
}.

-type metrics() :: [metric()].

-type exporter_opts() :: [
{callback, module()} |
{callback_args, map()} |
{port, integer()}
].

-record(state, {
metrics :: metrics(),
exporter_opts :: exporter_opts(),
exporter_pid :: pid() | undefined
}).

handle_metric(Event, Measurements, Metadata, _Config) ->
handle_metric_event(Event, Measurements, Metadata).

start_link() ->
gen_server:start_link({local, ?SERVER}, ?SERVER, [], []).

init(_Args) ->
case get_configs() of
{[], []} -> ignore;
{[_ | _], [_ | _]} = Metrics ->
erlang:process_flag(trap_exit, true),

ok = setup_metrics(Metrics),

ElliOpts = [
{callback, be_metrics_exporter},
{callback_args, #{}},
{port, application:get_env(blockchain_node, metrics_port, 9090)}
],
{ok, ExporterPid} = elli:start_link(ElliOpts),
{ok, #state{
metrics = Metrics,
exporter_opts = ElliOpts,
exporter_pid = ExporterPid}}
end.

handle_call(_Msg, _From, State) ->
lager:debug("Received unknown call msg: ~p from ~p", [_Msg, _From]),
{reply, ok, State}.

handle_cast(_Msg, State) ->
{noreply, State}.

handle_info({'EXIT', ExporterPid, Reason}, #state{exporter_pid=ExporterPid} = State) ->
lager:warning("Metrics exporter exited with reason ~p, restarting", [Reason]),
{ok, NewExporter} = elli:start_link(State#state.exporter_opts),
{noreply, State#state{exporter_pid = NewExporter}};
handle_info(_Msg, State) ->
lager:debug("Received unknown info msg: ~p", [_Msg]),
{noreply, State}.

code_change(_OldVsn, State, _Extra) ->
{ok, State}.

terminate(Reason, #state{metrics = Metrics, exporter_pid = Exporter}) ->
true = erlang:exit(Exporter, Reason),
lists:foreach(
fun({Metric, Module, _, _}) ->
lager:info("De-registering metric ~p as ~p", [Metric, Module]),
Module:deregister(Metric)
end,
Metrics
).

setup_metrics({EventNames, EventSpecs}) ->
lager:warning("METRICS ~p", [EventSpecs]),
lists:foreach(
fun({Metric, Module, Meta, Description}) ->
lager:info("Declaring metric ~p as ~p meta=~p", [Metric, Module, Meta]),
MetricOpts = [{name, Metric}, {help, Description}, {labels, Meta}],
case Module of
prometheus_histogram ->
Module:declare(MetricOpts ++ [{buckets, ?METRICS_HISTOGRAM_BUCKETS}]);
_ ->
Module:declare(MetricOpts)
end
end,
EventSpecs
),

ok = telemetry:attach_many(<<"bn-metrics-handler">>, EventNames, fun bn_metrics_server:handle_metric/4, []).

get_configs() ->
lists:foldl(
fun(Metric, {Names, Specs} = Acc) ->
case maps:get(Metric, ?METRICS, undefined) of
undefined -> Acc;
{N, S} -> {Names ++ N, Specs ++ S}
end
end,
{[], []},
application:get_env(blockchain_node, metrics, [])
).

handle_metric_event([blockchain, block, absorb], #{duration := Duration}, #{stage := Stage}) ->
prometheus_histogram:observe(?METRICS_BLOCK_ABSORB, [Stage], Duration),
ok;
handle_metric_event([blockchain, block, height], #{height := Height}, #{time := Time}) ->
prometheus_gauge:set(?METRICS_BLOCK_HEIGHT, [Time], Height),
ok;
handle_metric_event([blockchain, block, unvalidated_absorb], #{duration := Duration}, #{stage := Stage}) ->
prometheus_histogram:observe(?METRICS_BLOCK_UNVAL_ABSORB, [Stage], Duration),
ok;
handle_metric_event([blockchain, block, unvalidated_height], #{height := Height}, #{time := Time}) ->
prometheus_gauge:set(?METRICS_BLOCK_UNVAL_HEIGHT, [Time], Height),
ok;
handle_metric_event([blockchain, txn, absorb], #{duration := Duration}, #{type := Type}) ->
prometheus_histogram:observe(?METRICS_TXN_ABSORB_DURATION, [Type], Duration),
ok;
handle_metric_event([blockchain, txn_mgr, submit], _Measurements, #{type := Type}) ->
prometheus_counter:inc(?METRICS_TXN_SUBMIT_COUNT, [Type]),
ok;
handle_metric_event([blockchain, txn_mgr, reject], #{block_span := Span}, #{type := Type}) ->
prometheus_counter:inc(?METRICS_TXN_REJECT_COUNT, [Type]),
prometheus_gauge:set(?METRICS_TXN_BLOCK_SPAN, [], Span),
ok;
handle_metric_event([blockchain, txn_mgr, accept], #{block_span := Span, queue_len := QLen}, #{type := Type}) ->
prometheus_counter:inc(?METRICS_TXN_ACCEPT_COUNT, [Type]),
prometheus_gauge:set(?METRICS_TXN_BLOCK_SPAN, [], Span),
prometheus_gauge:set(?METRICS_TXN_QUEUE, [], QLen),
ok;
handle_metric_event([blockchain, txn_mgr, update], #{block_span := Span, queue_len := QLen}, #{type := Type}) ->
prometheus_counter:inc(?METRICS_TXN_UPDATE_COUNT, [Type]),
prometheus_gauge:set(?METRICS_TXN_BLOCK_SPAN, [], Span),
prometheus_gauge:set(?METRICS_TXN_QUEUE, [], QLen),
ok;
handle_metric_event([blockchain, txn_mgr, process], #{duration := Duration}, #{stage := Stage}) ->
prometheus_histogram:observe(?METRICS_TXN_PROCESS_DURATION, [Stage], Duration),
ok;
handle_metric_event([blockchain, txn_mgr, add_block], #{cache := Cache, block_time := BlockTime, block_age := BlockAge}, #{height := Height}) ->
prometheus_gauge:set(?METRICS_TXN_CACHE_SIZE, [Height], Cache),
prometheus_gauge:set(?METRICS_TXN_BLOCK_TIME, [Height], BlockTime),
prometheus_gauge:set(?METRICS_TXN_BLOCK_AGE, [Height], BlockAge),
ok;
handle_metric_event([grpcbox, server, rpc_end], #{server_latency := Latency}, #{grpc_server_method := Method, grpc_server_status := Status}) ->
prometheus_gauge:dec(?METRICS_GRPC_SESSIONS, [Method]),
prometheus_histogram:observe(?METRICS_GRPC_LATENCY, [Method, Status], Latency),
ok;
handle_metric_event([grpcbox, server, rpc_begin], _Measurements, #{grpc_server_method := Method}) ->
prometheus_gauge:inc(?METRICS_GRPC_SESSIONS, [Method]),
ok.
58 changes: 58 additions & 0 deletions src/metrics/metrics.hrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
-define(METRICS_HISTOGRAM_BUCKETS, [50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000]).

-define(METRICS_BLOCK_ABSORB, "blockchain_block_absorb_duration").
-define(METRICS_BLOCK_UNVAL_ABSORB, "blockchain_block_unval_absorb_duration").
-define(METRICS_BLOCK_HEIGHT, "blockchain_block_height").
-define(METRICS_BLOCK_UNVAL_HEIGHT, "blockchain_block_unval_height").
-define(METRICS_TXN_ABSORB_DURATION, "blockchain_txn_absorb_duration").
-define(METRICS_TXN_BLOCK_SPAN, "blockchain_txn_mgr_block_span").
-define(METRICS_TXN_QUEUE, "blockchain_txn_mgr_queue").
-define(METRICS_TXN_SUBMIT_COUNT, "blockchain_txn_mgr_submitted_count").
-define(METRICS_TXN_REJECT_COUNT, "blockchain_txn_mgr_rejected_count").
-define(METRICS_TXN_ACCEPT_COUNT, "blockchain_txn_mgr_accepted_count").
-define(METRICS_TXN_UPDATE_COUNT, "blockchain_txn_mgr_updated_count").
-define(METRICS_TXN_PROCESS_DURATION, "blockchain_txn_mgr_process_duration").
-define(METRICS_TXN_CACHE_SIZE, "blockchain_txn_mgr_cache_size").
-define(METRICS_TXN_BLOCK_TIME, "blockchain_txn_mgr_block_time").
-define(METRICS_TXN_BLOCK_AGE, "blockchain_txn_mgr_block_age").
-define(METRICS_GRPC_SESSIONS, "grpc_session_count").
-define(METRICS_GRPC_LATENCY, "grpcbox_session_latency").

-define(METRICS, #{
block_metrics => {
[ [blockchain, block, absorb],
[blockchain, block, height],
[blockchain, block, unvalidated_absorb],
[blockchain, block, unvalidated_height] ],
[ {?METRICS_BLOCK_ABSORB, prometheus_histogram, [stage], "Block absorb duration"},
{?METRICS_BLOCK_HEIGHT, prometheus_gauge, [time], "Most recent block height"},
{?METRICS_BLOCK_UNVAL_ABSORB, prometheus_histogram, [stage], "Block unvalidated absorb duration"},
{?METRICS_BLOCK_UNVAL_HEIGHT, prometheus_gauge, [time], "Most recent unvalidated block height"} ]
},
txn_metrics => {
[ [blockchain, txn, absorb],
[blockchain, txn_mgr, submit],
[blockchain, txn_mgr, reject],
[blockchain, txn_mgr, accept],
[blockchain, txn_mgr, update],
[blockchain, txn_mgr, process],
[blockchain, txn_mgr, add_block] ],
[ {?METRICS_TXN_ABSORB_DURATION, prometheus_histogram, [stage], "Txn absorb duration"},
{?METRICS_TXN_BLOCK_SPAN, prometheus_gauge, [], "Block span on transactions"},
{?METRICS_TXN_QUEUE, prometheus_gauge, [], "Txn manager submission queue length"},
{?METRICS_TXN_SUBMIT_COUNT, prometheus_counter, [type], "Count of submitted transactions"},
{?METRICS_TXN_REJECT_COUNT, prometheus_counter, [type], "Count of rejected transactions"},
{?METRICS_TXN_ACCEPT_COUNT, prometheus_counter, [type], "Count of accepted transactions"},
{?METRICS_TXN_UPDATE_COUNT, prometheus_counter, [type], "Count of updated transaction"},
{?METRICS_TXN_PROCESS_DURATION, prometheus_histogram, [stage], "Transaction manager cache process duration"},
{?METRICS_TXN_CACHE_SIZE, prometheus_gauge, [height], "Transaction manager buffer size"},
{?METRICS_TXN_BLOCK_TIME, prometheus_gauge, [height], "Block time observed from the transaction manager"},
{?METRICS_TXN_BLOCK_AGE, prometheus_gauge, [height], "Block age observed from the transaction manager"} ]
},
grpc_metrics => {
[ [grpcbox, server, rpc_begin],
[grpcbox, server, rpc_end] ],
[ {?METRICS_GRPC_SESSIONS, prometheus_gauge, [method], "GRPC session count"},
{?METRICS_GRPC_LATENCY, prometheus_histogram, [method, status], "GRPC session latency"} ]
}
}).

0 comments on commit 58482f1

Please sign in to comment.