Skip to content

Commit

Permalink
Merge pull request #460 from rabbitmq/pre-init-missing-checkpoints-dir
Browse files Browse the repository at this point in the history
Avoid interrupting Ra system start when pre init fails
  • Loading branch information
kjnilsson authored Jul 23, 2024
2 parents ba4bf6f + b62b33d commit 61f3fde
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 45 deletions.
2 changes: 1 addition & 1 deletion src/ra_lib.erl
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ merge_with_1(none, Result, _, _) ->
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").

lists_chink_test() ->
lists_chunk_test() ->
?assertError(invalid_size, lists_chunk(0, [a])),
?assertMatch([], lists_chunk(2, [])),
?assertMatch([[a]], lists_chunk(2, [a])),
Expand Down
12 changes: 7 additions & 5 deletions src/ra_log_pre_init.erl
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ init([System]) ->
ok -> ok
catch _:Err ->
?ERROR("pre_init failed in system ~s for UId ~ts with name ~ts"
" This error may need manual intervention",
[System, UId, Name]),
throw({stop, {error, Err}})
" This error may need manual intervention, Error ~p",
[System, UId, Name, Err]),
ok
end
end|| {Name, UId} <- Regd],
{ok, #state{} , hibernate}.
Expand Down Expand Up @@ -95,11 +95,13 @@ pre_init(System, UId) ->
{error, Err} ->
?ERROR("pre_init failed to read config file for UId '~ts', Err ~p",
[UId, Err]),
exit({pre_init_failed, Err})
ok
end;
false ->
?INFO("pre_init UId '~ts' is registered but no data directory was found",
?INFO("pre_init UId '~ts' is registered but no data
directory was found, removing from ra directory",
[UId]),
_ = catch ra_directory:unregister_name(System, UId),
ok
end
end
Expand Down
76 changes: 40 additions & 36 deletions src/ra_snapshot.erl
Original file line number Diff line number Diff line change
Expand Up @@ -224,43 +224,47 @@ find_checkpoints(#?MODULE{uid = UId,
module = Module,
current = Current,
checkpoint_directory = CheckpointDir} = State) ->
true = ra_lib:is_dir(CheckpointDir),
CurrentIdx = case Current of
undefined ->
-1;
{I, _} ->
I
end,
{ok, CPFiles0} = prim_file:list_dir(CheckpointDir),
CPFiles = lists:reverse(lists:sort(CPFiles0)),
Checkpoints =
lists:filtermap(
fun(File) ->
CP = filename:join(CheckpointDir, File),
case Module:validate(CP) of
ok ->
{ok, #{index := Idx, term := Term}} =
case ra_lib:is_dir(CheckpointDir) of
false ->
State;
true ->
CurrentIdx = case Current of
undefined ->
-1;
{I, _} ->
I
end,
{ok, CPFiles0} = prim_file:list_dir(CheckpointDir),
CPFiles = lists:reverse(lists:sort(CPFiles0)),
Checkpoints =
lists:filtermap(
fun(File) ->
CP = filename:join(CheckpointDir, File),
case Module:validate(CP) of
ok ->
{ok, #{index := Idx, term := Term}} =
Module:read_meta(CP),
case Idx > CurrentIdx of
true ->
{true, {Idx, Term}};
false ->
?INFO("ra_snapshot: ~ts: removing "
"checkpoint ~s as was older than the "
"current snapshot.",
[UId, CP]),
delete(CheckpointDir, {Idx, Term}),
false
end;
Err ->
?INFO("ra_snapshot: ~ts: removing checkpoint ~s as "
"did not validate. Err: ~w",
[UId, CP, Err]),
ra_lib:recursive_delete(CP),
false
end
end, CPFiles),
State#?MODULE{checkpoints = Checkpoints}.
case Idx > CurrentIdx of
true ->
{true, {Idx, Term}};
false ->
?INFO("ra_snapshot: ~ts: removing "
"checkpoint ~s as was older than the "
"current snapshot.",
[UId, CP]),
delete(CheckpointDir, {Idx, Term}),
false
end;
Err ->
?INFO("ra_snapshot: ~ts: removing checkpoint ~s as "
"did not validate. Err: ~w",
[UId, CP, Err]),
ra_lib:recursive_delete(CP),
false
end
end, CPFiles),
State#?MODULE{checkpoints = Checkpoints}
end.

-spec init_ets() -> ok.
init_ets() ->
Expand Down
5 changes: 2 additions & 3 deletions test/coordination_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,8 @@ segment_writer_or_wal_crash_follower(Config) ->
LastIdxs =
[begin
{ok, #{current_term := T,
log := #{last_index := L}}, _} =
log := #{last_index := L,
cache_size := 0}}, _} =
ra:member_overview(S),
{T, L}
end || {_, _N} = S <- ServerIds],
Expand Down Expand Up @@ -934,8 +935,6 @@ segment_writer_or_wal_crash_follower(Config) ->

%% assert stuff
await_condition(AwaitReplicated, 100),
?assertMatch({ok, #{log := #{cache_size := 0}}, _},
ra:member_overview(Follower)),
%% follower hasn't crashed
?assertEqual(FollowerPid, ct_rpc:call(FollowerNode, erlang, whereis,
[FollowerName]))
Expand Down
51 changes: 51 additions & 0 deletions test/ra_log_2_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ all_tests() ->
recovery,
recover_many,
recovery_with_missing_directory,
recovery_with_missing_checkpoints_directory,
recovery_with_missing_config_file,
wal_crash_recover,
wal_down_read_availability,
wal_down_append_throws,
Expand Down Expand Up @@ -643,6 +645,55 @@ recovery_with_missing_directory(Config) ->
ok = ra_lib:recursive_delete(ServerDataDir),
?assertNot(filelib:is_dir(ServerDataDir)),

?assert(ra_directory:is_registered_uid(default, UId)),
application:stop(ra),
start_ra(Config),
?assertNot(ra_directory:is_registered_uid(default, UId)),

Log5 = ra_log_init(Config),
ra_log:close(Log5),
ok = ra_lib:recursive_delete(ServerDataDir),
?assertNot(filelib:is_dir(ServerDataDir)),

ok.

recovery_with_missing_checkpoints_directory(Config) ->
    %% Verifies that the ra system can be stopped and started again after
    %% the "checkpoints" sub-directory of a server's data directory has been
    %% removed. The directory is expected to be created again the next time
    %% the log is initialised.
    logger:set_primary_config(level, debug),
    UId = ?config(uid, Config),

    %% initialise and immediately close a log for this UId
    ra_log:close(ra_log_init(Config)),

    %% remove only the checkpoints directory, leaving the rest of the
    %% server data directory in place
    DataDir = ra_env:server_data_dir(default, UId),
    CheckpointsDir = filename:join(DataDir, "checkpoints"),
    ok = ra_lib:recursive_delete(CheckpointsDir),
    ?assertNot(filelib:is_dir(CheckpointsDir)),

    %% restart the ra application with the checkpoints directory missing
    application:stop(ra),
    start_ra(Config),

    %% the log must still initialise after the restart
    ra_log:close(ra_log_init(Config)),

    %% clean up the whole server data directory
    ok = ra_lib:recursive_delete(DataDir),
    ?assertNot(filelib:is_dir(DataDir)),

    ok.

recovery_with_missing_config_file(Config) ->
%% checking that the ra system can be restarted even when the config
%% file is missing
logger:set_primary_config(level, debug),
UId = ?config(uid, Config),
Log0 = ra_log_init(Config),
ra_log:close(Log0),

ServerDataDir = ra_env:server_data_dir(default, UId),
ConfigFile = filename:join(ServerDataDir, "config"),
file:delete(ConfigFile),
?assertNot(filelib:is_file(ConfigFile)),

application:stop(ra),
start_ra(Config),

Expand Down

0 comments on commit 61f3fde

Please sign in to comment.