Skip to content

Commit

Permalink
fix(duplication): failed to start duplication after source replica se…
Browse files Browse the repository at this point in the history
…rvers exit due to missing duplicating status in .app-info (#1608)

#1595

Currently, after a duplication was added, `duplicating` status would be only updated
for in-memory state. However, once some replica servers exited and were started again,
`duplicating` status would be lost since it had not been persisted into `.app-info` before.
Consequently, plog files would be removed by GC, which lead to failed assertion for decree
in `replica_duplicator::verify_start_decree()`.

To resolve this problem, `duplicating` should be updated for both in-memory and on-disk
states. Then, `duplicating` status would be loaded correctly from `.app-info` file at the loading
stage of replicas.
  • Loading branch information
ninsmiracle authored Apr 3, 2024
1 parent 937e337 commit f5e1b81
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 7 deletions.
29 changes: 23 additions & 6 deletions src/replica/duplication/duplication_sync_timer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,19 +116,37 @@ void duplication_sync_timer::on_duplication_sync_reply(error_code err,
void duplication_sync_timer::update_duplication_map(
const std::map<int32_t, std::map<int32_t, duplication_entry>> &dup_map)
{
for (const replica_ptr &r : _stub->get_all_replicas()) {
for (replica_ptr &r : _stub->get_all_replicas()) {
auto dup_mgr = r->get_duplication_manager();
if (!dup_mgr) {
continue;
}

const auto &it = dup_map.find(r->get_gpid().get_app_id());
if (it == dup_map.end()) {
// no duplication is assigned to this app

// TODO(wangdan): at meta server side, an app is considered duplicating
// as long as any duplication of this app has valid status(i.e.
// duplication_info::is_invalid_status() returned false, see
// meta_duplication_service::refresh_duplicating_no_lock()). And duplications
// in duplication_sync_response returned by meta server could also be
// considered duplicating according to meta_duplication_service::duplication_sync().
// Thus we could update `duplicating` in both memory and file(.app-info).
//
// However, most of properties of an app(struct `app_info`) are written to .app-info
// file in replica::on_config_sync(), such as max_replica_count; on the other hand,
// in-memory `duplicating` is also updated in replica::on_config_proposal(). Thus we'd
// better think about a unique entrance to update `duplicating`(in both memory and disk),
// rather than update them at anywhere.
const auto duplicating = it != dup_map.end();
r->update_app_duplication_status(duplicating);

if (!duplicating) {
// No duplication is assigned to this app.
dup_mgr->update_duplication_map({});
} else {
dup_mgr->update_duplication_map(it->second);
continue;
}

dup_mgr->update_duplication_map(it->second);
}
}

Expand Down Expand Up @@ -198,6 +216,5 @@ duplication_sync_timer::get_dup_states(int app_id, /*out*/ bool *app_found)
}
return result;
}

} // namespace replication
} // namespace dsn
2 changes: 1 addition & 1 deletion src/replica/duplication/replica_duplicator_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class replica_duplicator_manager : public replica_base
// - the app is not assigned with duplication (dup_map.empty())
void update_duplication_map(const std::map<int32_t, duplication_entry> &new_dup_map)
{
if (_replica->status() != partition_status::PS_PRIMARY || new_dup_map.empty()) {
if (new_dup_map.empty() || _replica->status() != partition_status::PS_PRIMARY) {
remove_all_duplications();
return;
}
Expand Down
2 changes: 2 additions & 0 deletions src/replica/replica.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ class replica : public serverlet<replica>, public ref_counter, public replica_ba
_is_duplication_plog_checking.store(checking);
}

void update_app_duplication_status(bool duplicating);

//
// Backup
//
Expand Down
19 changes: 19 additions & 0 deletions src/replica/replica_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1176,5 +1176,24 @@ error_code replica::update_init_info_ballot_and_decree()
return _app->update_init_info_ballot_and_decree(this);
}

void replica::update_app_duplication_status(bool duplicating)
{
if (duplicating == _app_info.duplicating) {
return;
}

auto old_duplicating = _app_info.duplicating;
_app_info.__set_duplicating(duplicating);

CHECK_EQ_PREFIX_MSG(store_app_info(_app_info),
ERR_OK,
"store_app_info for duplicating failed: app_name={}, "
"app_id={}, old_duplicating={}, new_duplicating={}",
_app_info.app_name,
_app_info.app_id,
old_duplicating,
_app_info.duplicating);
}

} // namespace replication
} // namespace dsn

0 comments on commit f5e1b81

Please sign in to comment.