From a61635e85218dd2e19d339385066e5a6a9c86346 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 07:57:23 -0700 Subject: [PATCH 1/7] ceph-monstore-tool: dump paxos transactions Signed-off-by: Sage Weil --- src/mon/Paxos.h | 2 +- src/tools/ceph-monstore-tool.cc | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index cab27f289a882..3df238e19332e 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -1114,7 +1114,7 @@ class Paxos { * @param t The transaction to which we will append the operations * @param bl A bufferlist containing an encoded transaction */ - void decode_append_transaction(MonitorDBStore::Transaction& t, + static void decode_append_transaction(MonitorDBStore::Transaction& t, bufferlist& bl) { MonitorDBStore::Transaction vt; bufferlist::iterator it = bl.begin(); diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc index ae608a302f2c0..f361266aff0bd 100644 --- a/src/tools/ceph-monstore-tool.cc +++ b/src/tools/ceph-monstore-tool.cc @@ -31,6 +31,7 @@ #include "global/global_init.h" #include "os/LevelDBStore.h" #include "mon/MonitorDBStore.h" +#include "mon/Paxos.h" #include "common/Formatter.h" namespace po = boost::program_options; @@ -246,6 +247,19 @@ int main(int argc, char **argv) { goto done; } bl.write_fd(fd); + } else if (cmd == "dump-paxos") { + for (version_t v = dstart; v <= dstop; ++v) { + bufferlist bl; + st.get("paxos", v, bl); + if (bl.length() == 0) + break; + cout << "\n--- " << v << " ---" << std::endl; + MonitorDBStore::Transaction tx; + Paxos::decode_append_transaction(tx, bl); + JSONFormatter f(true); + tx.dump(&f); + f.flush(cout); + } } else if (cmd == "dump-trace") { if (tfile.empty()) { std::cerr << "Need trace_file" << std::endl; From 99e605455f7bf6088be5ce5ee3d4e29ab7e03d47 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 07:58:41 -0700 Subject: [PATCH 2/7] mon/Paxos: accepted_pn_from has no semantic meaning Signed-off-by: Sage Weil --- src/mon/Paxos.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index 3df238e19332e..69419e64ab937 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -290,8 +290,9 @@ class Paxos { */ version_t accepted_pn; /** - * @todo This has something to do with the last_committed version. Not sure - * about what it entails, tbh. + * The last_committed epoch of the leader at the time we accepted the last pn. + * + * This has NO SEMANTIC MEANING, and is there only for the debug output. */ version_t accepted_pn_from; /** From b26b7f6e5e02ac6beb66e3e34e177e6448cf91cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Jul 2013 14:13:23 -0700 Subject: [PATCH 3/7] mon/Paxos: only share uncommitted value if it is next We may have an uncommitted value from our perspective (it is our lc + 1) when the collector has a much larger lc (because we have been out for the last few rounds). Only share an uncommitted value if it is in fact the next value. Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index ee2ba3b6fdb89..7785d37d4f03b 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -198,7 +198,8 @@ void Paxos::handle_collect(MMonPaxos *collect) // do we have an accepted but uncommitted value? // (it'll be at last_committed+1) bufferlist bl; - if (get_store()->exists(get_name(), last_committed+1)) { + if (collect->last_committed == last_committed && + get_store()->exists(get_name(), last_committed+1)) { get_store()->get(get_name(), last_committed+1, bl); assert(bl.length() > 0); dout(10) << " sharing our accepted but uncommitted value for " From b3253a453c057914753846c77499f98d3845c58e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:11:22 -0700 Subject: [PATCH 4/7] mon/Paxos: only learn uncommitted value if it is in the future If an older peer sends an uncommitted value, make sure we only take it if it is in the future, and at least as new as any current uncommitted value. (Prior to the previous patch, peers could send values from long-past rounds. The pn values are also bogus.) Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 7785d37d4f03b..7e39fce37e3fb 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -392,7 +392,9 @@ void Paxos::handle_last(MMonPaxos *last) // did this person send back an accepted but uncommitted value? if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { + last->uncommitted_pn > uncommitted_pn && + last->last_committed >= last_committed && + last->last_committed + 1 >= uncommitted_v) { uncommitted_v = last->last_committed+1; uncommitted_pn = last->uncommitted_pn; uncommitted_value = last->values[uncommitted_v]; From 19b29788966eb80ed847630090a16a3d1b810969 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:12:46 -0700 Subject: [PATCH 5/7] mon/Paxos: debug ignored uncommitted values Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 7e39fce37e3fb..bf9bb52c05c82 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -391,17 +391,23 @@ void Paxos::handle_last(MMonPaxos *last) << num_last << " peons" << dendl; // did this person send back an accepted but uncommitted value? - if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn && - last->last_committed >= last_committed && - last->last_committed + 1 >= uncommitted_v) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << dendl; + if (last->uncommitted_pn) { + if (last->uncommitted_pn > uncommitted_pn && + last->last_committed >= last_committed && + last->last_committed + 1 >= uncommitted_v) { + uncommitted_v = last->last_committed+1; + uncommitted_pn = last->uncommitted_pn; + uncommitted_value = last->values[uncommitted_v]; + dout(10) << "we learned an uncommitted value for " << uncommitted_v + << " pn " << uncommitted_pn + << " " << uncommitted_value.length() << " bytes" + << dendl; + } else { + dout(10) << "ignoring uncommitted value for " << (last->last_committed+1) + << " pn " << last->uncommitted_pn + << " " << last->values[last->last_committed+1].length() << " bytes" + << dendl; + } } // is that everyone? From 20baf662112dd5f560bc3a2d2114b469444c3de8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:48:18 -0700 Subject: [PATCH 6/7] mon/Paxos: fix pn for uncommitted value during collect/last phase During the collect/last exchange, peers share any uncommitted values with the leader. They are supposed to also share the pn under which that value was accepted, but were instead using the just-accepted pn value. This effectively meant that we *always* took the uncommitted value; if there were multiples, which one we accepted depended on what order the LAST messages arrived, not which pn the values were generated under. The specific failure sequence I observed: - collect - learned uncommitted value for 262 from myself - send collect with pn 901 - got last with pn 901 (incorrect) for 200 (old) from peer - discard our own value, remember the other - finish collect phase - ignore old uncommitted value Fix this by storing a pending_v and pending_pn value whenever we accept a value. Use this to send an appropriate pn value in the LAST reply so that the leader can make it's decision about which uncommitted value to accept based on accurate information. Also use it when we learn the uncommitted value from ourselves. We could probably be more clever about storing less information here, for example by omitting pending_v and clearing pending_pn at the appropriate point, but that would be more fragile. Similarly, we could store a pn for *every* commit if we wanted to lay some groundwork for having multiple uncommitted proposals in flight, but I don't want to speculate about what is necessary or sufficient for a correct solution there. Fixes: #5698 Backport: cuttlefish, bobtail Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index bf9bb52c05c82..2a9f547ee478b 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -103,11 +103,21 @@ void Paxos::collect(version_t oldpn) // look for uncommitted value if (get_store()->exists(get_name(), last_committed+1)) { + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + uncommitted_pn = pn; + } else { + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn + << " and crossing our fingers" << dendl; + uncommitted_pn = accepted_pn; + } uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; + get_store()->get(get_name(), last_committed+1, uncommitted_value); assert(uncommitted_value.length()); dout(10) << "learned uncommitted " << (last_committed+1) + << " pn " << uncommitted_pn << " (" << uncommitted_value.length() << " bytes) from myself" << dendl; } @@ -164,6 +174,8 @@ void Paxos::handle_collect(MMonPaxos *collect) last->last_committed = last_committed; last->first_committed = first_committed; + version_t previous_pn = accepted_pn; + // can we accept this pn? if (collect->pn > accepted_pn) { // ok, accept it @@ -205,7 +217,18 @@ void Paxos::handle_collect(MMonPaxos *collect) dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 << " (" << bl.length() << " bytes)" << dendl; last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; + + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + last->uncommitted_pn = pn; + } else { + // previously we didn't record which pn a value was accepted + // under! use the pn value we just had... :( + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn + << " and crossing our fingers" << dendl; + last->uncommitted_pn = previous_pn; + } } // send reply @@ -511,6 +534,10 @@ void Paxos::begin(bufferlist& v) MonitorDBStore::Transaction t; t.put(get_name(), last_committed+1, new_value); + // note which pn this pending value is for. + t.put(get_name(), "pending_v", last_committed + 1); + t.put(get_name(), "pending_pn", accepted_pn); + dout(30) << __func__ << " transaction dump:\n"; JSONFormatter f(true); t.dump(&f); @@ -587,6 +614,10 @@ void Paxos::handle_begin(MMonPaxos *begin) MonitorDBStore::Transaction t; t.put(get_name(), v, begin->values[v]); + // note which pn this pending value is for. + t.put(get_name(), "pending_v", v); + t.put(get_name(), "pending_pn", accepted_pn); + dout(30) << __func__ << " transaction dump:\n"; JSONFormatter f(true); t.dump(&f); From cfe1395f479f152867e94371756a358a6fe4fe3d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:57:38 -0700 Subject: [PATCH 7/7] mon/Paxos: add failure injection points Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/mon/Paxos.cc | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index defb71ee514c8..fabb8ec689da6 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -207,6 +207,7 @@ OPTION(paxos_trim_min, OPT_INT, 250) // number of extra proposals tolerated bef OPTION(paxos_trim_max, OPT_INT, 500) // max number of extra proposals to trim at a time OPTION(paxos_service_trim_min, OPT_INT, 250) // minimum amount of versions to trigger a trim (0 disables it) OPTION(paxos_service_trim_max, OPT_INT, 500) // maximum amount of versions to trim during a single proposal (0 disables it) +OPTION(paxos_kill_at, OPT_INT, 0) OPTION(clock_offset, OPT_DOUBLE, 0) // how much to offset the system clock in Clock.cc OPTION(auth_cluster_required, OPT_STR, "cephx") // required of mon, mds, osd daemons OPTION(auth_service_required, OPT_STR, "cephx") // required by daemons of clients diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 2a9f547ee478b..508669deef542 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -394,9 +394,13 @@ void Paxos::handle_last(MMonPaxos *last) return; } + assert(g_conf->paxos_kill_at != 1); + // store any committed values if any are specified in the message store_state(last); + assert(g_conf->paxos_kill_at != 2); + // do they accept your pn? if (last->pn > accepted_pn) { // no, try again. @@ -552,6 +556,8 @@ void Paxos::begin(bufferlist& v) get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 3); + if (mon->get_quorum().size() == 1) { // we're alone, take it easy commit(); @@ -602,6 +608,8 @@ void Paxos::handle_begin(MMonPaxos *begin) assert(begin->pn == accepted_pn); assert(begin->last_committed == last_committed); + assert(g_conf->paxos_kill_at != 4); + // set state. state = STATE_UPDATING; lease_expire = utime_t(); // cancel lease @@ -626,6 +634,8 @@ void Paxos::handle_begin(MMonPaxos *begin) get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 5); + // reply MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, ceph_clock_now(g_ceph_context)); @@ -660,6 +670,8 @@ void Paxos::handle_accept(MMonPaxos *accept) accepted.insert(from); dout(10) << " now " << accepted << " have accepted" << dendl; + assert(g_conf->paxos_kill_at != 6); + // new majority? if (accepted.size() == (unsigned)mon->monmap->size()/2+1) { // yay, commit! @@ -683,6 +695,8 @@ void Paxos::handle_accept(MMonPaxos *accept) // yay! extend_lease(); + assert(g_conf->paxos_kill_at != 10); + finish_round(); // wake people up @@ -713,6 +727,8 @@ void Paxos::commit() // leader still got a majority and committed with out us.) lease_expire = utime_t(); // cancel lease + assert(g_conf->paxos_kill_at != 7); + MonitorDBStore::Transaction t; // commit locally @@ -732,6 +748,8 @@ void Paxos::commit() get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 8); + // refresh first_committed; this txn may have trimmed. first_committed = get_store()->get(get_name(), "first_committed"); @@ -753,6 +771,8 @@ void Paxos::commit() mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); } + assert(g_conf->paxos_kill_at != 9); + // get ready for a new round. new_value.clear();