From 61a298c39c1a6684682e2b749e45a66d073182c8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:10:42 -0700 Subject: [PATCH 01/30] osd: install async signal handlers after init() The orderly shutdown in OSD assumes init() has completed. Signed-off-by: Sage Weil --- src/ceph_osd.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index b485133514e3c..49cae9dddb889 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -451,12 +451,6 @@ int main(int argc, const char **argv) messenger_hb_back_server->start(); cluster_messenger->start(); - // install signal handlers - init_async_signal_handler(); - register_async_signal_handler(SIGHUP, sighup_handler); - register_async_signal_handler_oneshot(SIGINT, handle_osd_signal); - register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal); - // start osd err = osd->init(); if (err < 0) { @@ -465,6 +459,12 @@ int main(int argc, const char **argv) return 1; } + // install signal handlers + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_osd_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal); + client_messenger->wait(); messenger_hbclient->wait(); messenger_hb_front_server->wait(); From b084a3877f3b585a6b387f42d625b33c9b759435 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:11:43 -0700 Subject: [PATCH 02/30] osd: do not assume we have an osdmap in prepare_to_stop Signed-off-by: Sage Weil --- src/osd/OSD.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1ee4c09a63e1f..695dff240a41a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4730,11 +4730,12 @@ bool OSDService::prepare_to_stop() if (state != NOT_STOPPING) return false; - if (get_osdmap()->is_up(whoami)) { + OSDMapRef osdmap = get_osdmap(); + if (osdmap && osdmap->is_up(whoami)) { state = PREPARING_TO_STOP; monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(), - get_osdmap()->get_inst(whoami), - get_osdmap()->get_epoch(), + osdmap->get_inst(whoami), + osdmap->get_epoch(), false )); utime_t now = ceph_clock_now(g_ceph_context); From 868b48634be46c74f8e26e6a1c2ebc2d2b3beb9a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:37:26 -0700 Subject: [PATCH 03/30] mon: install signal handlers after init Signed-off-by: Sage Weil --- src/ceph_mon.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 6ac22ba20e5c8..f247de8cdfb5d 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -542,16 +542,16 @@ int main(int argc, const char **argv) if (g_conf->daemonize) prefork.daemonize(); + messenger->start(); + + mon->init(); + // set up signal handlers, now that we've daemonized/forked. init_async_signal_handler(); register_async_signal_handler(SIGHUP, sighup_handler); register_async_signal_handler_oneshot(SIGINT, handle_mon_signal); register_async_signal_handler_oneshot(SIGTERM, handle_mon_signal); - messenger->start(); - - mon->init(); - messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); From 318f3df718bff735c09851178fa3398bc272dc67 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:37:44 -0700 Subject: [PATCH 04/30] mds: install signal handlers after init Signed-off-by: Sage Weil --- src/ceph_mds.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index edb48bd96d813..9206312bc6695 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -274,12 +274,6 @@ int main(int argc, const char **argv) messenger->start(); - // set up signal handlers, now that we've daemonized/forked. - init_async_signal_handler(); - register_async_signal_handler(SIGHUP, sighup_handler); - register_async_signal_handler_oneshot(SIGINT, handle_mds_signal); - register_async_signal_handler_oneshot(SIGTERM, handle_mds_signal); - // start mds mds = new MDS(g_conf->name.get_id().c_str(), messenger, &mc); @@ -291,16 +285,23 @@ int main(int argc, const char **argv) r = mds->init(shadow); else r = mds->init(); + if (r < 0) + goto shutdown; - if (r >= 0) { - messenger->wait(); - } + // set up signal handlers, now that we've daemonized/forked. + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_mds_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_mds_signal); + + messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mds_signal); unregister_async_signal_handler(SIGTERM, handle_mds_signal); shutdown_async_signal_handler(); + shutdown: // yuck: grab the mds lock, so we can be sure that whoever in *mds // called shutdown finishes what they were doing. mds->mds_lock.Lock(); From ba5e357e4e6891de33250d4dced820430b795cb0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:46:57 -0700 Subject: [PATCH 05/30] mds: reorder suicide/shutdown a bit * shutdown msgr last * stop timer quickly, in case there are other events Signed-off-by: Sage Weil --- src/mds/MDS.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index a867961ccf3aa..e70d6fd4dff8f 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1628,19 +1628,18 @@ void MDS::suicide() } timer.cancel_all_events(); //timer.join(); + timer.shutdown(); // shut down cache mdcache->shutdown(); if (objecter->initialized) objecter->shutdown_locked(); - - // shut down messenger - messenger->shutdown(); monc->shutdown(); - timer.shutdown(); + // shut down messenger + messenger->shutdown(); } void MDS::respawn() From 1fd01e3b5ba36bd1843525de14d2f32c24211c7e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:47:20 -0700 Subject: [PATCH 06/30] mon: shutdown: remove sessions under mon->lock Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 90750dd7b1167..6c943832ac1c3 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -605,13 +605,13 @@ void Monitor::shutdown() finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED); finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED); - timer.shutdown(); + remove_all_sessions(); + // unlock before msgr shutdown... lock.Unlock(); - remove_all_sessions(); messenger->shutdown(); // last thing! ceph_mon.cc will delete mon. } From 7e25fecdfeb3d558fcffcbcff64615d16cc83800 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:47:51 -0700 Subject: [PATCH 07/30] mds: put g_ceph_context on shutdown This makes us shut down lots of threads. Signed-off-by: Sage Weil --- src/ceph_mds.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index 9206312bc6695..a8b4519d4b292 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -314,14 +314,15 @@ int main(int argc, const char **argv) if (mds->is_stopped()) delete mds; + g_ceph_context->put(); + // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. char s[20]; snprintf(s, sizeof(s), "gmon/%d", getpid()); if ((mkdir(s, 0755) == 0) && (chdir(s) == 0)) { - dout(0) << "ceph-mds: gmon.out should be in " << s << dendl; + cerr << "ceph-mds: gmon.out should be in " << s << std::endl; } - generic_dout(0) << "stopped." << dendl; return 0; } From c3acc256088a3da36c964d149263058ec647b6b2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 20 Jul 2013 08:49:48 -0700 Subject: [PATCH 08/30] mon, mds, osd: add early SIGTERM injection This makes it easy to identify problems with (early) shutdown with a loop like while [ ! -e core ] ; do ./ceph-mds -i a -c ceph.conf -f ; done and a vstart cluster. Signed-off-by: Sage Weil --- src/ceph_mds.cc | 3 +++ src/ceph_mon.cc | 3 +++ src/ceph_osd.cc | 3 +++ src/common/config_opts.h | 2 ++ 4 files changed, 11 insertions(+) diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index a8b4519d4b292..88b807b1b2451 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -294,6 +294,9 @@ int main(int argc, const char **argv) register_async_signal_handler_oneshot(SIGINT, handle_mds_signal); register_async_signal_handler_oneshot(SIGTERM, handle_mds_signal); + if (g_conf->inject_early_sigterm) + kill(getpid(), SIGTERM); + messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index f247de8cdfb5d..35ed56a7985b6 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -552,6 +552,9 @@ int main(int argc, const char **argv) register_async_signal_handler_oneshot(SIGINT, handle_mon_signal); register_async_signal_handler_oneshot(SIGTERM, handle_mon_signal); + if (g_conf->inject_early_sigterm) + kill(getpid(), SIGTERM); + messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index 49cae9dddb889..d8590bff81755 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -465,6 +465,9 @@ int main(int argc, const char **argv) register_async_signal_handler_oneshot(SIGINT, handle_osd_signal); register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal); + if (g_conf->inject_early_sigterm) + kill(getpid(), SIGTERM); + client_messenger->wait(); messenger_hbclient->wait(); messenger_hb_front_server->wait(); diff --git a/src/common/config_opts.h b/src/common/config_opts.h index defb71ee514c8..ed44dca145d92 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -123,6 +123,8 @@ OPTION(ms_inject_delay_max, OPT_DOUBLE, 1) // seconds OPTION(ms_inject_delay_probability, OPT_DOUBLE, 0) // range [0, 1] OPTION(ms_inject_internal_delays, OPT_DOUBLE, 0) // seconds +OPTION(inject_early_sigterm, OPT_BOOL, false) + OPTION(mon_data, OPT_STR, "/var/lib/ceph/mon/$cluster-$id") OPTION(mon_initial_members, OPT_STR, "") // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster OPTION(mon_sync_fs_threshold, OPT_INT, 5) // sync() when writing this many objects; 0 to disable. From 20bc09c668cca01bc1d27c0a860b384d85585ef5 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 19 Jul 2013 22:54:46 -0700 Subject: [PATCH 09/30] rgw: read attributes when reading bucket entry point Fixes: #5691 We need to also read the attributes, as bucket might be a legacy bucket and might have all bucket instance info in that object. Signed-off-by: Yehuda Sadeh Reviewed-by: Greg Farnum Tested-by: Faidon Liambotis --- src/rgw/rgw_bucket.cc | 21 +++++++++++++-------- src/rgw/rgw_rados.cc | 21 +++++++++++++++------ src/rgw/rgw_rados.h | 6 ++++-- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index d4cfdc88e6449..8de5a3d101fe9 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -93,8 +93,10 @@ int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t new_bucket.creation_time = creation_time; ::encode(new_bucket, bl); + map attrs; + if (update_entrypoint) { - ret = store->get_bucket_entrypoint_info(NULL, bucket_name, ep, &ot, NULL); + ret = store->get_bucket_entrypoint_info(NULL, bucket_name, ep, &ot, NULL, &attrs); if (ret < 0 && ret != -ENOENT) { ldout(store->ctx(), 0) << "ERROR: store->get_bucket_entrypoint_info() returned " << ret << dendl; } else if (ret >= 0 && ep.linked && ep.owner != user_id) { @@ -119,7 +121,7 @@ int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t ep.linked = true; ep.owner = user_id; - ret = store->put_bucket_entrypoint_info(bucket_name, ep, false, ot, 0); + ret = store->put_bucket_entrypoint_info(bucket_name, ep, false, ot, 0, &attrs); if (ret < 0) goto done_err; @@ -153,7 +155,8 @@ int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name RGWBucketEntryPoint ep; RGWObjVersionTracker ot; - ret = store->get_bucket_entrypoint_info(NULL, bucket_name, ep, &ot, NULL); + map attrs; + ret = store->get_bucket_entrypoint_info(NULL, bucket_name, ep, &ot, NULL, &attrs); if (ret == -ENOENT) return 0; if (ret < 0) @@ -168,7 +171,7 @@ int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name } ep.linked = false; - ret = store->put_bucket_entrypoint_info(bucket_name, ep, false, ot, 0); + ret = store->put_bucket_entrypoint_info(bucket_name, ep, false, ot, 0, &attrs); if (ret < 0) return ret; @@ -1381,8 +1384,9 @@ class RGWBucketMetadataHandler : public RGWMetadataHandler { RGWBucketEntryPoint be; time_t mtime; + map attrs; - int ret = store->get_bucket_entrypoint_info(NULL, entry, be, &ot, &mtime); + int ret = store->get_bucket_entrypoint_info(NULL, entry, be, &ot, &mtime, &attrs); if (ret < 0) return ret; @@ -1398,16 +1402,17 @@ class RGWBucketMetadataHandler : public RGWMetadataHandler { decode_json_obj(be, obj); time_t orig_mtime; + map attrs; RGWObjVersionTracker old_ot; - int ret = store->get_bucket_entrypoint_info(NULL, entry, old_be, &old_ot, &orig_mtime); + int ret = store->get_bucket_entrypoint_info(NULL, entry, old_be, &old_ot, &orig_mtime, &attrs); if (ret < 0 && ret != -ENOENT) return ret; objv_tracker.read_version = old_ot.read_version; /* maintain the obj version we just read */ - ret = store->put_bucket_entrypoint_info(entry, be, false, objv_tracker, mtime); + ret = store->put_bucket_entrypoint_info(entry, be, false, objv_tracker, mtime, &attrs); if (ret < 0) return ret; @@ -1429,7 +1434,7 @@ class RGWBucketMetadataHandler : public RGWMetadataHandler { int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) { RGWBucketEntryPoint be; - int ret = store->get_bucket_entrypoint_info(NULL, entry, be, &objv_tracker, NULL); + int ret = store->get_bucket_entrypoint_info(NULL, entry, be, &objv_tracker, NULL, NULL); if (ret < 0) return ret; diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 31400dc840225..0c7b22a42d3d3 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -4583,11 +4583,12 @@ int RGWRados::get_bucket_instance_from_oid(void *ctx, string& oid, RGWBucketInfo int RGWRados::get_bucket_entrypoint_info(void *ctx, const string& bucket_name, RGWBucketEntryPoint& entry_point, RGWObjVersionTracker *objv_tracker, - time_t *pmtime) + time_t *pmtime, + map *pattrs) { bufferlist bl; - int ret = rgw_get_system_obj(this, ctx, zone.domain_root, bucket_name, bl, objv_tracker, pmtime, NULL); + int ret = rgw_get_system_obj(this, ctx, zone.domain_root, bucket_name, bl, objv_tracker, pmtime, pattrs); if (ret < 0) { return ret; } @@ -4610,7 +4611,7 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf RGWBucketEntryPoint entry_point; time_t ep_mtime; RGWObjVersionTracker ot; - int ret = get_bucket_entrypoint_info(ctx, bucket_name, entry_point, &ot, &ep_mtime); + int ret = get_bucket_entrypoint_info(ctx, bucket_name, entry_point, &ot, &ep_mtime, pattrs); if (ret < 0) { info.bucket.name = bucket_name; /* only init this field */ return ret; @@ -4623,6 +4624,13 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf return 0; } + /* data is in the bucket instance object, we need to get attributes from there, clear everything + * that we got + */ + if (pattrs) { + pattrs->clear(); + } + ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl; if (pattrs) @@ -4643,11 +4651,12 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf } int RGWRados::put_bucket_entrypoint_info(const string& bucket_name, RGWBucketEntryPoint& entry_point, - bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime) + bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime, + map *pattrs) { bufferlist epbl; ::encode(entry_point, epbl); - return rgw_bucket_store_info(this, bucket_name, epbl, exclusive, NULL, &objv_tracker, mtime); + return rgw_bucket_store_info(this, bucket_name, epbl, exclusive, pattrs, &objv_tracker, mtime); } int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, @@ -4692,7 +4701,7 @@ int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t *pep_objv = ot.write_version; } } - ret = put_bucket_entrypoint_info(info.bucket.name, entry_point, exclusive, ot, mtime); + ret = put_bucket_entrypoint_info(info.bucket.name, entry_point, exclusive, ot, mtime, NULL); if (ret < 0) return ret; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 6422c182adc5d..c9924e0dc564a 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1286,9 +1286,11 @@ class RGWRados void get_bucket_instance_entry(rgw_bucket& bucket, string& entry); void get_bucket_meta_oid(rgw_bucket& bucket, string& oid); - int put_bucket_entrypoint_info(const string& bucket_name, RGWBucketEntryPoint& entry_point, bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime); + int put_bucket_entrypoint_info(const string& bucket_name, RGWBucketEntryPoint& entry_point, bool exclusive, RGWObjVersionTracker& objv_tracker, time_t mtime, + map *pattrs); int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, time_t mtime, map *pattrs); - int get_bucket_entrypoint_info(void *ctx, const string& bucket_name, RGWBucketEntryPoint& entry_point, RGWObjVersionTracker *objv_tracker, time_t *pmtime); + int get_bucket_entrypoint_info(void *ctx, const string& bucket_name, RGWBucketEntryPoint& entry_point, RGWObjVersionTracker *objv_tracker, time_t *pmtime, + map *pattrs); int get_bucket_instance_info(void *ctx, const string& meta_key, RGWBucketInfo& info, time_t *pmtime, map *pattrs); int get_bucket_instance_info(void *ctx, rgw_bucket& bucket, RGWBucketInfo& info, time_t *pmtime, map *pattrs); int get_bucket_instance_from_oid(void *ctx, string& oid, RGWBucketInfo& info, time_t *pmtime, map *pattrs); From 0897d3a820ec182ebd74100a370dbadab50de84f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 22 Jul 2013 11:08:04 -0700 Subject: [PATCH 10/30] OSD::_make_pg: use createmap, not osdmap The osd lock is not held at this point, we must use the createmap passed in. Fixes: #5656 Signed-off-by: Samuel Just Reviewed-by: Sage Weil --- src/osd/OSD.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 8cc9e31459d61..464ed770df2d6 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1728,7 +1728,7 @@ PG* OSD::_make_pg( PG *pg; hobject_t logoid = make_pg_log_oid(pgid); hobject_t infooid = make_pg_biginfo_oid(pgid); - if (osdmap->get_pg_type(pgid) == pg_pool_t::TYPE_REP) + if (createmap->get_pg_type(pgid) == pg_pool_t::TYPE_REP) pg = new ReplicatedPG(&service, createmap, pool, pgid, logoid, infooid); else assert(0); From 58c78dbaf357def4c7bf6fb95a0248a1ccf6c3c6 Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Sun, 21 Jul 2013 10:54:00 -0700 Subject: [PATCH 11/30] FileJournal: fix posix_fallocate error handling From the man page for posix_fallocate: posix_fallocate() returns zero on success, or an error number on failure. Note that errno is not set. Signed-off-by: Noah Watkins Reviewed-by: Sage Weil --- src/os/FileJournal.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index 3acadf0958219..4a2af08dd4c0a 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -299,11 +299,10 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, return -err; } ret = ::posix_fallocate(fd, 0, newsize); - if (ret < 0) { - int err = errno; + if (ret) { derr << "FileJournal::_open_file : unable to preallocation journal to " - << newsize << " bytes: " << cpp_strerror(err) << dendl; - return -err; + << newsize << " bytes: " << cpp_strerror(ret) << dendl; + return -ret; } max_size = newsize; } From e60d14d97da6ed5ea25e6d7d7cfe8df3c3c6feec Mon Sep 17 00:00:00 2001 From: Dan Mick Date: Mon, 22 Jul 2013 11:31:09 -0700 Subject: [PATCH 12/30] ceph.in: reject --admin-daemon so it can't do harm Fixes: #3944 Signed-off-by: Dan Mick --- src/ceph.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/ceph.in b/src/ceph.in index 6ba92c99b1834..dbb7fb5a8cdc5 100755 --- a/src/ceph.in +++ b/src/ceph.in @@ -118,6 +118,8 @@ def parse_cmdargs(args=None, target=''): parser.add_argument('--admin-daemon', dest='admin_socket', help='submit admin-socket commands (\"help\" for help') + parser.add_argument('--admin-socket', dest='admin_socket_nope', + help='you probably mean --admin-daemon') parser.add_argument('-s', '--status', action='store_true', help='show cluster status') @@ -489,6 +491,11 @@ def main(): global verbose verbose = parsed_args.verbose + if parsed_args.admin_socket_nope: + print >> sys.stderr, '--admin-socket is used by daemons; '\ + 'you probably mean --admin-daemon/daemon' + return 1 + # pass on --id, --name, --conf name = 'client.admin' if parsed_args.client_id: From a61635e85218dd2e19d339385066e5a6a9c86346 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 07:57:23 -0700 Subject: [PATCH 13/30] ceph-monstore-tool: dump paxos transactions Signed-off-by: Sage Weil --- src/mon/Paxos.h | 2 +- src/tools/ceph-monstore-tool.cc | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index cab27f289a882..3df238e19332e 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -1114,7 +1114,7 @@ class Paxos { * @param t The transaction to which we will append the operations * @param bl A bufferlist containing an encoded transaction */ - void decode_append_transaction(MonitorDBStore::Transaction& t, + static void decode_append_transaction(MonitorDBStore::Transaction& t, bufferlist& bl) { MonitorDBStore::Transaction vt; bufferlist::iterator it = bl.begin(); diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc index ae608a302f2c0..f361266aff0bd 100644 --- a/src/tools/ceph-monstore-tool.cc +++ b/src/tools/ceph-monstore-tool.cc @@ -31,6 +31,7 @@ #include "global/global_init.h" #include "os/LevelDBStore.h" #include "mon/MonitorDBStore.h" +#include "mon/Paxos.h" #include "common/Formatter.h" namespace po = boost::program_options; @@ -246,6 +247,19 @@ int main(int argc, char **argv) { goto done; } bl.write_fd(fd); + } else if (cmd == "dump-paxos") { + for (version_t v = dstart; v <= dstop; ++v) { + bufferlist bl; + st.get("paxos", v, bl); + if (bl.length() == 0) + break; + cout << "\n--- " << v << " ---" << std::endl; + MonitorDBStore::Transaction tx; + Paxos::decode_append_transaction(tx, bl); + JSONFormatter f(true); + tx.dump(&f); + f.flush(cout); + } } else if (cmd == "dump-trace") { if (tfile.empty()) { std::cerr << "Need trace_file" << std::endl; From 99e605455f7bf6088be5ce5ee3d4e29ab7e03d47 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 07:58:41 -0700 Subject: [PATCH 14/30] mon/Paxos: accepted_pn_from has no semantic meaning Signed-off-by: Sage Weil --- src/mon/Paxos.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index 3df238e19332e..69419e64ab937 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -290,8 +290,9 @@ class Paxos { */ version_t accepted_pn; /** - * @todo This has something to do with the last_committed version. Not sure - * about what it entails, tbh. + * The last_committed epoch of the leader at the time we accepted the last pn. + * + * This has NO SEMANTIC MEANING, and is there only for the debug output. */ version_t accepted_pn_from; /** From b26b7f6e5e02ac6beb66e3e34e177e6448cf91cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Jul 2013 14:13:23 -0700 Subject: [PATCH 15/30] mon/Paxos: only share uncommitted value if it is next We may have an uncommitted value from our perspective (it is our lc + 1) when the collector has a much larger lc (because we have been out for the last few rounds). Only share an uncommitted value if it is in fact the next value. Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index ee2ba3b6fdb89..7785d37d4f03b 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -198,7 +198,8 @@ void Paxos::handle_collect(MMonPaxos *collect) // do we have an accepted but uncommitted value? // (it'll be at last_committed+1) bufferlist bl; - if (get_store()->exists(get_name(), last_committed+1)) { + if (collect->last_committed == last_committed && + get_store()->exists(get_name(), last_committed+1)) { get_store()->get(get_name(), last_committed+1, bl); assert(bl.length() > 0); dout(10) << " sharing our accepted but uncommitted value for " From b3253a453c057914753846c77499f98d3845c58e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:11:22 -0700 Subject: [PATCH 16/30] mon/Paxos: only learn uncommitted value if it is in the future If an older peer sends an uncommitted value, make sure we only take it if it is in the future, and at least as new as any current uncommitted value. (Prior to the previous patch, peers could send values from long-past rounds. The pn values are also bogus.) Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 7785d37d4f03b..7e39fce37e3fb 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -392,7 +392,9 @@ void Paxos::handle_last(MMonPaxos *last) // did this person send back an accepted but uncommitted value? if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { + last->uncommitted_pn > uncommitted_pn && + last->last_committed >= last_committed && + last->last_committed + 1 >= uncommitted_v) { uncommitted_v = last->last_committed+1; uncommitted_pn = last->uncommitted_pn; uncommitted_value = last->values[uncommitted_v]; From 19b29788966eb80ed847630090a16a3d1b810969 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:12:46 -0700 Subject: [PATCH 17/30] mon/Paxos: debug ignored uncommitted values Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 7e39fce37e3fb..bf9bb52c05c82 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -391,17 +391,23 @@ void Paxos::handle_last(MMonPaxos *last) << num_last << " peons" << dendl; // did this person send back an accepted but uncommitted value? - if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn && - last->last_committed >= last_committed && - last->last_committed + 1 >= uncommitted_v) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << dendl; + if (last->uncommitted_pn) { + if (last->uncommitted_pn > uncommitted_pn && + last->last_committed >= last_committed && + last->last_committed + 1 >= uncommitted_v) { + uncommitted_v = last->last_committed+1; + uncommitted_pn = last->uncommitted_pn; + uncommitted_value = last->values[uncommitted_v]; + dout(10) << "we learned an uncommitted value for " << uncommitted_v + << " pn " << uncommitted_pn + << " " << uncommitted_value.length() << " bytes" + << dendl; + } else { + dout(10) << "ignoring uncommitted value for " << (last->last_committed+1) + << " pn " << last->uncommitted_pn + << " " << last->values[last->last_committed+1].length() << " bytes" + << dendl; + } } // is that everyone? From 20baf662112dd5f560bc3a2d2114b469444c3de8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:48:18 -0700 Subject: [PATCH 18/30] mon/Paxos: fix pn for uncommitted value during collect/last phase During the collect/last exchange, peers share any uncommitted values with the leader. They are supposed to also share the pn under which that value was accepted, but were instead using the just-accepted pn value. This effectively meant that we *always* took the uncommitted value; if there were multiples, which one we accepted depended on what order the LAST messages arrived, not which pn the values were generated under. The specific failure sequence I observed: - collect - learned uncommitted value for 262 from myself - send collect with pn 901 - got last with pn 901 (incorrect) for 200 (old) from peer - discard our own value, remember the other - finish collect phase - ignore old uncommitted value Fix this by storing a pending_v and pending_pn value whenever we accept a value. Use this to send an appropriate pn value in the LAST reply so that the leader can make it's decision about which uncommitted value to accept based on accurate information. Also use it when we learn the uncommitted value from ourselves. We could probably be more clever about storing less information here, for example by omitting pending_v and clearing pending_pn at the appropriate point, but that would be more fragile. Similarly, we could store a pn for *every* commit if we wanted to lay some groundwork for having multiple uncommitted proposals in flight, but I don't want to speculate about what is necessary or sufficient for a correct solution there. Fixes: #5698 Backport: cuttlefish, bobtail Signed-off-by: Sage Weil --- src/mon/Paxos.cc | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index bf9bb52c05c82..2a9f547ee478b 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -103,11 +103,21 @@ void Paxos::collect(version_t oldpn) // look for uncommitted value if (get_store()->exists(get_name(), last_committed+1)) { + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + uncommitted_pn = pn; + } else { + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn + << " and crossing our fingers" << dendl; + uncommitted_pn = accepted_pn; + } uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; + get_store()->get(get_name(), last_committed+1, uncommitted_value); assert(uncommitted_value.length()); dout(10) << "learned uncommitted " << (last_committed+1) + << " pn " << uncommitted_pn << " (" << uncommitted_value.length() << " bytes) from myself" << dendl; } @@ -164,6 +174,8 @@ void Paxos::handle_collect(MMonPaxos *collect) last->last_committed = last_committed; last->first_committed = first_committed; + version_t previous_pn = accepted_pn; + // can we accept this pn? if (collect->pn > accepted_pn) { // ok, accept it @@ -205,7 +217,18 @@ void Paxos::handle_collect(MMonPaxos *collect) dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 << " (" << bl.length() << " bytes)" << dendl; last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; + + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + last->uncommitted_pn = pn; + } else { + // previously we didn't record which pn a value was accepted + // under! use the pn value we just had... :( + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn + << " and crossing our fingers" << dendl; + last->uncommitted_pn = previous_pn; + } } // send reply @@ -511,6 +534,10 @@ void Paxos::begin(bufferlist& v) MonitorDBStore::Transaction t; t.put(get_name(), last_committed+1, new_value); + // note which pn this pending value is for. + t.put(get_name(), "pending_v", last_committed + 1); + t.put(get_name(), "pending_pn", accepted_pn); + dout(30) << __func__ << " transaction dump:\n"; JSONFormatter f(true); t.dump(&f); @@ -587,6 +614,10 @@ void Paxos::handle_begin(MMonPaxos *begin) MonitorDBStore::Transaction t; t.put(get_name(), v, begin->values[v]); + // note which pn this pending value is for. + t.put(get_name(), "pending_v", v); + t.put(get_name(), "pending_pn", accepted_pn); + dout(30) << __func__ << " transaction dump:\n"; JSONFormatter f(true); t.dump(&f); From cfe1395f479f152867e94371756a358a6fe4fe3d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 21 Jul 2013 08:57:38 -0700 Subject: [PATCH 19/30] mon/Paxos: add failure injection points Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/mon/Paxos.cc | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index defb71ee514c8..fabb8ec689da6 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -207,6 +207,7 @@ OPTION(paxos_trim_min, OPT_INT, 250) // number of extra proposals tolerated bef OPTION(paxos_trim_max, OPT_INT, 500) // max number of extra proposals to trim at a time OPTION(paxos_service_trim_min, OPT_INT, 250) // minimum amount of versions to trigger a trim (0 disables it) OPTION(paxos_service_trim_max, OPT_INT, 500) // maximum amount of versions to trim during a single proposal (0 disables it) +OPTION(paxos_kill_at, OPT_INT, 0) OPTION(clock_offset, OPT_DOUBLE, 0) // how much to offset the system clock in Clock.cc OPTION(auth_cluster_required, OPT_STR, "cephx") // required of mon, mds, osd daemons OPTION(auth_service_required, OPT_STR, "cephx") // required by daemons of clients diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 2a9f547ee478b..508669deef542 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -394,9 +394,13 @@ void Paxos::handle_last(MMonPaxos *last) return; } + assert(g_conf->paxos_kill_at != 1); + // store any committed values if any are specified in the message store_state(last); + assert(g_conf->paxos_kill_at != 2); + // do they accept your pn? if (last->pn > accepted_pn) { // no, try again. @@ -552,6 +556,8 @@ void Paxos::begin(bufferlist& v) get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 3); + if (mon->get_quorum().size() == 1) { // we're alone, take it easy commit(); @@ -602,6 +608,8 @@ void Paxos::handle_begin(MMonPaxos *begin) assert(begin->pn == accepted_pn); assert(begin->last_committed == last_committed); + assert(g_conf->paxos_kill_at != 4); + // set state. state = STATE_UPDATING; lease_expire = utime_t(); // cancel lease @@ -626,6 +634,8 @@ void Paxos::handle_begin(MMonPaxos *begin) get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 5); + // reply MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, ceph_clock_now(g_ceph_context)); @@ -660,6 +670,8 @@ void Paxos::handle_accept(MMonPaxos *accept) accepted.insert(from); dout(10) << " now " << accepted << " have accepted" << dendl; + assert(g_conf->paxos_kill_at != 6); + // new majority? if (accepted.size() == (unsigned)mon->monmap->size()/2+1) { // yay, commit! @@ -683,6 +695,8 @@ void Paxos::handle_accept(MMonPaxos *accept) // yay! extend_lease(); + assert(g_conf->paxos_kill_at != 10); + finish_round(); // wake people up @@ -713,6 +727,8 @@ void Paxos::commit() // leader still got a majority and committed with out us.) lease_expire = utime_t(); // cancel lease + assert(g_conf->paxos_kill_at != 7); + MonitorDBStore::Transaction t; // commit locally @@ -732,6 +748,8 @@ void Paxos::commit() get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 8); + // refresh first_committed; this txn may have trimmed. first_committed = get_store()->get(get_name(), "first_committed"); @@ -753,6 +771,8 @@ void Paxos::commit() mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); } + assert(g_conf->paxos_kill_at != 9); + // get ready for a new round. new_value.clear(); From 6094685e4f2b87a20febce4686ba88e98dcc6078 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 19 Jul 2013 21:11:53 -0700 Subject: [PATCH 20/30] rgw: send data back when copying object Currently doing it only when copying between regions. This is needed so that the operation doesn't time out (as it can take a long time and the web server may just hang on us since we're not sending any data). This is configurable and can be disabled. Currently only implemented for S3. Signed-off-by: Yehuda Sadeh --- src/common/config_opts.h | 2 ++ src/rgw/rgw_op.cc | 23 ++++++++++++++++++++++- src/rgw/rgw_op.h | 5 +++++ src/rgw/rgw_rados.cc | 15 ++++++++++++--- src/rgw/rgw_rados.h | 4 +++- src/rgw/rgw_rest_s3.cc | 25 ++++++++++++++++++++----- src/rgw/rgw_rest_s3.h | 4 +++- 7 files changed, 67 insertions(+), 11 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index defb71ee514c8..ff23ba56232a3 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -675,6 +675,8 @@ OPTION(rgw_md_log_max_shards, OPT_INT, 64) // max shards for metadata log OPTION(rgw_num_zone_opstate_shards, OPT_INT, 128) // max shards for keeping inter-region copy progress info OPTION(rgw_opstate_ratelimit_sec, OPT_INT, 30) // min time between opstate updates on a single upload (0 for disabling ratelimit) OPTION(rgw_curl_wait_timeout_ms, OPT_INT, 1000) // timeout for certain curl calls +OPTION(rgw_copy_obj_progress, OPT_BOOL, true) // should dump progress during long copy operations? +OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT, 1024 * 1024) // min bytes between copy progress output OPTION(rgw_data_log_window, OPT_INT, 30) // data log entries window (in seconds) OPTION(rgw_data_log_changes_size, OPT_INT, 1000) // number of in-memory entries to hold for data changes log diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 17a3aaa8439d5..97ae5fc6f0ca4 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1655,6 +1655,25 @@ int RGWCopyObj::init_common() return 0; } +static void copy_obj_progress_cb(off_t ofs, void *param) +{ + RGWCopyObj *op = static_cast(param); + op->progress_cb(ofs); +} + +void RGWCopyObj::progress_cb(off_t ofs) +{ + if (!s->cct->_conf->rgw_copy_obj_progress) + return; + + if (ofs - last_ofs < s->cct->_conf->rgw_copy_obj_progress_every_bytes) + return; + + send_partial_response(ofs); + + last_ofs = ofs; +} + void RGWCopyObj::execute() { rgw_obj src_obj, dst_obj; @@ -1686,7 +1705,9 @@ void RGWCopyObj::execute() replace_attrs, attrs, RGW_OBJ_CATEGORY_MAIN, &s->req_id, /* use req_id as tag */ - &s->err); + &s->err, + copy_obj_progress_cb, (void *)this + ); } int RGWGetACLs::verify_permission() diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 7bca53b5e4313..0c338dea8a978 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -438,6 +438,8 @@ class RGWCopyObj : public RGWOp { string client_id; string op_id; + off_t last_ofs; + int init_common(); @@ -460,6 +462,7 @@ class RGWCopyObj : public RGWOp { ret = 0; mtime = 0; replace_attrs = false; + last_ofs = 0; } virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) { @@ -468,9 +471,11 @@ class RGWCopyObj : public RGWOp { } int verify_permission(); void execute(); + void progress_cb(off_t ofs); virtual int init_dest_policy() { return 0; } virtual int get_params() = 0; + virtual void send_partial_response(off_t ofs) {} virtual void send_response() = 0; virtual const char *name() { return "copy_obj"; } virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; } diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 087fdcf8e0993..3c8d9757ca638 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -2397,9 +2397,16 @@ class RGWRadosPutObj : public RGWGetDataCB rgw_obj obj; RGWPutObjProcessor_Atomic *processor; RGWOpStateSingleOp *opstate; + void (*progress_cb)(off_t, void *); + void *progress_data; public: - RGWRadosPutObj(RGWPutObjProcessor_Atomic *p, RGWOpStateSingleOp *_ops) : processor(p), opstate(_ops) {} + RGWRadosPutObj(RGWPutObjProcessor_Atomic *p, RGWOpStateSingleOp *_ops, + void (*_progress_cb)(off_t, void *), void *_progress_data) : processor(p), opstate(_ops), + progress_cb(_progress_cb), + progress_data(_progress_data) {} int handle_data(bufferlist& bl, off_t ofs, off_t len) { + progress_cb(ofs, progress_data); + void *handle; int ret = processor->handle_data(bl, ofs, &handle); if (ret < 0) @@ -2477,7 +2484,9 @@ int RGWRados::copy_obj(void *ctx, map& attrs, RGWObjCategory category, string *ptag, - struct rgw_err *err) + struct rgw_err *err, + void (*progress_cb)(off_t, void *), + void *progress_data) { int ret; uint64_t total_len, obj_size; @@ -2545,7 +2554,7 @@ int RGWRados::copy_obj(void *ctx, ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl; return ret; } - RGWRadosPutObj cb(&processor, &opstate); + RGWRadosPutObj cb(&processor, &opstate, progress_cb, progress_data); string etag; map req_headers; time_t set_mtime; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 6422c182adc5d..0ef7166624482 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1121,7 +1121,9 @@ class RGWRados map& attrs, RGWObjCategory category, string *ptag, - struct rgw_err *err); + struct rgw_err *err, + void (*progress_cb)(off_t, void *), + void *progress_data); int copy_obj_data(void *ctx, void *handle, off_t end, diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 9e8ec3f88a5a8..35ee64d7eb938 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -1299,15 +1299,30 @@ int RGWCopyObj_ObjStore_S3::get_params() return 0; } -void RGWCopyObj_ObjStore_S3::send_response() +void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) { - if (ret) + if (!sent_header) { + if (ret) set_req_state_err(s, ret); - dump_errno(s); + dump_errno(s); + + end_header(s, "binary/octet-stream"); + if (ret == 0) { + s->formatter->open_object_section("CopyObjectResult"); + } + sent_header = true; + } else { + s->formatter->dump_int("Progress", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_S3::send_response() +{ + if (!sent_header) + send_partial_response(0); - end_header(s, "binary/octet-stream"); if (ret == 0) { - s->formatter->open_object_section("CopyObjectResult"); dump_time(s, "LastModified", &mtime); map::iterator iter = attrs.find(RGW_ATTR_ETAG); if (iter != attrs.end()) { diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index e2a1b0b92eb5f..a0af4eac9fd95 100644 --- a/src/rgw/rgw_rest_s3.h +++ b/src/rgw/rgw_rest_s3.h @@ -143,12 +143,14 @@ class RGWDeleteObj_ObjStore_S3 : public RGWDeleteObj_ObjStore { }; class RGWCopyObj_ObjStore_S3 : public RGWCopyObj_ObjStore { + bool sent_header; public: - RGWCopyObj_ObjStore_S3() {} + RGWCopyObj_ObjStore_S3() : sent_header(false) {} ~RGWCopyObj_ObjStore_S3() {} int init_dest_policy(); int get_params(); + void send_partial_response(off_t ofs); void send_response(); }; From c5025d4ad41a5f1e826094a805e5a46d91df4162 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 19 Jul 2013 21:35:47 -0700 Subject: [PATCH 21/30] rgw: dump progress through swift object copy Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rest_swift.cc | 34 +++++++++++++++++++++++++++++----- src/rgw/rgw_rest_swift.h | 4 +++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 157158e7ed7d7..f9d8c2eb3a974 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -475,13 +475,37 @@ int RGWCopyObj_ObjStore_SWIFT::get_params() return 0; } +void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs) +{ + if (!sent_header) { + if (!ret) + ret = STATUS_CREATED; + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); + + if (ret == 0) { + s->formatter->open_array_section("progress"); + } + sent_header = true; + } else { + s->formatter->dump_int("ofs", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + void RGWCopyObj_ObjStore_SWIFT::send_response() { - if (!ret) - ret = STATUS_CREATED; - set_req_state_err(s, ret); - dump_errno(s); - end_header(s); + if (!sent_header) { + if (!ret) + ret = STATUS_CREATED; + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); + } else { + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + } } int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h index e4b6f0bccee10..1c23ab29204be 100644 --- a/src/rgw/rgw_rest_swift.h +++ b/src/rgw/rgw_rest_swift.h @@ -100,13 +100,15 @@ class RGWDeleteObj_ObjStore_SWIFT : public RGWDeleteObj_ObjStore { }; class RGWCopyObj_ObjStore_SWIFT : public RGWCopyObj_ObjStore { + bool sent_header; public: - RGWCopyObj_ObjStore_SWIFT() {} + RGWCopyObj_ObjStore_SWIFT() : sent_header(false) {} ~RGWCopyObj_ObjStore_SWIFT() {} int init_dest_policy(); int get_params(); void send_response(); + void send_partial_response(off_t ofs); }; class RGWGetACLs_ObjStore_SWIFT : public RGWGetACLs_ObjStore { From 9f05db6b558e441dc816a490b5a7ed3a2071ed03 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 22 Jul 2013 14:26:54 -0700 Subject: [PATCH 22/30] rgw: add some comments Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rest_s3.cc | 3 +++ src/rgw/rgw_rest_swift.cc | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 35ee64d7eb938..199ccc485eb4e 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -1312,6 +1312,9 @@ void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) } sent_header = true; } else { + /* Send progress field. Note that this diverge from the original S3 + * spec. We do this in order to keep connection alive. + */ s->formatter->dump_int("Progress", (uint64_t)ofs); } rgw_flush_formatter(s, s->formatter); diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index f9d8c2eb3a974..80438a6556de5 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -484,6 +484,9 @@ void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs) dump_errno(s); end_header(s); + /* Send progress information. Note that this diverge from the original swift + * spec. We do this in order to keep connection alive. + */ if (ret == 0) { s->formatter->open_array_section("progress"); } From 165b0d0a9c127eb8ce232f6bc7bbf3eba7dc03d7 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 22 Jul 2013 14:27:01 -0700 Subject: [PATCH 23/30] PendingReleaseNotes: update about new rgw copy obj response Signed-off-by: Yehuda Sadeh --- PendingReleaseNotes | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index a9880942b5a7e..7a9adf7293e99 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -19,3 +19,9 @@ v0.67 commandline tool. ceph_rest_api.py can be used as a WSGI application for deployment in a more-capable web server. See ceph-rest-api.8 for more. + +* rgw copy object operation may return extra progress info during the + operation. At this point it will only happen when doing cross zone + copy operations. The S3 response will now return extra + field under the container. The Swift response will + now send the progress as a json array. From 8a2eb18494005aa968b71f18121da8ebab48e950 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 22 Jul 2013 13:33:33 -0700 Subject: [PATCH 24/30] rgw: translate swift request to s3 when forwarding When forwarding a swift request to a different region, we need to use the effective uri, and not just send the one we got since we use S3 authentication for the forwarded requests. This is achieved through a new using 'effective_uri' param on the request info (which in swift ponts to the plain bucket/object uri without the swift/v1 prefix(. Also, rename the old req_state::effective_uri to relative_uri in order to prevent confusion. Signed-off-by: Yehuda Sadeh Reviewed-by: Greg Farnum --- src/rgw/rgw_auth_s3.cc | 8 +++++++- src/rgw/rgw_common.cc | 7 ++++++- src/rgw/rgw_common.h | 3 ++- src/rgw/rgw_rest.cc | 2 +- src/rgw/rgw_rest_client.cc | 2 ++ src/rgw/rgw_rest_s3.cc | 2 +- src/rgw/rgw_rest_swift.cc | 5 +++++ 7 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc index bdd458e68b6f6..c93de7cd58a3d 100644 --- a/src/rgw/rgw_auth_s3.cc +++ b/src/rgw/rgw_auth_s3.cc @@ -190,8 +190,14 @@ bool rgw_create_s3_canonical_header(req_info& info, utime_t *header_time, string map& meta_map = info.x_meta_map; map& sub_resources = info.args.get_sub_resources(); + string request_uri; + if (info.effective_uri.empty()) + request_uri = info.request_uri; + else + request_uri = info.effective_uri; + rgw_create_s3_canonical_header(info.method, content_md5, content_type, date.c_str(), - meta_map, info.request_uri.c_str(), sub_resources, + meta_map, request_uri.c_str(), sub_resources, dest); return true; diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index aea396bf3def1..8a281775d0774 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -109,7 +109,12 @@ void req_info::rebuild_from(req_info& src) { method = src.method; script_uri = src.script_uri; - request_uri = src.request_uri; + if (src.effective_uri.empty()) { + request_uri = src.request_uri; + } else { + request_uri = src.effective_uri; + } + effective_uri.clear(); host = src.host; x_meta_map = src.x_meta_map; diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 1d3596d44183f..7f224a798f55c 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -764,6 +764,7 @@ struct req_info { const char *method; string script_uri; string request_uri; + string effective_uri; string request_params; req_info(CephContext *cct, RGWEnv *_env); @@ -780,7 +781,7 @@ struct req_state { int format; ceph::Formatter *formatter; string decoded_uri; - string effective_uri; + string relative_uri; const char *length; uint64_t content_length; map generic_attrs; diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index 0f9e61d1740ec..e4933a67a3924 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -1242,7 +1242,7 @@ RGWHandler *RGWREST::get_handler(RGWRados *store, struct req_state *s, RGWClient if (*init_error < 0) return NULL; - RGWRESTMgr *m = mgr.get_resource_mgr(s, s->decoded_uri, &s->effective_uri); + RGWRESTMgr *m = mgr.get_resource_mgr(s, s->decoded_uri, &s->relative_uri); if (!m) { *init_error = -ERR_METHOD_NOT_ALLOWED; return NULL; diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc index 2075e53552554..ea80b5b84f86e 100644 --- a/src/rgw/rgw_rest_client.cc +++ b/src/rgw/rgw_rest_client.cc @@ -403,6 +403,7 @@ int RGWRESTStreamWriteRequest::put_obj_init(RGWAccessKey& key, rgw_obj& obj, uin new_info.script_uri = "/"; new_info.script_uri.append(resource); new_info.request_uri = new_info.script_uri; + new_info.effective_uri = new_info.effective_uri; map& m = new_env.get_map(); map::iterator bliter; @@ -568,6 +569,7 @@ int RGWRESTStreamReadRequest::get_obj(RGWAccessKey& key, map& ex new_info.script_uri = "/"; new_info.script_uri.append(resource); new_info.request_uri = new_info.script_uri; + new_info.effective_uri = new_info.effective_uri; new_info.init_meta_info(NULL); diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index e131eeee28d0c..6c1738218e635 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -1819,7 +1819,7 @@ int RGWHandler_ObjStore_S3::init_from_header(struct req_state *s, int default_fo string req; string first; - const char *req_name = s->effective_uri.c_str(); + const char *req_name = s->relative_uri.c_str(); const char *p; if (*req_name == '?') { diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 80438a6556de5..877e3711a5e2c 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -856,11 +856,16 @@ int RGWHandler_ObjStore_SWIFT::init_from_header(struct req_state *s) s->bucket_name_str = first; s->bucket_name = strdup(s->bucket_name_str.c_str()); + + s->info.effective_uri = "/" + s->bucket_name_str; + if (req.size()) { s->object_str = req; s->object = strdup(s->object_str.c_str()); + s->info.effective_uri.append("/" + s->object_str); } + return 0; } From 1ecdb14937583d44273b8a2861379754f9c1c615 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 22 Jul 2013 13:36:30 -0700 Subject: [PATCH 25/30] rgw: swift, in create bucket set location_constraints For swift we're setting the location constraint to be the current region we're in when creating a bucket. Signed-off-by: Yehuda Sadeh Reviewed-by: Greg Farnum --- src/rgw/rgw_rest_swift.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 877e3711a5e2c..b4f830830f9ff 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -288,6 +288,8 @@ int RGWCreateBucket_ObjStore_SWIFT::get_params() { policy.create_default(s->user.user_id, s->user.display_name); + location_constraint = store->region.api_name; + return 0; } From 046d5cb6db99e4cf38e870a5de487eadd856ebda Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 22 Jul 2013 13:44:24 -0700 Subject: [PATCH 26/30] src/test/osd/TestPGLog.cc: check that the object remains in log.objects Signed-off-by: Samuel Just Reviewed-by: Sage Weil --- src/test/osd/TestPGLog.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index d8ec8d03df2be..e0863f726a08e 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -82,6 +82,10 @@ TEST_F(PGLogTest, rewind_divergent_log) { hobject_t divergent_object; eversion_t divergent_version; eversion_t newhead; + + hobject_t divergent; + divergent.hash = 0x9; + { pg_log_entry_t e; @@ -90,16 +94,16 @@ TEST_F(PGLogTest, rewind_divergent_log) { log.tail = e.version; log.log.push_back(e); e.version = newhead = eversion_t(1, 4); - e.soid.hash = 0x9; + e.soid = divergent; e.op = pg_log_entry_t::MODIFY; log.log.push_back(e); - log.index(); e.version = divergent_version = eversion_t(1, 5); - e.soid.hash = 0x9; + e.soid = divergent; divergent_object = e.soid; e.op = pg_log_entry_t::DELETE; log.log.push_back(e); log.head = e.version; + log.index(); info.last_update = log.head; info.last_complete = log.head; @@ -118,6 +122,7 @@ TEST_F(PGLogTest, rewind_divergent_log) { rewind_divergent_log(t, newhead, info, remove_snap, dirty_info, dirty_big_info); + EXPECT_TRUE(log.objects.count(divergent)); EXPECT_TRUE(missing.is_missing(divergent_object)); EXPECT_EQ(1U, log.objects.count(divergent_object)); EXPECT_EQ(2U, log.log.size()); From 6957dbc75cc2577652b542aa3eae69f03060cb63 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 22 Jul 2013 13:46:10 -0700 Subject: [PATCH 27/30] PGLog::rewind_divergent_log: unindex only works from tail, index() instead Fixes: #5714 Signed-off-by: Samuel Just Reviewed-by: Sage Weil --- src/osd/PGLog.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 6ba08362dadde..2f2c83757db55 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -375,7 +375,6 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead } assert(p->version > newhead); dout(10) << "rewind_divergent_log future divergent " << *p << dendl; - log.unindex(*p); } log.head = newhead; @@ -383,6 +382,7 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead if (info.last_complete > newhead) info.last_complete = newhead; + log.index(); for (list::iterator d = divergent.begin(); d != divergent.end(); ++d) merge_old_entry(t, *d, info, remove_snap); From 2fd4421707814d3563c84002d8b49b5a0c7a3dd5 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 22 Jul 2013 13:46:26 -0700 Subject: [PATCH 28/30] PGLog::merge_log, unidex() only works from tail, we index() below anyway Signed-off-by: Samuel Just Reviewed-by: Sage Weil --- src/osd/PGLog.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 2f2c83757db55..dac1f33fd91cd 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -505,7 +505,6 @@ void PGLog::merge_log(ObjectStore::Transaction& t, break; dout(10) << "merge_log divergent " << oe << dendl; divergent.push_front(oe); - log.unindex(oe); log.log.pop_back(); } From 6582b31abcb36a355b96d63c03ac17021c1e591f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 22 Jul 2013 12:41:40 -0700 Subject: [PATCH 29/30] FileStore: disable fd cacher and wbthrottle during replay The fd cache only works correctly when there is at most 1 inode per hobject_t. This condition is frequently violated during replay. Fixes: #5699 Signed-off-by: Samuel Just Reviewed-by: Sage Weil --- src/os/FDCache.h | 3 +++ src/os/FileStore.cc | 11 ++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/os/FDCache.h b/src/os/FDCache.h index cf07f860aa50e..f0f40e7bbf41b 100644 --- a/src/os/FDCache.h +++ b/src/os/FDCache.h @@ -28,6 +28,7 @@ * FD Cache */ class FDCache : public md_config_obs_t { +public: /** * FD * @@ -47,8 +48,10 @@ class FDCache : public md_config_obs_t { } }; +private: SharedLRU registry; CephContext *cct; + public: FDCache(CephContext *cct) : cct(cct) { assert(cct); diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 10f2b1f2aad45..17105c11d69b0 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -220,7 +220,8 @@ int FileStore::lfn_open(coll_t cid, r = get_index(cid, index); } Mutex::Locker l(fdcache_lock); - *outfd = fdcache.lookup(oid); + if (!replaying) + *outfd = fdcache.lookup(oid); if (*outfd) { return 0; } @@ -258,7 +259,10 @@ int FileStore::lfn_open(coll_t cid, goto fail; } } - *outfd = fdcache.add(oid, fd); + if (!replaying) + *outfd = fdcache.add(oid, fd); + else + *outfd = FDRef(new FDCache::FD(fd)); return 0; fail: @@ -3060,7 +3064,8 @@ int FileStore::_write(coll_t cid, const hobject_t& oid, r = bl.length(); // flush? - wbthrottle.queue_wb(fd, oid, offset, len, replica); + if (!replaying) + wbthrottle.queue_wb(fd, oid, offset, len, replica); lfn_close(fd); out: From 093182b79680994a0ccb7942aff6722e62905181 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 22 Jul 2013 17:19:31 -0700 Subject: [PATCH 30/30] osd/ReplicatedPG: drop repop refs in ~SnapTrimmer This fixes a leak on shutdown. Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- src/osd/ReplicatedPG.cc | 8 ++++++++ src/osd/ReplicatedPG.h | 1 + 2 files changed, 9 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 9c8d42dbf3c67..298d38d6ace64 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -7767,6 +7767,14 @@ void ReplicatedPG::_scrub_finish() #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() +ReplicatedPG::SnapTrimmer::~SnapTrimmer() +{ + while (!repops.empty()) { + (*repops.begin())->put(); + repops.erase(repops.begin()); + } +} + void ReplicatedPG::SnapTrimmer::log_enter(const char *state_name) { dout(20) << "enter " << state_name << dendl; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 0d4867f6e6ded..9dafe23faa1fb 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -982,6 +982,7 @@ class ReplicatedPG : public PG { bool need_share_pg_info; bool requeue; SnapTrimmer(ReplicatedPG *pg) : pg(pg), need_share_pg_info(false), requeue(false) {} + ~SnapTrimmer(); void log_enter(const char *state_name); void log_exit(const char *state_name, utime_t duration); } snap_trimmer_machine;