From f1b6bd7988ab964c9167eff7bea51a49573f5175 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 13 Jun 2013 18:15:39 -0700 Subject: [PATCH] osd: EINVAL from truncate causes osd to crash Maximum object size is 100GB configurable with osd_max_object_size Error EFBIG if attempt to WRITE/WRITEFULL/TRUNCATE beyond osd_max_object_size Error EINVAL if length < 1 for WRITE/WRITEFULL/ZERO Make ZERO beyond existing size a no-op Fixes: #5252 Fixes: #5340 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- PendingReleaseNotes | 6 ++++++ src/common/config_opts.h | 2 ++ src/osd/ReplicatedPG.cc | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 5e4b99f4b14f1..5b08cd761be68 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -24,3 +24,9 @@ v0.65 ceph osd crush add osd.123 ceph osd crush add 123 + +* There is now a maximum RADOS object size, configurable via 'osd max + object size', defaulting to 100 GB. Note that this has no effect on + RBD, CephFS, or radosgw, which all stripe over objects. 
+ + \ No newline at end of file diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 8a02dd5c98eca..094d12401c5cd 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -485,6 +485,8 @@ OPTION(osd_recovery_op_priority, OPT_INT, 10) // Max time to wait between notifying mon of shutdown and shutting down OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5) +OPTION(osd_max_object_size, OPT_U64, 100ULL*1024*1024*1024) // OSD's maximum object size (ULL keeps the product 64-bit where long is 32 bits) + OPTION(filestore, OPT_BOOL, false) /// filestore wb throttle limits diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 60e0c889932e8..5f4aa6a05257f 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1953,6 +1953,19 @@ int ReplicatedPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd return result; } +static int check_offset_and_length(__le64 offset, __le64 length) // returns 0 if the extent is acceptable, -EINVAL if it is empty, -EFBIG if it reaches past osd_max_object_size +{ + if (length < 1) + return -EINVAL; + + if (offset >= g_conf->osd_max_object_size || + length > g_conf->osd_max_object_size || + length > g_conf->osd_max_object_size - offset) // overflow-safe: offset is below the limit here, so the subtraction cannot wrap + return -EFBIG; + + return 0; +} + int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) { int result = 0; @@ -2005,7 +2018,14 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) // munge ZERO -> TRUNCATE? 
(don't munge to DELETE or we risk hosing attributes) if (op.op == CEPH_OSD_OP_ZERO && obs.exists && + op.extent.offset < g_conf->osd_max_object_size && + op.extent.length >= 1 && + op.extent.length <= g_conf->osd_max_object_size && op.extent.offset + op.extent.length >= oi.size) { + if (op.extent.offset >= oi.size) { + // no-op + goto fail; + } dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl; op.op = CEPH_OSD_OP_TRUNCATE; @@ -2517,6 +2537,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) oi.truncate_size = op.extent.truncate_size; } } + result = check_offset_and_length(op.extent.offset, op.extent.length); + if (result < 0) + break; bufferlist nbl; bp.copy(op.extent.length, nbl); t.write(coll, soid, op.extent.offset, op.extent.length, nbl); @@ -2531,6 +2554,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) case CEPH_OSD_OP_WRITEFULL: { // write full object + result = check_offset_and_length(op.extent.offset, op.extent.length); + if (result < 0) + break; bufferlist nbl; bp.copy(op.extent.length, nbl); if (obs.exists) { @@ -2560,6 +2586,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) case CEPH_OSD_OP_ZERO: { // zero + result = check_offset_and_length(op.extent.offset, op.extent.length); + if (result < 0) + break; assert(op.extent.length); if (obs.exists) { t.zero(coll, soid, op.extent.offset, op.extent.length); @@ -2618,6 +2647,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) break; } + if (op.extent.offset > g_conf->osd_max_object_size) { + result = -EFBIG; + break; + } + if (op.extent.truncate_seq) { assert(op.extent.offset == op.extent.truncate_size); if (op.extent.truncate_seq <= oi.truncate_seq) {