diff --git a/qa/tasks/cephfs/test_quiesce.py b/qa/tasks/cephfs/test_quiesce.py index da442381fa3781..7e9f2dfbfb55a9 100644 --- a/qa/tasks/cephfs/test_quiesce.py +++ b/qa/tasks/cephfs/test_quiesce.py @@ -188,14 +188,6 @@ def _verify_quiesce(self, rank=0, root=None, splitauth=False): visited = set() locks_expected = set([ "iquiesce", - "isnap", - "ipolicy", - "ifile", - "inest", - "idft", - "iauth", - "ilink", - "ixattr", ]) for inode in cache: ino = inode['ino'] @@ -212,21 +204,9 @@ def _verify_quiesce(self, rank=0, root=None, splitauth=False): for lock in op['type_data']['locks']: lock_type = lock['lock']['type'] if lock_type == "iquiesce": - if ino == root_ino: - self.assertEqual(lock['flags'], 1) - self.assertEqual(lock['lock']['state'], 'sync') - else: - self.assertEqual(lock['flags'], 4) - self.assertEqual(lock['lock']['state'], 'xlock') - elif lock_type == "isnap": - self.assertEqual(lock['flags'], 1) - self.assertEqual(lock['lock']['state'][:4], 'sync') - elif lock_type == "ifile": - self.assertEqual(lock['flags'], 1) - self.assertEqual(lock['lock']['state'][:4], 'sync') - elif lock_type in ("ipolicy", "inest", "idft", "iauth", "ilink", "ixattr"): - self.assertEqual(lock['flags'], 1) - self.assertEqual(lock['lock']['state'][:4], 'sync') + self.assertEqual(lock['flags'], 4) + self.assertEqual(lock['lock']['state'], 'lock') + self.assertEqual(lock['lock']['num_xlocks'], 1) else: # no iflock self.assertFalse(lock_type.startswith("i")) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 047a2dbfda6f44..4f6dd50d6da47d 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2882,6 +2882,7 @@ bool CInode::freeze_inode(int auth_pin_allowance) const static int lock_types[] = { CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT, CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0 + //TODO: add iquiesce here? }; for (int i = 0; lock_types[i]; ++i) { auto lock = get_lock(lock_types[i]); @@ -3532,13 +3533,23 @@ void CInode::export_client_caps(map& cl) } } +int CInode::get_caps_quiesce_mask() const +{ + if (is_quiesced()) { + // what we allow to our clients for a quiesced node + return CEPH_CAP_ANY_RD | CEPH_CAP_FILE_BUFFER | CEPH_CAP_PIN; + } else { + return CEPH_CAP_ANY; + } +} + // caps allowed int CInode::get_caps_liked() const { if (is_dir()) - return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER + return get_caps_quiesce_mask() & (CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED); // but not, say, FILE_RD|WR|WRBUFFER else - return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO; + return get_caps_quiesce_mask() & (CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO); } int CInode::get_caps_allowed_ever() const @@ -3558,30 +3569,33 @@ int CInode::get_caps_allowed_ever() const int CInode::get_caps_allowed_by_type(int type) const { - return + return get_caps_quiesce_mask() & ( CEPH_CAP_PIN | (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) | (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) | (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) | - (linklock.gcaps_allowed(type) << linklock.get_cap_shift()); + (linklock.gcaps_allowed(type) << linklock.get_cap_shift()) + ); } int CInode::get_caps_careful() const { - return + return get_caps_quiesce_mask() & ( (filelock.gcaps_careful() << filelock.get_cap_shift()) | (authlock.gcaps_careful() << authlock.get_cap_shift()) | (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) | - (linklock.gcaps_careful() << linklock.get_cap_shift()); + (linklock.gcaps_careful() << linklock.get_cap_shift()) + ); } int CInode::get_xlocker_mask(client_t client) const { - return + return get_caps_quiesce_mask() & ( (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) | (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) | (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) | - (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift()); + (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift()) + ); } int CInode::get_caps_allowed_for_client(Session *session, Capability *cap, @@ -3679,6 +3693,14 @@ int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const other |= p.second; //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; } + + // we adjust wanted caps to prevent unnecessary lock transitions + // don't worry, when the quiesce lock is dropped + // the whole thing will get evaluated again, with a fixed mask +// loner &= get_caps_quiesce_mask(); +// other &= get_caps_quiesce_mask(); +// w &= get_caps_quiesce_mask(); + if (ploner) *ploner = (loner >> shift) & mask; if (pother) *pother = (other >> shift) & mask; return (w >> shift) & mask; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 7589597a2412f8..a78c82f23fe572 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -661,6 +661,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counteris_file(); } bool is_symlink() const { return get_inode()->is_symlink(); } bool is_dir() const { return get_inode()->is_dir(); } + bool is_quiesced() const { return quiescelock.is_xlocked(); } bool is_head() const { return last == CEPH_NOSNAP; } @@ -866,6 +867,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter quiesce_op_flags; + // xlocks bool need_quiescelock = !skip_quiesce; for (size_t i = 0; i < lov.size(); ++i) { @@ -255,7 +259,16 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, SimpleLock *lock = p.lock; MDSCacheObject *object = lock->get_parent(); auto t = lock->get_type(); - + + if (t == CEPH_LOCK_IQUIESCE) { + quiesce_op_flags[object] = p.flags; + need_quiescelock = false; + // remove this lock from the vector, + // we'll add it back below + lov.erase(lov.begin() + i); + --i; + continue; + } if (p.is_xlock()) { if ((lock->get_type() == CEPH_LOCK_ISNAP || lock->get_type() == CEPH_LOCK_IPOLICY) && @@ -316,10 +329,10 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, case CEPH_LOCK_IQUIESCE: break; default: - CInode *in = static_cast(object); - if (need_quiescelock) { - need_quiescelock = false; - lov.add_rdlock(&in->quiescelock, i + 1); + CInode* in = static_cast(object); + if (need_quiescelock && (lock->get_cap_shift() > 0)) { + dout(15) << "need shared quiesce lock for " << p << " on " << SimpleLock::get_lock_type_name(t) << " of " << in << dendl; + quiesce_op_flags[in] |= LockOp::WRLOCK; } if (!in->is_auth()) continue; @@ -347,19 +360,17 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, << " in case we need to request a scatter" << dendl; mustpin.insert(object); } - if (need_quiescelock && is_inode_lock(t) && t != CEPH_LOCK_IQUIESCE) { - CInode *in = static_cast(object); - lov.add_rdlock(&in->quiescelock, i + 1); - need_quiescelock = false; + if (need_quiescelock && (lock->get_cap_shift() > 0)) { + dout(15) << "need shared quiesce lock for " << p << " on " << SimpleLock::get_lock_type_name(t) << " of " << object << dendl; + quiesce_op_flags[object] |= LockOp::WRLOCK; } } else if (p.is_remote_wrlock()) { dout(20) << " must remote_wrlock on mds." << p.wrlock_target << " " << *lock << " " << *object << dendl; mustpin.insert(object); - if (need_quiescelock && is_inode_lock(t) && t != CEPH_LOCK_IQUIESCE) { - CInode *in = static_cast(object); - lov.add_rdlock(&in->quiescelock, i + 1); - need_quiescelock = false; + if (need_quiescelock && (lock->get_cap_shift() > 0)) { + dout(15) << "need shared quiesce lock for " << p << " on " << SimpleLock::get_lock_type_name(t) << " of " << object << dendl; + quiesce_op_flags[object] |= LockOp::WRLOCK; } } else if (p.is_rdlock()) { dout(20) << " must rdlock " << *lock << " " << *object << dendl; @@ -385,26 +396,46 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, * xlocked, then all locks are dropped (s.f. * Locker::handle_quiesce_failure). So adding the quiescelock can never * contribute to deadlock. + * + * UPD - we can further simplify and say that no read operation + * should be taking the quiesce lock. We'll have to handle it a bit + * differently when we decide to implement an exclusive quiesce */ - if (need_quiescelock && !mdr->is_rdlocked(lock)) { - /* Can we get the lock without waiting? */ - if (!lock->can_rdlock(client)) { - /* To prevent deadlock where an op holds a parent snaplock - * (Locker::try_rdlock_snap_layout), add quiescelock. - */ - CInode *in = static_cast(object); - lov.add_rdlock(&in->quiescelock, i + 1); - need_quiescelock = false; - } - } + // if (need_quiescelock && !mdr->is_rdlocked(lock) && (lock->get_cap_shift() > 0)) { + // /* Can we get the lock without waiting? */ + // if (!lock->can_rdlock(client)) { + // /* To prevent deadlock where an op holds a parent snaplock + // * (Locker::try_rdlock_snap_layout), add quiescelock. + // */ + // quiesce_op_flags[object] |= LockOp::WRLOCK; + // } + // } } else { ceph_assert(0 == "locker unknown lock operation"); } } lov.sort_and_merge(); - + + bool injected_quiesce_lock = false; + for (auto &[co, flags]: quiesce_op_flags){ + CInode *in = static_cast(co); + if (flags & LockOp::XLOCK) { + dout(15) << "injecting an exclusive quiesce lock for inode " << in << " under mdr: " << mdr << dendl; + lov.add_xlock(&in->quiescelock, 0); + injected_quiesce_lock = true; + } else if (flags & LockOp::WRLOCK) { + dout(15) << "injecting a shared quiesce lock for inode " << in << " under mdr: " << mdr << dendl; + lov.add_wrlock(&in->quiescelock, 0); + injected_quiesce_lock = true; + } + } + + if (!injected_quiesce_lock) { + dout(20) << "no quiesce lock taken for mdr: " << mdr << dendl; + } + // AUTH PINS map > mustpin_remote; // mds -> (object set) @@ -575,11 +606,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, if (mdr->locking && lock != mdr->locking) cancel_locking(mdr.get(), &issue_set); if (!xlock_start(lock, mdr)) { - if (t == CEPH_LOCK_IQUIESCE) { - handle_quiesce_failure(mdr, marker.message); - } else { - marker.message = "failed to xlock, waiting"; - } + marker.message = "failed to xlock, waiting"; goto out; } dout(10) << " got xlock on " << *lock << " " << *lock->get_parent() << dendl; @@ -655,11 +682,8 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, } if (!rdlock_start(lock, mdr)) { - if (t == CEPH_LOCK_IQUIESCE) { - handle_quiesce_failure(mdr, marker.message); - } else { - marker.message = "failed to rdlock, waiting"; - } + ceph_assert(t != CEPH_LOCK_IQUIESCE); // rdlock is undefined for LocalLock + marker.message = "failed to rdlock, waiting"; goto out; } dout(10) << " got rdlock on " << *lock << " " << *lock->get_parent() << dendl; @@ -865,7 +889,7 @@ void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut) issue_caps_set(need_issue); } -void Locker::drop_rdlock(MutationImpl* mut, SimpleLock* what) +void Locker::drop_lock(MutationImpl* mut, SimpleLock* what) { dout(20) << __func__ << ": " << *what << dendl; @@ -873,9 +897,14 @@ void Locker::drop_rdlock(MutationImpl* mut, SimpleLock* what) auto* lock = it->lock; if (lock == what) { dout(20) << __func__ << ": found lock " << *lock << dendl; - ceph_assert(it->is_rdlock()); bool ni = false; - rdlock_finish(it, mut, &ni); + if (it->is_xlock()) { + xlock_finish(it, mut, &ni); + } else if (it->is_wrlock()) { + wrlock_finish(it, mut, &ni); + } else if (it->is_rdlock()) { + rdlock_finish(it, mut, &ni); + } if (ni) { set need_issue; need_issue.insert(static_cast(lock->get_parent())); @@ -1450,8 +1479,7 @@ bool Locker::eval(CInode *in, int mask, bool caps_imported) eval_any(&in->flocklock, &need_issue, &finishers, caps_imported); if (mask & CEPH_LOCK_IPOLICY) eval_any(&in->policylock, &need_issue, &finishers, caps_imported); - if (mask & CEPH_LOCK_IQUIESCE) - eval_any(&in->quiescelock, &need_issue, &finishers, caps_imported); + // LocalLocks should not be eval'd // drop loner? if (in->is_auth() && in->is_head() && in->get_wanted_loner() != in->get_loner()) { @@ -1841,6 +1869,7 @@ void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut) switch (lock->get_type()) { case CEPH_LOCK_DVERSION: case CEPH_LOCK_IVERSION: + case CEPH_LOCK_IQUIESCE: return local_wrlock_grab(static_cast(lock), mut); default: break; @@ -1892,6 +1921,7 @@ bool Locker::wrlock_start(const MutationImpl::LockOp &op, const MDRequestRef& mu switch (lock->get_type()) { case CEPH_LOCK_DVERSION: case CEPH_LOCK_IVERSION: + case CEPH_LOCK_IQUIESCE: return local_wrlock_start(static_cast(lock), mut); default: break; @@ -1957,6 +1987,7 @@ void Locker::wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl * switch (lock->get_type()) { case CEPH_LOCK_DVERSION: case CEPH_LOCK_IVERSION: + case CEPH_LOCK_IQUIESCE: return local_wrlock_finish(it, mut); default: break; @@ -2045,6 +2076,7 @@ bool Locker::xlock_start(SimpleLock *lock, const MDRequestRef& mut) switch (lock->get_type()) { case CEPH_LOCK_DVERSION: case CEPH_LOCK_IVERSION: + case CEPH_LOCK_IQUIESCE: return local_xlock_start(static_cast(lock), mut); default: break; @@ -2171,6 +2203,7 @@ void Locker::xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *m switch (lock->get_type()) { case CEPH_LOCK_DVERSION: case CEPH_LOCK_IVERSION: + case CEPH_LOCK_IQUIESCE: return local_xlock_finish(it, mut); default: break; @@ -2981,10 +3014,13 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, } else if (!force_wrlock && !in->filelock.can_wrlock(in->get_loner())) { // lock? if (in->filelock.is_stable()) { - if (in->get_target_loner() >= 0) - file_excl(&in->filelock); - else - simple_lock(&in->filelock); + auto wanted = in->get_caps_wanted(); + if (in->get_target_loner() >= 0 && (wanted & CEPH_CAP_ANY_FILE_WR)) { + dout(10) << "check_inode_max_size requesting file_excl for wanted caps " << ccap_string(wanted) << " " << *in << dendl; + file_excl(&in->filelock); + } else { + simple_lock(&in->filelock); + } } if (!in->filelock.can_wrlock(in->get_loner())) { dout(10) << "check_inode_max_size can't wrlock, waiting on " << *in << dendl; @@ -3965,7 +4001,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap, if (in->is_file()) { bool forced_change_max = false; dout(20) << "inode is file" << dendl; - if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) { + if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR & in->get_caps_quiesce_mask())) { dout(20) << "client has write caps; m->get_max_size=" << m->get_max_size() << "; old_max=" << old_max << dendl; if (m->get_max_size() > new_max) { @@ -5548,6 +5584,14 @@ void Locker::local_xlock_finish(const MutationImpl::lock_iterator& it, MutationI lock->put_xlock(); mut->locks.erase(it); + if (lock->get_type() == CEPH_LOCK_IQUIESCE) { + auto in = static_cast(lock->get_parent()); + set issue_set { in }; + // reevaluate everything related related to caps + eval_cap_gather(in, &issue_set); + issue_caps_set(issue_set); + } + lock->finish_waiters(SimpleLock::WAIT_STABLE | SimpleLock::WAIT_WR | SimpleLock::WAIT_RD); @@ -5593,7 +5637,7 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue) << " other_issued=" << gcap_string(other_issued) << " xlocker_issued=" << gcap_string(xlocker_issued) << dendl; - if (!((loner_wanted|loner_issued) & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) || + if (!((loner_wanted|loner_issued) & (CEPH_CAP_ANY_FILE_WR >> CEPH_CAP_SFILE)) || (other_wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GRD)) || (in->is_dir() && in->multiple_nonstale_caps())) { // FIXME.. :/ dout(20) << " should lose it" << dendl; @@ -5626,7 +5670,7 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue) in->get_target_loner() >= 0 && (in->is_dir() ? !in->has_subtree_or_exporting_dirfrag() : - (wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))) { + (wanted & (CEPH_CAP_ANY_FILE_WR >> CEPH_CAP_SFILE)))) { dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << dendl; file_excl(lock, need_issue); diff --git a/src/mds/Locker.h b/src/mds/Locker.h index cfc0d9ace9a1ec..0a500f09be15be 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -67,7 +67,7 @@ class Locker { void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false); void drop_non_rdlocks(MutationImpl *mut, std::set *pneed_issue=0); void drop_rdlocks_for_early_reply(MutationImpl *mut); - void drop_rdlock(MutationImpl* mut, SimpleLock* what); + void drop_lock(MutationImpl* mut, SimpleLock* what); void drop_locks_for_fragment_unfreeze(MutationImpl *mut); int get_cap_bit_for_lock_cache(int op); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 6cc7140f902991..a84836a326cea9 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "MDCache.h" @@ -13526,63 +13528,53 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) mds->server->respond_to_request(mdr, -CEPHFS_ENOENT); return; } - const bool is_root = (mdr->get_filepath().get_ino() == mdr->get_filepath2().get_ino()); dout(20) << __func__ << " " << *mdr << " quiescing " << *in << dendl; - { - /* Acquire authpins on `in` to prevent migrations after this rank considers - * it (and its children) quiesced. - */ - - MutationImpl::LockOpVec lov; - if (!mds->locker->acquire_locks(mdr, lov, nullptr, {in}, false, true)) { - return; - } - } - - /* TODO: Consider: - * - * rank0 is auth for /foo - * rank1 quiesces /foo with no dirents in cache (and stops) - * rank0 begins quiescing /foo - * rank0 exports a dirfrag of /foo/bar to rank1 (/foo/bar is not authpinned by rank1 nor by rank0 (yet)) - * rank1 discovers relevant paths in /foo/bar - * rank1 now has /foo/bar in cache and may issue caps / execute operations - * - * The solution is probably to have rank1 mark /foo has STATE_QUIESCED and reject export ops from rank0. - */ - if (in->is_auth()) { - /* Acquire rdlocks on anything which prevents writing. - * - * Because files are treated specially allowing multiple reader/writers, we - * need an xlock here to recall all write caps. This unfortunately means - * there can be no readers. + /* Acquire all caps-related locks to revoke unwanted caps * * The xlock on the quiescelock is important to prevent future requests * from blocking on other inode locks while holding path traversal locks. * See dev doc doc/dev/mds_internals/quiesce.rst for more details. */ - MutationImpl::LockOpVec lov; - lov.add_rdlock(&in->authlock); - lov.add_rdlock(&in->dirfragtreelock); + // take the quiesce lock and the first of the four capability releated locks + lov.add_xlock(&in->quiescelock); /* !! */ + lov.add_rdlock(&in->xattrlock); + + if (!mds->locker->acquire_locks(mdr, lov, nullptr, { in }, false, true)) { + return; + } + + // now we can test if this inode should be skipped + // don't take the projected inode because + // we should only care about the committed state + if (in->get_inode()->get_quiesce_block()) { + dout(10) << __func__ << " quiesce is blocked for this inode; dropping locks!" << dendl; + mdr->mark_event("quiesce blocked"); + mds->locker->drop_locks(mdr.get()); + /* keep authpins! */ + qs.inc_inodes_blocked(); + mdr->internal_op_finish->complete(0); + mdr->internal_op_finish = nullptr; + return; + } + + // we are adding the rest of the rdlocks + // this is safe because we only take rdlocks for all four locks + // if any lock is attempted for wr or x, then quiesce lock would be + // injected automatically by the `acquire_locks` and we already hold + // that as xlock so those can't be holding any of the write locks to the below + + // rdlock on the filelock is sufficient, + // because the REQ state for rd on LOCK_MIX and XCL on LOCK_EXCL + // aren't rdlockable. See SimpleLock::can_rdlock lov.add_rdlock(&in->filelock); + lov.add_rdlock(&in->authlock); lov.add_rdlock(&in->linklock); - lov.add_rdlock(&in->nestlock); - lov.add_rdlock(&in->policylock); - // N.B.: NO xlock/wrlock on quiescelock; we need to allow access to mksnap/lookup - // This is an unfortunate inconsistency. It may be possible to circumvent - // this issue by having those ops acquire the quiscelock only if necessary. - if (is_root) { - lov.add_rdlock(&in->quiescelock); - } else { - lov.add_xlock(&in->quiescelock); /* !! */ - } - lov.add_rdlock(&in->snaplock); - lov.add_rdlock(&in->xattrlock); + if (!mds->locker->acquire_locks(mdr, lov, nullptr, {in}, false, true)) { return; } @@ -13593,17 +13585,50 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) return; } - if (in->get_projected_inode()->get_quiesce_block()) { - dout(10) << __func__ << " quiesce is blocked for this inode; dropping locks!" << dendl; - mdr->mark_event("quiesce blocked"); - mds->locker->drop_locks(mdr.get()); - /* keep authpins! */ - qs.inc_inodes_blocked(); - mdr->internal_op_finish->complete(0); - mdr->internal_op_finish = nullptr; - return; + // since we're holding onto the quiesce lock, + // we don't need the other capabilities related locks anymore + // as the quiesce mask will prevent any non-shared capabilities from being issued + // see CInode::get_caps_quiesce_mask() + // we actually don't need to hold any other lock but the quiesce lock + + MutationImpl::LockOpVec drop_ops; + std::ranges::copy_if( + mdr->locks, + std::back_inserter(drop_ops), + [&in](auto &&op) { return op.lock != &in->quiescelock; } + ); + + dout(20) << __func__ << "dropping locks: " << drop_ops << dendl; + + for (auto op: drop_ops) { + mds->locker->drop_lock(mdr.get(), op.lock); } + // use the lambda for completion so that we don't have to go + // through the whole lock/drop story above + auto did_quiesce = new LambdaContext([in, mdr=mdr, &qs, mds = mds, func = __func__](int rc) { + /* do not respond/complete so locks are not lost, parent request will complete */ + if (mdr->killed) { + dout(20) << func << " " << *mdr << " not dispatching killed " << *mdr << dendl; + return; + } else if (mdr->internal_op_finish == nullptr) { + dout(20) << func << " " << *mdr << " already finished quiesce" << dendl; + return; + } + + if (in->is_auth()) { + dout(10) << func << " " << *mdr << " quiesce complete of " << *in << dendl; + mdr->mark_event("quiesce complete"); + } else { + dout(10) << func << " " << *mdr << " non-auth quiesce complete of " << *in << dendl; + mdr->mark_event("quiesce complete for non-auth inode"); + } + + qs.inc_inodes_quiesced(); + mdr->internal_op_finish->complete(rc); + mdr->internal_op_finish = nullptr; + }); + if (in->is_dir()) { for (auto& dir : in->get_dirfrags()) { if (!dir->is_auth() && !splitauth) { @@ -13614,15 +13639,16 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) } } uint64_t count = 0; - MDSGatherBuilder gather(g_ceph_context, new C_MDS_RetryRequest(this, mdr)); + MDSGatherBuilder gather(g_ceph_context, new MDSInternalContextWrapper(mds, did_quiesce)); auto& qops = qrmdr->more()->quiesce_ops; for (auto& dir : in->get_dirfrags()) { for (auto& [dnk, dn] : *dir) { - auto* in = dn->get_projected_inode(); + // don't take the projected linkage because + // we should only care about the committed state + auto in = dn->get_linkage()->inode; if (!in) { continue; } - if (auto it = qops.find(in); it != qops.end()) { dout(25) << __func__ << ": existing quiesce metareqid: " << it->second << dendl; if (auto reqit = active_requests.find(it->second); reqit != active_requests.end()) { @@ -13658,19 +13684,7 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) } } - if (in->is_auth()) { - dout(10) << __func__ << " " << *mdr << " quiesce complete of " << *in << dendl; - mdr->mark_event("quiesce complete"); - } else { - dout(10) << __func__ << " " << *mdr << " non-auth quiesce complete of " << *in << dendl; - mdr->mark_event("quiesce complete for non-auth inode"); - } - - qs.inc_inodes_quiesced(); - mdr->internal_op_finish->complete(0); - mdr->internal_op_finish = nullptr; - - /* do not respond/complete so locks are not lost, parent request will complete */ + did_quiesce->complete(0); } void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr) @@ -13740,7 +13754,6 @@ void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr) } else if (auto& qops = mdr->more()->quiesce_ops; qops.count(diri) == 0) { MDRequestRef qimdr = request_start_internal(CEPH_MDS_OP_QUIESCE_INODE); qimdr->set_filepath(filepath(diri->ino())); - qimdr->set_filepath2(filepath(diri->ino())); /* is_root! */ qimdr->internal_op_finish = new C_MDS_RetryRequest(this, mdr); qimdr->internal_op_private = qfinisher; qops[diri] = qimdr->reqid; diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index 4b9d0d7528c898..ce2a6eb32eac73 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -51,6 +51,7 @@ struct MutationImpl : public TrackedOp { mds_rank_t remote_auth_pinned = MDS_RANK_NONE; }; + // held locks struct LockOp { enum { diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index c61bc4b2cd04bf..d2c40895512a21 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -41,7 +41,6 @@ struct LockType { explicit LockType(int t) : type(t) { switch (type) { case CEPH_LOCK_DN: - case CEPH_LOCK_IQUIESCE: case CEPH_LOCK_IAUTH: case CEPH_LOCK_ILINK: case CEPH_LOCK_IXATTR: @@ -59,6 +58,7 @@ struct LockType { break; case CEPH_LOCK_DVERSION: case CEPH_LOCK_IVERSION: + case CEPH_LOCK_IQUIESCE: sm = &sm_locallock; break; default: