Skip to content

Commit

Permalink
mds/quiesce: use LocalLock for iquiesce and a caps mask for a lock-fr…
Browse files Browse the repository at this point in the history
…ee quiesce
  • Loading branch information
leonid-s-usov committed Feb 18, 2024
1 parent 7e76beb commit b7d4f96
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 135 deletions.
23 changes: 3 additions & 20 deletions qa/tasks/cephfs/test_quiesce.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,14 +188,6 @@ def _verify_quiesce(self, rank=0, root=None, splitauth=False):
visited = set()
locks_expected = set([
"iquiesce",
"isnap",
"ipolicy",
"ifile",
"inest",
"idft",
"iauth",
"ilink",
"ixattr",
])
for inode in cache:
ino = inode['ino']
Expand All @@ -212,12 +204,8 @@ def _verify_quiesce(self, rank=0, root=None, splitauth=False):
for lock in op['type_data']['locks']:
lock_type = lock['lock']['type']
if lock_type == "iquiesce":
if ino == root_ino:
self.assertEqual(lock['flags'], 1)
self.assertEqual(lock['lock']['state'], 'sync')
else:
self.assertEqual(lock['flags'], 4)
self.assertEqual(lock['lock']['state'], 'xlock')
self.assertEqual(lock['flags'], 4)
self.assertEqual(lock['lock']['state'], 'xlock')
elif lock_type == "isnap":
self.assertEqual(lock['flags'], 1)
self.assertEqual(lock['lock']['state'][:4], 'sync')
Expand All @@ -228,14 +216,9 @@ def _verify_quiesce(self, rank=0, root=None, splitauth=False):
else:
self.assertEqual(lock['flags'], 1)
self.assertEqual(lock['lock']['state'][:4], 'sync')
elif lock_type in ("ipolicy", "inest", "idft", "iauth", "ilink", "ixattr"):
elif lock_type in ("iauth", "ilink", "ixattr"):
self.assertEqual(lock['flags'], 1)
self.assertEqual(lock['lock']['state'][:4], 'sync')
elif lock_type == "iversion":
# only regular files acquire xlock (on ifile)
self.assertEqual((mode & S_IFMT), S_IFREG)
self.assertEqual(lock['flags'], 2)
self.assertEqual(lock['lock']['state'][:4], 'lock')
else:
# no iflock
self.assertFalse(lock_type.startswith("i"))
Expand Down
29 changes: 15 additions & 14 deletions src/include/ceph_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,22 +333,23 @@ extern const char *ceph_mds_state_name(int s);
*/
#define CEPH_LOCK_DN (1 << 0)
#define CEPH_LOCK_DVERSION (1 << 1)
#define CEPH_LOCK_IQUIESCE (1 << 4) /* mds internal */
#define CEPH_LOCK_ISNAP (1 << 5) /* snapshot lock. MDS internal */
#define CEPH_LOCK_IPOLICY (1 << 6) /* policy lock on dirs. MDS internal */
#define CEPH_LOCK_IFILE (1 << 7)
#define CEPH_LOCK_INEST (1 << 8) /* mds internal */
#define CEPH_LOCK_IDFT (1 << 9) /* dir frag tree */
#define CEPH_LOCK_IAUTH (1 << 10)
#define CEPH_LOCK_ILINK (1 << 11)
#define CEPH_LOCK_IXATTR (1 << 12)
#define CEPH_LOCK_IFLOCK (1 << 13) /* advisory file locks */
#define CEPH_LOCK_IVERSION (1 << 14) /* mds internal */

#define CEPH_LOCK_IFIRST CEPH_LOCK_IQUIESCE
#define CEPH_LOCK_ISNAP (1 << 4) /* snapshot lock. MDS internal */
#define CEPH_LOCK_IPOLICY (1 << 5) /* policy lock on dirs. MDS internal */
#define CEPH_LOCK_IFILE (1 << 6)
#define CEPH_LOCK_INEST (1 << 7) /* mds internal */
#define CEPH_LOCK_IDFT (1 << 8) /* dir frag tree */
#define CEPH_LOCK_IAUTH (1 << 9)
#define CEPH_LOCK_ILINK (1 << 10)
#define CEPH_LOCK_IXATTR (1 << 11)
#define CEPH_LOCK_IFLOCK (1 << 12) /* advisory file locks */
#define CEPH_LOCK_IVERSION (1 << 13) /* mds internal */
#define CEPH_LOCK_IQUIESCE (1 << 14) /* mds internal */

#define CEPH_LOCK_IFIRST CEPH_LOCK_ISNAP
#define CEPH_LOCK_ILAST CEPH_LOCK_IVERSION

static inline bool is_inode_lock(int l) {
static inline bool is_inode_lock(int l)
{
return (CEPH_LOCK_IFIRST <= l && l <= CEPH_LOCK_ILAST);
}

Expand Down
25 changes: 19 additions & 6 deletions src/mds/CInode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2882,6 +2882,7 @@ bool CInode::freeze_inode(int auth_pin_allowance)
const static int lock_types[] = {
CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
//TODO: add iquiesce here?
};
for (int i = 0; lock_types[i]; ++i) {
auto lock = get_lock(lock_types[i]);
Expand Down Expand Up @@ -3532,6 +3533,15 @@ void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
}
}

int CInode::get_caps_quiesce_mask() const
{
if (quiescelock.can_wrlock()) {
return CEPH_CAP_ANY;
} else {
return CEPH_CAP_ANY_SHARED | CEPH_CAP_PIN /*?*/;
}
}

// caps allowed
int CInode::get_caps_liked() const
{
Expand All @@ -3558,30 +3568,33 @@ int CInode::get_caps_allowed_ever() const

int CInode::get_caps_allowed_by_type(int type) const
{
return
return get_caps_quiesce_mask() & (
CEPH_CAP_PIN |
(filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
(authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
(xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
(linklock.gcaps_allowed(type) << linklock.get_cap_shift());
(linklock.gcaps_allowed(type) << linklock.get_cap_shift())
);
}

int CInode::get_caps_careful() const
{
return
return get_caps_quiesce_mask() & (
(filelock.gcaps_careful() << filelock.get_cap_shift()) |
(authlock.gcaps_careful() << authlock.get_cap_shift()) |
(xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
(linklock.gcaps_careful() << linklock.get_cap_shift());
(linklock.gcaps_careful() << linklock.get_cap_shift())
);
}

int CInode::get_xlocker_mask(client_t client) const
{
return
return get_caps_quiesce_mask() & (
(filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
(authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
(xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
(linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
(linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift())
);
}

int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
Expand Down
4 changes: 3 additions & 1 deletion src/mds/CInode.h
Original file line number Diff line number Diff line change
Expand Up @@ -1105,7 +1105,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
* quiescelock.
*/

SimpleLock quiescelock; // FIXME not part of mempool
LocalLockC quiescelock; // FIXME not part of mempool
LocalLockC versionlock; // FIXME not part of mempool
SimpleLock authlock; // FIXME not part of mempool
SimpleLock linklock; // FIXME not part of mempool
Expand Down Expand Up @@ -1171,6 +1171,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
clear_flock_lock_state();
}

int get_caps_quiesce_mask() const;

/**
* Return the pool ID where we currently write backtraces for
* this inode (in addition to inode.old_pools)
Expand Down
107 changes: 78 additions & 29 deletions src/mds/Locker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,27 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
if (auth_pin_freeze)
mustpin.insert(auth_pin_freeze);

using LockOp = MutationImpl::LockOp;

std::unordered_map<MDSCacheObject*, unsigned> quiesce_op_flags;

// xlocks
bool need_quiescelock = !skip_quiesce;
for (size_t i = 0; i < lov.size(); ++i) {
auto& p = lov[i];
SimpleLock *lock = p.lock;
MDSCacheObject *object = lock->get_parent();
auto t = lock->get_type();


if (t == CEPH_LOCK_IQUIESCE) {
quiesce_op_flags[object] = p.flags;
need_quiescelock = false;
// remove this lock from the vector,
// we'll add it back below
lov.erase(lov.begin() + i);
--i;
continue;
}
if (p.is_xlock()) {
if ((lock->get_type() == CEPH_LOCK_ISNAP ||
lock->get_type() == CEPH_LOCK_IPOLICY) &&
Expand Down Expand Up @@ -316,10 +329,10 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
case CEPH_LOCK_IQUIESCE:
break;
default:
CInode *in = static_cast<CInode*>(object);
if (need_quiescelock) {
need_quiescelock = false;
lov.add_rdlock(&in->quiescelock, i + 1);
CInode* in = static_cast<CInode*>(object);
if (need_quiescelock && (lock->get_cap_shift() > 0)) {
dout(15) << "need shared quiesce lock for " << p << " on " << SimpleLock::get_lock_type_name(t) << " of " << in << dendl;
quiesce_op_flags[in] |= LockOp::WRLOCK;
}
if (!in->is_auth())
continue;
Expand Down Expand Up @@ -347,19 +360,17 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
<< " in case we need to request a scatter" << dendl;
mustpin.insert(object);
}
if (need_quiescelock && is_inode_lock(t) && t != CEPH_LOCK_IQUIESCE) {
CInode *in = static_cast<CInode*>(object);
lov.add_rdlock(&in->quiescelock, i + 1);
need_quiescelock = false;
if (need_quiescelock && (lock->get_cap_shift() > 0)) {
dout(15) << "need shared quiesce lock for " << p << " on " << SimpleLock::get_lock_type_name(t) << " of " << object << dendl;
quiesce_op_flags[object] |= LockOp::WRLOCK;
}
} else if (p.is_remote_wrlock()) {
dout(20) << " must remote_wrlock on mds." << p.wrlock_target << " "
<< *lock << " " << *object << dendl;
mustpin.insert(object);
if (need_quiescelock && is_inode_lock(t) && t != CEPH_LOCK_IQUIESCE) {
CInode *in = static_cast<CInode*>(object);
lov.add_rdlock(&in->quiescelock, i + 1);
need_quiescelock = false;
if (need_quiescelock && (lock->get_cap_shift() > 0)) {
dout(15) << "need shared quiesce lock for " << p << " on " << SimpleLock::get_lock_type_name(t) << " of " << object << dendl;
quiesce_op_flags[object] |= LockOp::WRLOCK;
}
} else if (p.is_rdlock()) {
dout(20) << " must rdlock " << *lock << " " << *object << dendl;
Expand All @@ -385,26 +396,46 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
* xlocked, then all locks are dropped (s.f.
* Locker::handle_quiesce_failure). So adding the quiescelock can never
* contribute to deadlock.
*
* UPD - we can further simplify and say that no read operation
* should be taking the quiesce lock. We'll have to handle it a bit
* differently when we decide to implement an exclusive quiesce
*/

if (need_quiescelock && !mdr->is_rdlocked(lock)) {
/* Can we get the lock without waiting? */
if (!lock->can_rdlock(client)) {
/* To prevent deadlock where an op holds a parent snaplock
* (Locker::try_rdlock_snap_layout), add quiescelock.
*/
CInode *in = static_cast<CInode*>(object);
lov.add_rdlock(&in->quiescelock, i + 1);
need_quiescelock = false;
}
}
// if (need_quiescelock && !mdr->is_rdlocked(lock) && (lock->get_cap_shift() > 0)) {
// /* Can we get the lock without waiting? */
// if (!lock->can_rdlock(client)) {
// /* To prevent deadlock where an op holds a parent snaplock
// * (Locker::try_rdlock_snap_layout), add quiescelock.
// */
// quiesce_op_flags[object] |= LockOp::WRLOCK;
// }
// }
} else {
ceph_assert(0 == "locker unknown lock operation");
}
}

lov.sort_and_merge();


bool injected_quiesce_lock = false;
for (auto &[co, flags]: quiesce_op_flags){
CInode *in = static_cast<CInode*>(co);
if (flags & LockOp::XLOCK) {
dout(15) << "injecting an exclusive quiesce lock for inode " << in << " under mdr: " << mdr << dendl;
lov.add_xlock(&in->quiescelock, 0);
injected_quiesce_lock = true;
} else if (flags & LockOp::WRLOCK) {
dout(15) << "injecting a shared quiesce lock for inode " << in << " under mdr: " << mdr << dendl;
lov.add_wrlock(&in->quiescelock, 0);
injected_quiesce_lock = true;
}
}

if (!injected_quiesce_lock) {
dout(20) << "no quiesce lock taken for mdr: " << mdr << dendl;
}

// AUTH PINS
map<mds_rank_t, set<MDSCacheObject*> > mustpin_remote; // mds -> (object set)

Expand Down Expand Up @@ -576,7 +607,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
cancel_locking(mdr.get(), &issue_set);
if (!xlock_start(lock, mdr)) {
if (t == CEPH_LOCK_IQUIESCE) {
handle_quiesce_failure(mdr, marker.message);
marker.message = "failed to xlock quiesce, waiting";
} else {
marker.message = "failed to xlock, waiting";
}
Expand Down Expand Up @@ -865,17 +896,22 @@ void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut)
issue_caps_set(need_issue);
}

void Locker::drop_rdlock(MutationImpl* mut, SimpleLock* what)
void Locker::drop_lock(MutationImpl* mut, SimpleLock* what)
{
dout(20) << __func__ << ": " << *what << dendl;

for (auto it = mut->locks.begin(); it != mut->locks.end(); ++it) {
auto* lock = it->lock;
if (lock == what) {
dout(20) << __func__ << ": found lock " << *lock << dendl;
ceph_assert(it->is_rdlock());
bool ni = false;
rdlock_finish(it, mut, &ni);
if (it->is_xlock()) {
xlock_finish(it, mut, &ni);
} else if (it->is_wrlock()) {
wrlock_finish(it, mut, &ni);
} else if (it->is_rdlock()) {
rdlock_finish(it, mut, &ni);
}
if (ni) {
set<CInode*> need_issue;
need_issue.insert(static_cast<CInode*>(lock->get_parent()));
Expand Down Expand Up @@ -1841,6 +1877,7 @@ void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut)
switch (lock->get_type()) {
case CEPH_LOCK_DVERSION:
case CEPH_LOCK_IVERSION:
case CEPH_LOCK_IQUIESCE:
return local_wrlock_grab(static_cast<LocalLockC*>(lock), mut);
default:
break;
Expand Down Expand Up @@ -1892,6 +1929,7 @@ bool Locker::wrlock_start(const MutationImpl::LockOp &op, const MDRequestRef& mu
switch (lock->get_type()) {
case CEPH_LOCK_DVERSION:
case CEPH_LOCK_IVERSION:
case CEPH_LOCK_IQUIESCE:
return local_wrlock_start(static_cast<LocalLockC*>(lock), mut);
default:
break;
Expand Down Expand Up @@ -1957,6 +1995,7 @@ void Locker::wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *
switch (lock->get_type()) {
case CEPH_LOCK_DVERSION:
case CEPH_LOCK_IVERSION:
case CEPH_LOCK_IQUIESCE:
return local_wrlock_finish(it, mut);
default:
break;
Expand Down Expand Up @@ -2045,6 +2084,7 @@ bool Locker::xlock_start(SimpleLock *lock, const MDRequestRef& mut)
switch (lock->get_type()) {
case CEPH_LOCK_DVERSION:
case CEPH_LOCK_IVERSION:
case CEPH_LOCK_IQUIESCE:
return local_xlock_start(static_cast<LocalLockC*>(lock), mut);
default:
break;
Expand Down Expand Up @@ -2171,6 +2211,7 @@ void Locker::xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *m
switch (lock->get_type()) {
case CEPH_LOCK_DVERSION:
case CEPH_LOCK_IVERSION:
case CEPH_LOCK_IQUIESCE:
return local_xlock_finish(it, mut);
default:
break;
Expand Down Expand Up @@ -5548,6 +5589,14 @@ void Locker::local_xlock_finish(const MutationImpl::lock_iterator& it, MutationI
lock->put_xlock();
mut->locks.erase(it);

if (lock->get_type() == CEPH_LOCK_IQUIESCE) {
auto in = static_cast<CInode*>(lock->get_parent());
set<CInode*> issue_set { in };
// reevaluate everything related related to caps
eval_cap_gather(in, &issue_set);
issue_caps_set(issue_set);
}

lock->finish_waiters(SimpleLock::WAIT_STABLE |
SimpleLock::WAIT_WR |
SimpleLock::WAIT_RD);
Expand Down
2 changes: 1 addition & 1 deletion src/mds/Locker.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class Locker {
void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false);
void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
void drop_rdlocks_for_early_reply(MutationImpl *mut);
void drop_rdlock(MutationImpl* mut, SimpleLock* what);
void drop_lock(MutationImpl* mut, SimpleLock* what);
void drop_locks_for_fragment_unfreeze(MutationImpl *mut);

int get_cap_bit_for_lock_cache(int op);
Expand Down
Loading

0 comments on commit b7d4f96

Please sign in to comment.