From f9009efac49c830460f55b9f6c08ee0d76f31b0d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:44:59 -0400 Subject: ceph: add dentry lease metric support For dentry leases, only count the hit/miss info triggered from the vfs calls. For the cases like request reply handling and ceph_trim_dentries, ignore them. For now, these are only viewable using debugfs. Future patches will allow the client to send the stats to the MDS. The output looks like: item total miss hit ------------------------------------------------- d_lease 11 7 141 URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/Makefile | 2 +- fs/ceph/debugfs.c | 33 +++++++++++++++++++++++++++++---- fs/ceph/dir.c | 12 ++++++++++++ fs/ceph/mds_client.c | 16 ++++++++++++++-- fs/ceph/mds_client.h | 4 ++++ fs/ceph/metric.c | 35 +++++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 17 +++++++++++++++++ fs/ceph/super.h | 1 + 8 files changed, 113 insertions(+), 7 deletions(-) create mode 100644 fs/ceph/metric.c create mode 100644 fs/ceph/metric.h (limited to 'fs') diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 0a0823d378db..50c635dc7f71 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ - debugfs.o util.o + debugfs.o util.o metric.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index dcaed75de9e6..1e7ea58f546a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -124,6 +124,23 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +static int metric_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client_metric *m = &mdsc->metric; + + seq_printf(s, "item total miss hit\n"); + seq_printf(s, "-------------------------------------------------\n"); + + seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease", + atomic64_read(&m->total_dentries), + percpu_counter_sum(&m->d_lease_mis), + percpu_counter_sum(&m->d_lease_hit)); + + return 0; +} + static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) { struct seq_file *s = p; @@ -222,6 +239,7 @@ DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(mds_sessions); +DEFINE_SHOW_ATTRIBUTE(metric); /* @@ -255,6 +273,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_mdsmap); debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_metric); debugfs_remove(fsc->debugfs_mdsc); } @@ -295,11 +314,17 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) fsc, &mdsc_fops); + fsc->debugfs_metric = debugfs_create_file("metrics", + 0400, + fsc->client->debugfs_dir, + fsc, + &metric_fops); + fsc->debugfs_caps = debugfs_create_file("caps", - 0400, - fsc->client->debugfs_dir, - fsc, - &caps_fops); + 0400, + fsc->client->debugfs_dir, + fsc, + &caps_fops); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4c4202c93b71..44cf85136ad5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,6 +38,8 @@ static int __dir_lease_try_check(const struct dentry *dentry); static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; + struct ceph_fs_client *fsc = 
ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) @@ -48,6 +50,9 @@ static int ceph_d_init(struct dentry *dentry) di->time = jiffies; dentry->d_fsdata = di; INIT_LIST_HEAD(&di->lease_list); + + atomic64_inc(&mdsc->metric.total_dentries); + return 0; } @@ -1709,6 +1714,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; + percpu_counter_inc(&mdsc->metric.d_lease_mis); + op = ceph_snap(dir) == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); @@ -1740,6 +1747,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) dout("d_revalidate %p lookup result=%d\n", dentry, err); } + } else { + percpu_counter_inc(&mdsc->metric.d_lease_hit); } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); @@ -1782,9 +1791,12 @@ static int ceph_d_delete(const struct dentry *dentry) static void ceph_d_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); dout("d_release %p\n", dentry); + atomic64_dec(&fsc->mdsc->metric.total_dentries); + spin_lock(&dentry->d_lock); __dentry_lease_unlist(di); dentry->d_fsdata = NULL; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7c63abf5bea9..39a27c56411a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4323,6 +4323,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc; + int err; mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); if (!mdsc) @@ -4331,8 +4332,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (!mdsc->mdsmap) { - kfree(mdsc); - return -ENOMEM; + err = -ENOMEM; + goto err_mdsc; } fsc->mdsc = mdsc; @@ -4371,6 +4372,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); atomic_set(&mdsc->cap_reclaim_pending, 0); + err = ceph_metric_init(&mdsc->metric); + if (err) + goto err_mdsmap; spin_lock_init(&mdsc->dentry_list_lock); INIT_LIST_HEAD(&mdsc->dentry_leases); @@ -4389,6 +4393,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); return 0; + +err_mdsmap: + kfree(mdsc->mdsmap); +err_mdsc: + kfree(mdsc); + return err; } /* @@ -4646,6 +4656,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_mdsc_stop(mdsc); + ceph_metric_destroy(&mdsc->metric); + fsc->mdsc = NULL; kfree(mdsc); dout("mdsc_destroy %p done\n", mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 903d9edfd4bf..605995ae3718 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -16,6 +16,8 @@ #include #include +#include "metric.h" + /* The first 8 bits are reserved for old ceph releases */ enum ceph_feature_type { CEPHFS_FEATURE_MIMIC = 8, @@ -454,6 +456,8 @@ struct ceph_mds_client { struct list_head dentry_leases; /* fifo list */ struct list_head dentry_dir_leases; /* lru list */ + struct ceph_client_metric metric; + spinlock_t snapid_map_lock; struct rb_root snapid_map_tree; struct list_head snapid_map_lru; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c new file mode 100644 index 000000000000..873a3769fa0d --- /dev/null +++ b/fs/ceph/metric.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + 
+#include +#include + +#include "metric.h" + +int ceph_metric_init(struct ceph_client_metric *m) +{ + int ret; + + if (!m) + return -EINVAL; + + atomic64_set(&m->total_dentries, 0); + ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); + if (ret) + return ret; + ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); + if (ret) { + percpu_counter_destroy(&m->d_lease_hit); + return ret; + } + + return 0; +} + +void ceph_metric_destroy(struct ceph_client_metric *m) +{ + if (!m) + return; + + percpu_counter_destroy(&m->d_lease_mis); + percpu_counter_destroy(&m->d_lease_hit); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h new file mode 100644 index 000000000000..da725dacabfe --- /dev/null +++ b/fs/ceph/metric.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_MDS_METRIC_H +#define _FS_CEPH_MDS_METRIC_H + +#include +#include + +/* This is the global metrics */ +struct ceph_client_metric { + atomic64_t total_dentries; + struct percpu_counter d_lease_hit; + struct percpu_counter d_lease_mis; +}; + +extern int ceph_metric_init(struct ceph_client_metric *m); +extern void ceph_metric_destroy(struct ceph_client_metric *m); +#endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 60aac3aee055..5c73cf1ca5ff 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -128,6 +128,7 @@ struct ceph_fs_client { struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; + struct dentry *debugfs_metric; struct dentry *debugfs_mds_sessions; #endif -- cgit v1.2.3 From 1af16d547f3080d71060092d22e79a34527d1d08 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:00 -0400 Subject: ceph: add caps perf metric for each superblock Count hits and misses in the caps cache. If the client has all of the necessary caps when a task needs references, then it's counted as a hit. Any other situation is a miss. URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/acl.c | 2 +- fs/ceph/caps.c | 19 +++++++++++++++++++ fs/ceph/debugfs.c | 16 ++++++++++++++++ fs/ceph/dir.c | 5 +++-- fs/ceph/inode.c | 4 ++-- fs/ceph/metric.c | 26 ++++++++++++++++++++++---- fs/ceph/metric.h | 13 +++++++++++++ fs/ceph/super.h | 8 +++++--- fs/ceph/xattr.c | 4 ++-- 9 files changed, 83 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 26be6520d3fb..e0465741c591 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -22,7 +22,7 @@ static inline void ceph_set_cached_acl(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); spin_lock(&ci->i_ceph_lock); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + if (__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 0)) set_cached_acl(inode, type, acl); else forget_cached_acl(inode, type); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f1acde6fb9a6..82914866affc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -912,6 +912,20 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) return 0; } +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int touch) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + int r; + + r = __ceph_caps_issued_mask(ci, mask, touch); + if (r) + ceph_update_cap_hit(&fsc->mdsc->metric); + else + ceph_update_cap_mis(&fsc->mdsc->metric); + return r; +} + /* * Return true if mask caps are currently being revoked by an MDS. 
*/ @@ -2685,6 +2699,11 @@ out_unlock: if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); + if (!ret) + ceph_update_cap_mis(&mdsc->metric); + else if (ret == 1) + ceph_update_cap_hit(&mdsc->metric); + dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1e7ea58f546a..37b2775949ad 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -129,6 +129,7 @@ static int metric_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; + int i, nr_caps = 0; seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); @@ -138,6 +139,21 @@ static int metric_show(struct seq_file *s, void *p) percpu_counter_sum(&m->d_lease_mis), percpu_counter_sum(&m->d_lease_hit)); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s; + + s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + nr_caps += s->s_nr_caps; + ceph_put_mds_session(s); + } + mutex_unlock(&mdsc->mutex); + seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, + percpu_counter_sum(&m->i_caps_mis), + percpu_counter_sum(&m->i_caps_hit)); + return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 44cf85136ad5..93476d447a4b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -349,8 +349,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && - __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { int shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) @@ -767,7 +768,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, !is_root_ceph_dentry(dir, dentry) && ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && - (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7fef94fd1e55..357c937699d5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2288,8 +2288,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); - if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) - return 0; + if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) + return 0; mode = (mask & CEPH_STAT_RSTAT) ? 
USE_AUTH_MDS : USE_ANY_MDS; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 873a3769fa0d..2a4b7394eed4 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -16,13 +16,29 @@ int ceph_metric_init(struct ceph_client_metric *m) ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); if (ret) return ret; + ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); - if (ret) { - percpu_counter_destroy(&m->d_lease_hit); - return ret; - } + if (ret) + goto err_d_lease_mis; + + ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_hit; + + ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_mis; return 0; + +err_i_caps_mis: + percpu_counter_destroy(&m->i_caps_hit); +err_i_caps_hit: + percpu_counter_destroy(&m->d_lease_mis); +err_d_lease_mis: + percpu_counter_destroy(&m->d_lease_hit); + + return ret; } void ceph_metric_destroy(struct ceph_client_metric *m) @@ -30,6 +46,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m) if (!m) return; + percpu_counter_destroy(&m->i_caps_mis); + percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index da725dacabfe..098ee8a9adaf 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -10,8 +10,21 @@ struct ceph_client_metric { atomic64_t total_dentries; struct percpu_counter d_lease_hit; struct percpu_counter d_lease_mis; + + struct percpu_counter i_caps_hit; + struct percpu_counter i_caps_mis; }; extern int ceph_metric_init(struct ceph_client_metric *m); extern void ceph_metric_destroy(struct ceph_client_metric *m); + +static inline void ceph_update_cap_hit(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_hit); +} + +static inline void ceph_update_cap_mis(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_mis); +} #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5c73cf1ca5ff..47cfd8935b9c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -645,6 +645,8 @@ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int t); extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *cap); @@ -657,12 +659,12 @@ static inline int ceph_caps_issued(struct ceph_inode_info *ci) return issued; } -static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, - int touch) +static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, + int mask, int touch) { int r; spin_lock(&ci->i_ceph_lock); - r = __ceph_caps_issued_mask(ci, mask, touch); + r = __ceph_caps_issued_mask_metric(ci, mask, touch); spin_unlock(&ci->i_ceph_lock); return r; } diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 7b8a070a782d..71ee34d160c3 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -856,7 +856,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, if (ci->i_xattrs.version == 0 || !((req_mask & CEPH_CAP_XATTR_SHARED) || - __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); /* security module gets xattr while filling trace */ 
@@ -914,7 +914,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ci->i_xattrs.version, ci->i_xattrs.index_version); if (ci->i_xattrs.version == 0 || - !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { + !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) -- cgit v1.2.3 From 97e27aaa9a2cbd6238c66b3251d397e0eacc9968 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:01 -0400 Subject: ceph: add read/write latency metric support Calculate the latency for OSD read requests. Add a new r_end_stamp field to struct ceph_osd_request that will hold the time of that the reply was received. Use that to calculate the RTT for each call, and divide the sum of those by number of calls to get averate RTT. Keep a tally of RTT for OSD writes and number of calls to track average latency of OSD writes. URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 20 ++++++++++++++++ fs/ceph/debugfs.c | 43 ++++++++++++++++++++++++++++++++- fs/ceph/file.c | 30 +++++++++++++++++++++++ fs/ceph/metric.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 22 +++++++++++++++++ 5 files changed, 186 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f4678d98df7..01ad09733ac7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,10 +11,12 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" +#include "metric.h" #include #include @@ -216,6 +218,9 @@ static int ceph_sync_readpages(struct ceph_fs_client *fsc, if (!rc) rc = ceph_osdc_wait_request(osdc, req); + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + ceph_osdc_put_request(req); dout("readpages result %d\n", rc); return rc; @@ -299,6 +304,7 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; int rc = req->r_result <= 0 ? req->r_result : 0; int bytes = req->r_result >= 0 ? 
req->r_result : 0; @@ -336,6 +342,10 @@ unlock: put_page(page); bytes -= PAGE_SIZE; } + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + kfree(osd_data->pages); } @@ -643,6 +653,9 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc, if (!rc) rc = ceph_osdc_wait_request(osdc, req); + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + ceph_osdc_put_request(req); if (rc == 0) rc = len; @@ -794,6 +807,9 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + /* * We lost the cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the @@ -1852,6 +1868,10 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + out_put: ceph_osdc_put_request(req); if (err == -ECANCELED) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 37b2775949ad..30acbc7f9acd 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -18,6 +20,7 @@ #ifdef CONFIG_DEBUG_FS #include "mds_client.h" +#include "metric.h" static int mdsmap_show(struct seq_file *s, void *p) { @@ -124,13 +127,51 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \ + s64 _total, _avg, _min, _max, _sq, _st; \ + _avg = ktime_to_us(avg); \ + _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ + _max = ktime_to_us(max); \ + _total = total - 1; \ + _sq = _total > 0 ? DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \ + _st = int_sqrt64(_sq); \ + _st = ktime_to_us(_st); \ + seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \ + name, total, _avg, _min, _max, _st); \ +} + static int metric_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; int i, nr_caps = 0; - + s64 total, sum, avg, min, max, sq; + + seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); + seq_printf(s, "-----------------------------------------------------------------------------------\n"); + + spin_lock(&m->read_latency_lock); + total = m->total_reads; + sum = m->read_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->read_latency_min; + max = m->read_latency_max; + sq = m->read_latency_sq_sum; + spin_unlock(&m->read_latency_lock); + CEPH_METRIC_SHOW("read", total, avg, min, max, sq); + + spin_lock(&m->write_latency_lock); + total = m->total_writes; + sum = m->write_latency_sum; + avg = total > 0 ? 
DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->write_latency_min; + max = m->write_latency_max; + sq = m->write_latency_sq_sum; + spin_unlock(&m->write_latency_lock); + CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + + seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index afdfca965a7f..160644ddaeed 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -11,11 +11,13 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" #include "io.h" +#include "metric.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -906,6 +908,12 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ret = ceph_osdc_start_request(osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + ret); + ceph_osdc_put_request(req); i_size = i_size_read(inode); @@ -1044,6 +1052,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); @@ -1051,6 +1061,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, osd_data->bvec_pos.iter.bi_size); + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + } + if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; BUG_ON(!aio_req->write); @@ -1179,6 +1199,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; struct ceph_vino vino; struct ceph_osd_request *req; struct bio_vec *bvecs; @@ -1295,6 +1316,13 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + size = i_size_read(inode); if (!write) { if (ret == -ENOENT) @@ -1466,6 +1494,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, ret); out: ceph_osdc_put_request(req); if (ret != 0) { diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 2a4b7394eed4..f5cf32e7789f 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -2,6 +2,7 @@ #include #include +#include #include "metric.h" @@ -29,6 +30,20 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; + spin_lock_init(&m->read_latency_lock); + m->read_latency_sq_sum = 0; + m->read_latency_min = KTIME_MAX; + m->read_latency_max 
= 0; + m->total_reads = 0; + m->read_latency_sum = 0; + + spin_lock_init(&m->write_latency_lock); + m->write_latency_sq_sum = 0; + m->write_latency_min = KTIME_MAX; + m->write_latency_max = 0; + m->total_writes = 0; + m->write_latency_sum = 0; + return 0; err_i_caps_mis: @@ -51,3 +66,60 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); } + +static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, + ktime_t *min, ktime_t *max, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t total, avg, sq, lsum; + + total = ++(*totalp); + lsum = (*lsump += lat); + + if (unlikely(lat < *min)) + *min = lat; + if (unlikely(lat > *max)) + *max = lat; + + if (unlikely(total == 1)) + return; + + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); + sq = lat - avg; + avg = DIV64_U64_ROUND_CLOSEST(lsum, total); + sq = sq * (lat - avg); + *sq_sump += sq; +} + +void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->read_latency_lock); + __update_latency(&m->total_reads, &m->read_latency_sum, + &m->read_latency_min, &m->read_latency_max, + &m->read_latency_sq_sum, lat); + spin_unlock(&m->read_latency_lock); +} + +void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->write_latency_lock); + __update_latency(&m->total_writes, &m->write_latency_sum, + &m->write_latency_min, &m->write_latency_max, + &m->write_latency_sq_sum, lat); + spin_unlock(&m->write_latency_lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 098ee8a9adaf..ab1543c70a75 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -4,6 +4,7 @@ #include #include +#include /* This is the global metrics */ struct ceph_client_metric { @@ -13,6 +14,20 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; + + spinlock_t read_latency_lock; + u64 total_reads; + ktime_t read_latency_sum; + ktime_t read_latency_sq_sum; + ktime_t read_latency_min; + ktime_t read_latency_max; + + spinlock_t write_latency_lock; + u64 total_writes; + ktime_t write_latency_sum; + ktime_t write_latency_sq_sum; + ktime_t write_latency_min; + ktime_t write_latency_max; }; extern int ceph_metric_init(struct ceph_client_metric *m); @@ -27,4 +42,11 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) { percpu_counter_inc(&m->i_caps_mis); } + +extern void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); +extern void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ -- cgit v1.2.3 From 70c948206f0616c7e46130a26165b6a5d98bade4 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:02 -0400 Subject: ceph: add metadata perf metric support Add a new "r_ended" field to struct ceph_mds_request and use that to maintain the average latency of MDS requests. 
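The metadata latency reuses the __update_latency() helper introduced by the previous patch: each sample updates a running total, sum, min, max and a sum of squared deviations, so the debugfs code can report an average and standard deviation without storing individual samples. A minimal user-space sketch of that recurrence (the struct name, field names and sample values are illustrative only, and plain truncating division stands in for DIV64_U64_ROUND_CLOSEST):

    #include <stdint.h>
    #include <stdio.h>

    struct lat_metric {
        uint64_t total;
        int64_t sum;
        int64_t min;
        int64_t max;
        int64_t sq_sum;   /* running sum of squared deviations */
    };

    static void update_latency(struct lat_metric *m, int64_t lat)
    {
        int64_t old_avg, new_avg;

        m->total++;
        m->sum += lat;
        if (lat < m->min)
            m->min = lat;
        if (lat > m->max)
            m->max = lat;
        if (m->total == 1)
            return;
        /* same step as __update_latency(): (lat - old_avg) * (lat - new_avg) */
        old_avg = (m->sum - lat) / (int64_t)(m->total - 1);
        new_avg = m->sum / (int64_t)m->total;
        m->sq_sum += (lat - old_avg) * (lat - new_avg);
    }

    int main(void)
    {
        struct lat_metric m = { .min = INT64_MAX };
        const int64_t samples_us[] = { 120, 80, 95, 200, 150 };
        size_t i;

        for (i = 0; i < sizeof(samples_us) / sizeof(samples_us[0]); i++)
            update_latency(&m, samples_us[i]);

        printf("total=%llu avg=%lld min=%lld max=%lld sq_sum=%lld\n",
               (unsigned long long)m.total,
               (long long)(m.sum / (int64_t)m.total),
               (long long)m.min, (long long)m.max,
               (long long)m.sq_sum);
        return 0;
    }

Because each step adds (lat - old_avg) * (lat - new_avg), sq_sum accumulates the running sum of squared deviations, and sq_sum / (total - 1) is the quantity the CEPH_METRIC_SHOW() macro feeds to int_sqrt64() when printing the stdev column.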
URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 10 ++++++++++ fs/ceph/mds_client.c | 7 +++++++ fs/ceph/mds_client.h | 3 +++ fs/ceph/metric.c | 23 +++++++++++++++++++++++ fs/ceph/metric.h | 10 ++++++++++ 5 files changed, 53 insertions(+) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 30acbc7f9acd..070ed8481340 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -171,6 +171,16 @@ static int metric_show(struct seq_file *s, void *p) spin_unlock(&m->write_latency_lock); CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + spin_lock(&m->metadata_latency_lock); + total = m->total_metadatas; + sum = m->metadata_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->metadata_latency_min; + max = m->metadata_latency_max; + sq = m->metadata_latency_sq_sum; + spin_unlock(&m->metadata_latency_lock); + CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); + seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 39a27c56411a..20fab5c72d39 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -2201,6 +2202,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) mutex_init(&req->r_fill_mutex); req->r_mdsc = mdsc; req->r_started = jiffies; + req->r_start_latency = ktime_get(); req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); @@ -2547,6 +2549,8 @@ out: static void complete_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + req->r_end_latency = ktime_get(); + if (req->r_callback) req->r_callback(mdsc, req); complete_all(&req->r_completion); @@ -3155,6 +3159,9 @@ out_err: /* kick calling process */ complete_request(mdsc, req); + + ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, + req->r_end_latency, err); out: ceph_mdsc_put_request(req); return; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 605995ae3718..cceeb33f2ff9 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -299,6 +300,8 @@ struct ceph_mds_request { unsigned long r_timeout; /* optional. 
jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_start_latency; /* start time to measure latency */ + unsigned long r_end_latency; /* finish time to measure latency */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index f5cf32e7789f..9217f35bc2b9 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -44,6 +44,13 @@ int ceph_metric_init(struct ceph_client_metric *m) m->total_writes = 0; m->write_latency_sum = 0; + spin_lock_init(&m->metadata_latency_lock); + m->metadata_latency_sq_sum = 0; + m->metadata_latency_min = KTIME_MAX; + m->metadata_latency_max = 0; + m->total_metadatas = 0; + m->metadata_latency_sum = 0; + return 0; err_i_caps_mis: @@ -123,3 +130,19 @@ void ceph_update_write_latency(struct ceph_client_metric *m, &m->write_latency_sq_sum, lat); spin_unlock(&m->write_latency_lock); } + +void ceph_update_metadata_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ENOENT)) + return; + + spin_lock(&m->metadata_latency_lock); + __update_latency(&m->total_metadatas, &m->metadata_latency_sum, + &m->metadata_latency_min, &m->metadata_latency_max, + &m->metadata_latency_sq_sum, lat); + spin_unlock(&m->metadata_latency_lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index ab1543c70a75..ccd81285a450 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -28,6 +28,13 @@ struct ceph_client_metric { ktime_t write_latency_sq_sum; ktime_t write_latency_min; ktime_t write_latency_max; + + spinlock_t metadata_latency_lock; + u64 total_metadatas; + ktime_t metadata_latency_sum; + ktime_t metadata_latency_sq_sum; + ktime_t metadata_latency_min; + ktime_t metadata_latency_max; }; extern int ceph_metric_init(struct ceph_client_metric *m); @@ -49,4 +56,7 @@ extern void ceph_update_read_latency(struct ceph_client_metric *m, extern void ceph_update_write_latency(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); +extern void ceph_update_metadata_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ -- cgit v1.2.3 From 0a454bdd501ad1aa30bb72e9581efa338ad6ce5c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 17 Mar 2020 08:47:31 -0400 Subject: ceph: reorganize __send_cap for less spinlock abuse Get rid of the __releases annotation by breaking it up into two functions: __prep_cap which is done under the spinlock and __send_cap that is done outside it. Add new fields to cap_msg_args for the wake boolean and old_xattr_buf pointer. Nothing checks the return value from __send_cap, so make it void return. 
Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 165 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 82914866affc..8b17f4b4ef7c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1181,6 +1181,7 @@ struct cap_msg_args { u64 xattr_version; u64 change_attr; struct ceph_buffer *xattr_buf; + struct ceph_buffer *old_xattr_buf; struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; @@ -1189,6 +1190,7 @@ struct cap_msg_args { kgid_t gid; umode_t mode; bool inline_data; + bool wake; }; /* @@ -1318,44 +1320,29 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) } /* - * Send a cap msg on the given inode. Update our caps state, then - * drop i_ceph_lock and send the message. + * Prepare to send a cap message to an MDS. Update the cap state, and populate + * the arg struct with the parameters that will need to be sent. This should + * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. - * - * Return non-zero if delayed release, or we experienced an error - * such that the caller should requeue + retry later. - * - * called with i_ceph_lock, then drops it. - * caller should hold snap_rwsem (read), s_mutex. */ -static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, - int op, int flags, int used, int want, int retain, - int flushing, u64 flush_tid, u64 oldest_flush_tid) - __releases(cap->ci->i_ceph_lock) +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, + int op, int flags, int used, int want, int retain, + int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; - struct ceph_buffer *old_blob = NULL; - struct cap_msg_args arg; int held, revoking; - int wake = 0; - int ret; - /* Don't send anything if it's still being created. Return delayed */ - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - spin_unlock(&ci->i_ceph_lock); - dout("%s async create in flight for %p\n", __func__, inode); - return 1; - } + lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", - inode, cap, cap->session, + dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", + __func__, inode, cap, cap->session, ceph_cap_string(held), ceph_cap_string(held & retain), ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); @@ -1363,60 +1350,58 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ - if (cap->implemented & ~cap->issued) { - /* - * Wake up any waiters on wanted -> needed transition. - * This is due to the weird transition from buffered - * to sync IO... we need to flush dirty pages _before_ - * allowing sync writes to avoid reordering. - */ - wake = 1; - } + /* + * Wake up any waiters on wanted -> needed transition. This is due to + * the weird transition from buffered to sync IO... we need to flush + * dirty pages _before_ allowing sync writes to avoid reordering. 
+ */ + arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; - arg.session = cap->session; - arg.ino = ceph_vino(inode).ino; - arg.cid = cap->cap_id; - arg.follows = flushing ? ci->i_head_snapc->seq : 0; - arg.flush_tid = flush_tid; - arg.oldest_flush_tid = oldest_flush_tid; + arg->session = cap->session; + arg->ino = ceph_vino(inode).ino; + arg->cid = cap->cap_id; + arg->follows = flushing ? ci->i_head_snapc->seq : 0; + arg->flush_tid = flush_tid; + arg->oldest_flush_tid = oldest_flush_tid; - arg.size = inode->i_size; - ci->i_reported_size = arg.size; - arg.max_size = ci->i_wanted_max_size; + arg->size = inode->i_size; + ci->i_reported_size = arg->size; + arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) - ci->i_requested_max_size = arg.max_size; + ci->i_requested_max_size = arg->max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { - old_blob = __ceph_build_xattrs_blob(ci); - arg.xattr_version = ci->i_xattrs.version; - arg.xattr_buf = ci->i_xattrs.blob; + arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); + arg->xattr_version = ci->i_xattrs.version; + arg->xattr_buf = ci->i_xattrs.blob; } else { - arg.xattr_buf = NULL; + arg->xattr_buf = NULL; + arg->old_xattr_buf = NULL; } - arg.mtime = inode->i_mtime; - arg.atime = inode->i_atime; - arg.ctime = inode->i_ctime; - arg.btime = ci->i_btime; - arg.change_attr = inode_peek_iversion_raw(inode); + arg->mtime = inode->i_mtime; + arg->atime = inode->i_atime; + arg->ctime = inode->i_ctime; + arg->btime = ci->i_btime; + arg->change_attr = inode_peek_iversion_raw(inode); - arg.op = op; - arg.caps = cap->implemented; - arg.wanted = want; - arg.dirty = flushing; + arg->op = op; + arg->caps = cap->implemented; + arg->wanted = want; + arg->dirty = flushing; - arg.seq = cap->seq; - arg.issue_seq = cap->issue_seq; - arg.mseq = cap->mseq; - arg.time_warp_seq = ci->i_time_warp_seq; + arg->seq = cap->seq; + arg->issue_seq = cap->issue_seq; + arg->mseq = cap->mseq; + arg->time_warp_seq = ci->i_time_warp_seq; - arg.uid = inode->i_uid; - arg.gid = inode->i_gid; - arg.mode = inode->i_mode; + arg->uid = inode->i_uid; + arg->gid = inode->i_gid; + arg->mode = inode->i_mode; - arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && !list_empty(&ci->i_cap_snaps)) { struct ceph_cap_snap *capsnap; @@ -1429,27 +1414,35 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } } } - arg.flags = flags; - - spin_unlock(&ci->i_ceph_lock); + arg->flags = flags; +} - ceph_buffer_put(old_blob); +/* + * Send a cap msg on the given inode. + * + * Caller should hold snap_rwsem (read), s_mutex. 
+ */ +static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg, + struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + int ret; - ret = send_cap_msg(&arg); + ret = send_cap_msg(arg); if (ret < 0) { pr_err("error sending cap msg, ino (%llx.%llx) " "flushing %s tid %llu, requeue\n", - ceph_vinop(inode), ceph_cap_string(flushing), - flush_tid); + ceph_vinop(inode), ceph_cap_string(arg->dirty), + arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } - if (wake) - wake_up_all(&ci->i_cap_wq); + ceph_buffer_put(arg->old_xattr_buf); - return ret; + if (arg->wake) + wake_up_all(&ci->i_cap_wq); } static inline int __send_flush_snap(struct inode *inode, @@ -1470,6 +1463,7 @@ static inline int __send_flush_snap(struct inode *inode, arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; + arg.old_xattr_buf = NULL; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; @@ -1493,6 +1487,7 @@ static inline int __send_flush_snap(struct inode *inode, arg.inline_data = capsnap->inline_data; arg.flags = 0; + arg.wake = false; return send_cap_msg(&arg); } @@ -1967,6 +1962,8 @@ retry_locked: } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + struct cap_msg_args arg; + cap = rb_entry(p, struct ceph_cap, ci_node); /* avoid looping forever */ @@ -2094,9 +2091,12 @@ ack: mds = cap->mds; /* remember mds, so we don't repeat */ - /* __send_cap drops i_ceph_lock */ - __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(mdsc, &arg, ci); + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2135,6 +2135,7 @@ retry: retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; + struct cap_msg_args arg; if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); @@ -2162,11 +2163,13 @@ retry_locked: flush_tid = __mark_caps_flushing(inode, session, true, &oldest_flush_tid); - /* __send_cap drops i_ceph_lock */ - __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(mdsc, &arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = @@ -2368,15 +2371,19 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, first_tid = cf->tid + 1; if (cf->caps) { + struct cap_msg_args arg; + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, cap, cf->tid, ceph_cap_string(cf->caps)); - __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + __send_cap(mdsc, &arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, -- cgit v1.2.3 From 681ac634883ba162ce5c50c10120c8bf4df81574 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Mar 2020 15:29:34 -0400 Subject: ceph: split up __finish_cap_flush This function takes a mdsc argument or ci argument, but if both are passed in, it ignores the ci arg. 
Fortunately, nothing does that, but there's no good reason to have the same function handle both cases. Also, get rid of some branches and just use |= to set the wake_* vals. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 60 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8b17f4b4ef7c..74d05e5db68d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1740,30 +1740,33 @@ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) * Remove cap_flush from the mdsc's or inode's flushing cap list. * Return true if caller needs to wake up flush waiters. */ -static bool __finish_cap_flush(struct ceph_mds_client *mdsc, - struct ceph_inode_info *ci, - struct ceph_cap_flush *cf) +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; - if (mdsc) { - /* are there older pending cap flushes? */ - if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { - prev = list_prev_entry(cf, g_list); - prev->wake = true; - wake = false; - } - list_del(&cf->g_list); - } else if (ci) { - if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { - prev = list_prev_entry(cf, i_list); - prev->wake = true; - wake = false; - } - list_del(&cf->i_list); - } else { - BUG_ON(1); + + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { + prev = list_prev_entry(cf, g_list); + prev->wake = true; + wake = false; } + list_del(&cf->g_list); + return wake; +} + +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { + prev = list_prev_entry(cf, i_list); + prev->wake = true; + wake = false; + } + list_del(&cf->i_list); return wake; } @@ -3473,8 +3476,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, if (cf->caps == 0) /* capsnap */ continue; if (cf->tid <= flush_tid) { - if (__finish_cap_flush(NULL, ci, cf)) - wake_ci = true; + wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { cleaned &= ~cf->caps; @@ -3496,10 +3498,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, spin_lock(&mdsc->cap_dirty_lock); - list_for_each_entry(cf, &to_remove, i_list) { - if (__finish_cap_flush(mdsc, NULL, cf)) - wake_mdsc = true; - } + list_for_each_entry(cf, &to_remove, i_list) + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); if (ci->i_flushing_caps == 0) { if (list_empty(&ci->i_cap_flush_list)) { @@ -3591,17 +3591,15 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, dout(" removing %p cap_snap %p follows %lld\n", inode, capsnap, follows); list_del(&capsnap->ci_item); - if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) - wake_ci = true; + wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); spin_lock(&mdsc->cap_dirty_lock); if (list_empty(&ci->i_cap_flush_list)) list_del_init(&ci->i_flushing_item); - if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) - wake_mdsc = true; - + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, + &capsnap->cap_flush); spin_unlock(&mdsc->cap_dirty_lock); } spin_unlock(&ci->i_ceph_lock); -- cgit v1.2.3 From d7dbfb4f2bdb037758f46271f75ea6c8d35626b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Mar 2020 15:34:20 -0400 Subject: ceph: add comments for 
handle_cap_flush_ack logic Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 74d05e5db68d..056ad0d99438 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3471,14 +3471,26 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { + /* Is this the one that was flushed? */ if (cf->tid == flush_tid) cleaned = cf->caps; - if (cf->caps == 0) /* capsnap */ + + /* Is this a capsnap? */ + if (cf->caps == 0) continue; + if (cf->tid <= flush_tid) { + /* + * An earlier or current tid. The FLUSH_ACK should + * represent a superset of this flush's caps. + */ wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { + /* + * This is a later one. Any caps in it are still dirty + * so don't count them as cleaned. + */ cleaned &= ~cf->caps; if (!cleaned) break; -- cgit v1.2.3 From 7391fba2678c0288d0daf636ddc4f78de7704f81 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Mar 2020 16:43:30 -0400 Subject: ceph: don't release i_ceph_lock in handle_cap_trunc There's no reason to do this here. Just have the caller handle it. Also, add a lockdep assertion. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 056ad0d99438..34aa9f4c1780 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3631,10 +3631,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, * * caller hold s_mutex. */ -static void handle_cap_trunc(struct inode *inode, +static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session) - __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -3645,7 +3644,9 @@ static void handle_cap_trunc(struct inode *inode, int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); - int queue_trunc = 0; + bool queue_trunc = false; + + lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; @@ -3653,10 +3654,7 @@ static void handle_cap_trunc(struct inode *inode, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&ci->i_ceph_lock); - - if (queue_trunc) - ceph_queue_vmtruncate(inode); + return queue_trunc; } /* @@ -3905,6 +3903,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, size_t snaptrace_len; void *p, *end; struct cap_extra_info extra_info = {}; + bool queue_trunc; dout("handle_caps from mds%d\n", session->s_mds); @@ -4088,7 +4087,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; case CEPH_CAP_OP_TRUNC: - handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session); + spin_unlock(&ci->i_ceph_lock); + if (queue_trunc) + ceph_queue_vmtruncate(inode); break; default: -- cgit v1.2.3 From 7833323363233c75fd8d10b5ceefbb9515cb3e32 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Mar 2020 12:00:16 -0400 Subject: ceph: don't take i_ceph_lock in handle_cap_import Just take it before calling it. 
This means we have to do a couple of minor in-memory operations under the spinlock now, but those shouldn't be an issue. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 34aa9f4c1780..f135660a2479 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3805,7 +3805,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, struct ceph_cap **target_cap, int *old_issued) - __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap, *ocap, *new_cap = NULL; @@ -3830,14 +3829,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", inode, ci, mds, mseq, peer); - retry: - spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { if (!new_cap) { spin_unlock(&ci->i_ceph_lock); new_cap = ceph_get_cap(mdsc, NULL); + spin_lock(&ci->i_ceph_lock); goto retry; } cap = new_cap; @@ -4051,6 +4049,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, } else { down_read(&mdsc->snap_rwsem); } + spin_lock(&ci->i_ceph_lock); handle_cap_import(mdsc, inode, h, peer, session, &cap, &extra_info.issued); handle_cap_grant(inode, session, cap, -- cgit v1.2.3 From 4fb5dda39c26fa5b64059dac0a2c3340a4f6f11b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Mar 2020 15:34:13 -0400 Subject: ceph: document what protects i_dirty_item and i_flushing_item Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 47cfd8935b9c..bb372859c0ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -351,7 +351,9 @@ struct ceph_inode_info { struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ - struct list_head i_dirty_item, i_flushing_item; + struct list_head i_dirty_item, i_flushing_item; /* protected by + * mdsc->cap_dirty_lock + */ /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ -- cgit v1.2.3 From dc3da0461cc4b76f2d0c5b12247fcb3b520edbbf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Mar 2020 16:45:45 -0400 Subject: ceph: fix potential race in ceph_check_caps Nothing ensures that session will still be valid by the time we dereference the pointer. Take and put a reference. In principle, we should always be able to get a reference here, but throw a warning if that's ever not the case. 
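The fix follows the usual pin-then-unlock pattern: take a reference while the lock still guarantees the pointer is valid, drop the lock, then use and put the object; ceph_get_mds_session() returns NULL when the refcount has already hit zero, which is what the new WARN_ON_ONCE() guards against. A rough user-space analogue with a C11 atomic refcount (illustrative only, not the kernel API):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct session {
        atomic_int ref;
    };

    /* refuse to resurrect an object whose count already reached zero */
    static struct session *session_get(struct session *s)
    {
        int old = atomic_load(&s->ref);

        do {
            if (old == 0)
                return NULL;
        } while (!atomic_compare_exchange_weak(&s->ref, &old, old + 1));
        return s;
    }

    static void session_put(struct session *s)
    {
        if (atomic_fetch_sub(&s->ref, 1) == 1) {
            printf("last ref dropped, freeing\n");
            free(s);
        }
    }

    int main(void)
    {
        struct session *s = malloc(sizeof(*s));

        atomic_init(&s->ref, 1);

        /* ...lock held here, s known valid... */
        struct session *pinned = session_get(s);
        /* ...lock dropped here; pinned keeps s alive... */

        if (pinned) {
            printf("using session safely\n");
            session_put(pinned);
        } else {
            fprintf(stderr, "WARN: could not pin session\n");
        }

        session_put(s);   /* drop the original reference */
        return 0;
    }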
Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f135660a2479..d555d2790619 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2044,12 +2044,24 @@ ack: if (mutex_trylock(&session->s_mutex) == 0) { dout("inverting session/ino locks on %p\n", session); + session = ceph_get_mds_session(session); spin_unlock(&ci->i_ceph_lock); if (took_snap_rwsem) { up_read(&mdsc->snap_rwsem); took_snap_rwsem = 0; } - mutex_lock(&session->s_mutex); + if (session) { + mutex_lock(&session->s_mutex); + ceph_put_mds_session(session); + } else { + /* + * Because we take the reference while + * holding the i_ceph_lock, it should + * never be NULL. Throw a warning if it + * ever is. + */ + WARN_ON_ONCE(true); + } goto retry; } } -- cgit v1.2.3 From 88828190f0073bd8f9aa5e2b1caf753d289c6d49 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Mar 2020 17:07:36 -0400 Subject: ceph: throw a warning if we destroy session with mutex still locked Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 20fab5c72d39..de6bb8829837 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -659,6 +659,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s) if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); + WARN_ON(mutex_is_locked(&s->s_mutex)); xa_destroy(&s->s_delegated_inos); kfree(s); } -- cgit v1.2.3 From 6f05b30ea063a2a05dda47a4105a69267ae5270f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 30 Mar 2020 19:56:37 +0800 Subject: ceph: reset i_requested_max_size if file write is not wanted write can stuck at waiting for larger max_size in following sequence of events: - client opens a file and writes to position 'A' (larger than unit of max size increment) - client closes the file handle and updates wanted caps (not wanting file write caps) - client opens and truncates the file, writes to position 'A' again. At the 1st event, client set inode's requested_max_size to 'A'. At the 2nd event, mds removes client's writable range, but client does not reset requested_max_size. At the 3rd event, client does not request max size because requested_max_size is already larger than 'A'. 
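The stall comes from the way new max_size requests are suppressed: the client only asks the MDS again when the wanted size exceeds both the granted max_size and the value it already requested, so a stale i_requested_max_size silently swallows the third write's request. A toy model of that sequence (field names mirror the kernel's, but the check and the values are simplified for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    struct inode_state {
        long max_size;             /* granted by the MDS */
        long requested_max_size;   /* last value we asked for */
        long wanted_max_size;      /* what the current write needs */
    };

    static bool need_max_size_request(const struct inode_state *ci)
    {
        return ci->wanted_max_size > ci->max_size &&
               ci->wanted_max_size > ci->requested_max_size;
    }

    int main(void)
    {
        struct inode_state ci = { 0, 0, 0 };

        /* 1) write at position A: a request is sent and remembered */
        ci.wanted_max_size = 8 * 1024 * 1024;
        printf("write 1 sends request: %d\n", need_max_size_request(&ci));
        ci.requested_max_size = ci.wanted_max_size;
        ci.max_size = ci.wanted_max_size;      /* MDS grants it */

        /* 2) close: MDS removes the writable range */
        ci.max_size = 0;
        /* buggy behaviour: requested_max_size is left alone */

        /* 3) truncate and write at A again */
        ci.wanted_max_size = 8 * 1024 * 1024;
        printf("write 3, stale request value: %d\n",
               need_max_size_request(&ci));

        /* with the fix, dropping write caps resets requested_max_size */
        ci.requested_max_size = 0;
        printf("write 3, after the fix: %d\n", need_max_size_request(&ci));
        return 0;
    }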
Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d555d2790619..53db1fcbfdd3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1369,8 +1369,12 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->size = inode->i_size; ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; - if (cap == ci->i_auth_cap) - ci->i_requested_max_size = arg->max_size; + if (cap == ci->i_auth_cap) { + if (want & CEPH_CAP_ANY_FILE_WR) + ci->i_requested_max_size = arg->max_size; + else + ci->i_requested_max_size = 0; + } if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); @@ -3342,10 +3346,6 @@ static void handle_cap_grant(struct inode *inode, ci->i_requested_max_size = 0; } wake = true; - } else if (ci->i_wanted_max_size > ci->i_max_size && - ci->i_wanted_max_size > ci->i_requested_max_size) { - /* CEPH_CAP_OP_IMPORT */ - wake = true; } } @@ -3421,9 +3421,18 @@ static void handle_cap_grant(struct inode *inode, fill_inline = true; } - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap && + le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (newcaps & ~extra_info->issued) wake = true; + + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } + ceph_kick_flushing_inode_caps(session, ci); spin_unlock(&ci->i_ceph_lock); up_read(&session->s_mdsc->snap_rwsem); @@ -3882,9 +3891,6 @@ retry: __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } - /* make sure we re-request max_size, if necessary */ - ci->i_requested_max_size = 0; - *old_issued = issued; *target_cap = cap; } @@ -4318,6 +4324,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode, cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; + if (cap == ci->i_auth_cap && + !(wanted & CEPH_CAP_ANY_FILE_WR)) + ci->i_requested_max_size = 0; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, -- cgit v1.2.3 From 1cf03a68e791b1673bc4daaa88a0820f34f538f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 1 Apr 2020 17:07:52 -0400 Subject: ceph: convert mdsc->cap_dirty to a per-session list This is a per-sb list now, but that makes it difficult to tell when the cap is the last dirty one associated with the session. Switch this to be a per-session list, but continue using the mdsc->cap_dirty_lock to protect the lists. This list is only ever walked in ceph_flush_dirty_caps, so change that to walk the sessions array and then flush the caps for inodes on each session's list. If the auth cap ever changes while the inode has dirty caps, then move the inode to the appropriate session for the new auth_cap. Also, ensure that we never remove an auth cap while the inode is still on the s_cap_dirty list. 
Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 78 +++++++++++++++++++++++++++++++++++++++++++--------- fs/ceph/mds_client.c | 2 +- fs/ceph/mds_client.h | 5 ++-- fs/ceph/super.h | 19 +++++++++++-- 4 files changed, 85 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 53db1fcbfdd3..d5ad2434bb07 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -597,6 +597,27 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, } } +/** + * change_auth_cap_ses - move inode to appropriate lists when auth caps change + * @ci: inode to be moved + * @session: new auth caps session + */ +static void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) +{ + lockdep_assert_held(&ci->i_ceph_lock); + + if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) + return; + + spin_lock(&session->s_mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) + list_move(&ci->i_dirty_item, &session->s_cap_dirty); + if (!list_empty(&ci->i_flushing_item)) + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + spin_unlock(&session->s_mdsc->cap_dirty_lock); +} + /* * Add a capability under the given MDS session. * @@ -727,6 +748,9 @@ void ceph_add_cap(struct inode *inode, if (flags & CEPH_CAP_FLAG_AUTH) { if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + if (ci->i_auth_cap && + ci->i_auth_cap->session != cap->session) + change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; cap->mds_wanted = wanted; } @@ -1123,8 +1147,10 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) + if (ci->i_auth_cap == cap) { + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); ci->i_auth_cap = NULL; + } /* remove from session list */ spin_lock(&session->s_cap_lock); @@ -1689,6 +1715,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { + struct ceph_mds_session *session = ci->i_auth_cap->session; + WARN_ON_ONCE(ci->i_prealloc_cap_flush); swap(ci->i_prealloc_cap_flush, *pcf); @@ -1701,7 +1729,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -3749,15 +3777,9 @@ retry: tcap->issue_seq = t_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; - if (cap == ci->i_auth_cap) + if (cap == ci->i_auth_cap) { ci->i_auth_cap = tcap; - - if (!list_empty(&ci->i_cap_flush_list) && - ci->i_auth_cap == tcap) { - spin_lock(&mdsc->cap_dirty_lock); - list_move_tail(&ci->i_flushing_item, - &tcap->session->s_cap_flushing); - spin_unlock(&mdsc->cap_dirty_lock); + change_auth_cap_ses(ci, tcap->session); } } __ceph_remove_cap(cap, false); @@ -4176,15 +4198,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) /* * Flush all dirty caps to the mds */ -void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +static void flush_dirty_session_caps(struct ceph_mds_session *s) { + struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_inode_info *ci; struct inode *inode; dout("flush_dirty_caps\n"); 
spin_lock(&mdsc->cap_dirty_lock); - while (!list_empty(&mdsc->cap_dirty)) { - ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + while (!list_empty(&s->s_cap_dirty)) { + ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->vfs_inode; ihold(inode); @@ -4198,6 +4221,35 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } +static void iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *)) +{ + int mds; + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; ++mds) { + struct ceph_mds_session *s; + + if (!mdsc->sessions[mds]) + continue; + + s = ceph_get_mds_session(mdsc->sessions[mds]); + if (!s) + continue; + + mutex_unlock(&mdsc->mutex); + cb(s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + iterate_sessions(mdsc, flush_dirty_session_caps); +} + void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index de6bb8829837..588221b9b3d0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -755,6 +755,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_cap_releases); INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); + INIT_LIST_HEAD(&s->s_cap_dirty); INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; @@ -4373,7 +4374,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; INIT_LIST_HEAD(&mdsc->cap_flush_list); - INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cceeb33f2ff9..788182adcc51 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -199,8 +199,10 @@ struct ceph_mds_session { struct list_head s_cap_releases; /* waiting cap_release messages */ struct work_struct s_cap_release_work; - /* protected by mutex */ + /* both protected by s_mdsc->cap_dirty_lock */ + struct list_head s_cap_dirty; /* inodes w/ dirty caps */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; @@ -424,7 +426,6 @@ struct ceph_mds_client { u64 last_cap_flush_tid; struct list_head cap_flush_list; - struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index bb372859c0ad..3a95395a4217 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -351,9 +351,22 @@ struct ceph_inode_info { struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ - struct list_head i_dirty_item, i_flushing_item; /* protected by - * mdsc->cap_dirty_lock - */ + + /* + * Link to the the auth cap's session's s_cap_dirty list. s_cap_dirty + * is protected by the mdsc->cap_dirty_lock, but each individual item + * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty + * requires the mdsc->cap_dirty_lock. 
List presence for an item can + * be tested under the i_ceph_lock. Changing anything requires both. + */ + struct list_head i_dirty_item; + + /* + * Link to session's s_cap_flushing list. Protected by + * mdsc->cap_dirty_lock. + */ + struct list_head i_flushing_item; + /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ -- cgit v1.2.3 From d67c72e6cce99eab5ab9d62c599e33e5141ff8b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 1 Apr 2020 18:27:25 -0400 Subject: ceph: request expedited service on session's last cap flush When flushing a lot of caps to the MDS's at once (e.g. for syncfs), we can end up waiting a substantial amount of time for MDS replies, due to the fact that it may delay some of them so that it can batch them up together in a single journal transaction. This can lead to stalls when calling sync or syncfs. What we'd really like to do is request expedited service on the _last_ cap we're flushing back to the server. If the CHECK_CAPS_FLUSH flag is set on the request and the current inode was the last one on the session->s_cap_dirty list, then mark the request with CEPH_CLIENT_CAPS_SYNC. Note that this heuristic is not perfect. New inodes can race onto the list after we've started flushing, but it does seem to fix some common use cases. URL: https://tracker.ceph.com/issues/44744 Reported-by: Jan Fajerski Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d5ad2434bb07..2558fd14126a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1997,6 +1997,7 @@ retry_locked: } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + int mflags = 0; struct cap_msg_args arg; cap = rb_entry(p, struct ceph_cap, ci_node); @@ -2128,6 +2129,9 @@ ack: flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, false, &oldest_flush_tid); + if (flags & CHECK_CAPS_FLUSH && + list_empty(&session->s_cap_dirty)) + mflags |= CEPH_CLIENT_CAPS_SYNC; } else { flushing = 0; flush_tid = 0; @@ -2138,8 +2142,8 @@ ack: mds = cap->mds; /* remember mds, so we don't repeat */ - __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, - retain, flushing, flush_tid, oldest_flush_tid); + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, + want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(mdsc, &arg, ci); -- cgit v1.2.3 From 829ad4db952aac86d11a62647d2516ab46c2fcd2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Apr 2020 13:09:07 -0400 Subject: ceph: ceph_kick_flushing_caps needs the s_mutex The mdsc->cap_dirty_lock is not held while walking the list in ceph_kick_flushing_caps, which is not safe. ceph_early_kick_flushing_caps does something similar, but the s_mutex is held while it's called and I think that guards against changes to the list. Ensure we hold the s_mutex when calling ceph_kick_flushing_caps, and add some clarifying comments. 
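A small sketch, assuming nothing beyond what the diff below shows, of the calling convention this change enforces: ceph_kick_flushing_caps() now expects the session mutex to be held, so callers wrap it in s_mutex. The wrapper name is hypothetical.

static void kick_flushing_caps_locked(struct ceph_mds_client *mdsc,
                                      struct ceph_mds_session *s)
{
        mutex_lock(&s->s_mutex);
        ceph_kick_flushing_caps(mdsc, s);       /* asserts s_mutex via lockdep */
        mutex_unlock(&s->s_mutex);
}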
Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 4 +++- fs/ceph/super.h | 7 +++++-- 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2558fd14126a..41eb999dadf0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2518,6 +2518,8 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; u64 oldest_flush_tid; + lockdep_assert_held(&session->s_mutex); + dout("kick_flushing_caps mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 588221b9b3d0..6c283c52d401 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4024,7 +4024,9 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate != CEPH_MDS_STATE_STARTING) pr_info("mds%d recovery completed\n", s->s_mds); kick_requests(mdsc, i); + mutex_lock(&s->s_mutex); ceph_kick_flushing_caps(mdsc, s); + mutex_unlock(&s->s_mutex); wake_up_session_caps(s, RECONNECT); } } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 788182adcc51..43111e408fa2 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -199,8 +199,10 @@ struct ceph_mds_session { struct list_head s_cap_releases; /* waiting cap_release messages */ struct work_struct s_cap_release_work; - /* both protected by s_mdsc->cap_dirty_lock */ + /* See ceph_inode_info->i_dirty_item. */ struct list_head s_cap_dirty; /* inodes w/ dirty caps */ + + /* See ceph_inode_info->i_flushing_item. */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3a95395a4217..b82f82d8213a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -362,8 +362,11 @@ struct ceph_inode_info { struct list_head i_dirty_item; /* - * Link to session's s_cap_flushing list. Protected by - * mdsc->cap_dirty_lock. + * Link to session's s_cap_flushing list. Protected in a similar + * fashion to i_dirty_item, but also by the s_mutex for changes. The + * s_cap_flushing list can be walked while holding either the s_mutex + * or mdsc->cap_dirty_lock. List presence can also be checked while + * holding the i_ceph_lock for this inode. */ struct list_head i_flushing_item; -- cgit v1.2.3 From daa668fbacfdb9147b1828fc0f97b685bf4a51e3 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 7 Apr 2020 11:30:19 +0100 Subject: ceph: normalize 'delta' parameter usage in check_quota_exceeded Function check_quota_exceeded() uses the delta parameter only for the QUOTA_CHECK_MAX_BYTES_OP operation. Using this parameter for MAX_FILES as well makes the code cleaner and will be required to support cross-quota-tree renames.
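A hedged sketch of the normalized convention: both quota operations now take the prospective increment in delta, so creating one file passes 1 and growing a file passes the number of new bytes. The two wrappers below are illustrative only; check_quota_exceeded() and the op constants are the ones in the code being patched, and its exact prototype is assumed from the call sites in the diff.

static bool quota_blocks_new_file(struct inode *dir)
{
        /* one more inode is about to appear under @dir */
        return check_quota_exceeded(dir, QUOTA_CHECK_MAX_FILES_OP, 1);
}

static bool quota_blocks_growth(struct inode *inode, loff_t delta)
{
        /* @delta bytes beyond what is currently accounted */
        return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, delta);
}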
Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 19507e2fdb57..7377838d3f8b 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -361,8 +361,6 @@ restart: spin_unlock(&ci->i_ceph_lock); switch (op) { case QUOTA_CHECK_MAX_FILES_OP: - exceeded = (max && (rvalue >= max)); - break; case QUOTA_CHECK_MAX_BYTES_OP: exceeded = (max && (rvalue + delta > max)); break; @@ -417,7 +415,7 @@ bool ceph_quota_is_max_files_exceeded(struct inode *inode) WARN_ON(!S_ISDIR(inode->i_mode)); - return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0); + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1); } /* -- cgit v1.2.3 From dffdcd71458e699e839f0bf47c3d42d64210b939 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 7 Apr 2020 11:30:20 +0100 Subject: ceph: allow rename operation under different quota realms Returning -EXDEV when trying to 'mv' files/directories from different quota realms results in copy+unlink operations instead of the faster CEPH_MDS_OP_RENAME. This will occur even when there aren't any quotas set in the destination directory, or if there's enough space left for the new file(s). This patch adds a new helper function to be called on rename operations which will allow these operations if they can be executed. This patch mimics userland fuse client commit b8954e5734b3 ("client: optimize rename operation under different quota root"). Since ceph_quota_is_same_realm() is now called only from this new helper, make it static. URL: https://tracker.ceph.com/issues/44791 Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 9 +++++---- fs/ceph/quota.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ceph/super.h | 3 ++- 3 files changed, 64 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 93476d447a4b..39f5311404b0 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1209,11 +1209,12 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, op = CEPH_MDS_OP_RENAMESNAP; else return -EROFS; + } else if (old_dir != new_dir) { + err = ceph_quota_check_rename(mdsc, d_inode(old_dentry), + new_dir); + if (err) + return err; } - /* don't allow cross-quota renames */ - if ((old_dir != new_dir) && - (!ceph_quota_is_same_realm(old_dir, new_dir))) - return -EXDEV; dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 7377838d3f8b..198ddde5c1e6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -264,7 +264,7 @@ restart: return NULL; } -bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) { struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc; struct ceph_snap_realm *old_realm, *new_realm; @@ -516,3 +516,59 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) return is_updated; } +/* + * ceph_quota_check_rename - check if a rename can be executed + * @mdsc: MDS client instance + * @old: inode to be copied + * @new: destination inode (directory) + * + * This function verifies if a rename (e.g. moving a file or directory) can be + * executed. It forces an rstat update in the @new target directory (and in the + * source @old as well, if it's a directory). 
The actual check is done both for + * max_files and max_bytes. + * + * This function returns 0 if it's OK to do the rename, or, if quotas are + * exceeded, -EXDEV (if @old is a directory) or -EDQUOT. + */ +int ceph_quota_check_rename(struct ceph_mds_client *mdsc, + struct inode *old, struct inode *new) +{ + struct ceph_inode_info *ci_old = ceph_inode(old); + int ret = 0; + + if (ceph_quota_is_same_realm(old, new)) + return 0; + + /* + * Get the latest rstat for target directory (and for source, if a + * directory) + */ + ret = ceph_do_getattr(new, CEPH_STAT_RSTAT, false); + if (ret) + return ret; + + if (S_ISDIR(old->i_mode)) { + ret = ceph_do_getattr(old, CEPH_STAT_RSTAT, false); + if (ret) + return ret; + ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, + ci_old->i_rbytes); + if (!ret) + ret = check_quota_exceeded(new, + QUOTA_CHECK_MAX_FILES_OP, + ci_old->i_rfiles + + ci_old->i_rsubdirs); + if (ret) + ret = -EXDEV; + } else { + ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, + i_size_read(old)); + if (!ret) + ret = check_quota_exceeded(new, + QUOTA_CHECK_MAX_FILES_OP, 1); + if (ret) + ret = -EDQUOT; + } + + return ret; +} diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b82f82d8213a..226f19c9042f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1210,13 +1210,14 @@ extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); -extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newlen); extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); +extern int ceph_quota_check_rename(struct ceph_mds_client *mdsc, + struct inode *old, struct inode *new); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ -- cgit v1.2.3 From 878dabb64117406abd40977b87544d05bb3031fc Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 18 May 2020 18:47:26 +0100 Subject: ceph: don't return -ESTALE if there's still an open file Similarly to commit 03f219041fdb ("ceph: check i_nlink while converting a file handle to dentry"), this fixes another corner case with name_to_handle_at/open_by_handle_at. The issue has been detected by xfstest generic/467, when doing: - name_to_handle_at("/cephfs/myfile") - open("/cephfs/myfile") - unlink("/cephfs/myfile") - sync; sync; - drop caches - open_by_handle_at() The call to open_by_handle_at should not fail because the file hasn't been deleted yet (only unlinked) and we do have a valid handle to it. -ESTALE shall be returned only if i_nlink is 0 *and* i_count is 1. This patch also makes sure we have LINK caps before checking i_nlink. 
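A minimal sketch of the staleness test the patch below adds: fetch LINK caps so i_nlink can be trusted, then treat the inode as stale only when it is unlinked and our lookup reference is the only remaining one. The helper is hypothetical and folds the getattr error handling into a boolean for brevity; the real code propagates the error instead.

static bool handle_points_to_stale_inode(struct inode *inode)
{
        /* We need LINK caps for a reliable i_nlink */
        if (ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false))
                return false;   /* can't tell; let the caller report the error */
        return inode->i_nlink == 0 && atomic_read(&inode->i_count) == 1;
}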
Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Acked-by: Amir Goldstein Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 79dc06881e78..e088843a7734 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -172,9 +172,16 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + int err; + if (IS_ERR(inode)) return ERR_CAST(inode); - if (inode->i_nlink == 0) { + /* We need LINK caps to reliably check i_nlink */ + err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); + if (err) + return ERR_PTR(err); + /* -ESTALE if inode has been unlinked and no file is open */ + if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { iput(inode); return ERR_PTR(-ESTALE); } -- cgit v1.2.3 From ea8412b284c09742d5b11721e225b4ff011aa397 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 20 May 2020 03:51:19 -0400 Subject: ceph: make sure mdsc->mutex is nested in s->s_mutex to fix dead lock send_mds_reconnect takes the s_mutex while the mdsc->mutex is already held. That inverts the locking order documented in mds_client.h. Drop the mdsc->mutex, acquire the s_mutex and then reacquire the mdsc->mutex to prevent a deadlock. URL: https://tracker.ceph.com/issues/45609 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6c283c52d401..0e0ab01694dc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3769,8 +3769,6 @@ fail: * recovering MDS might have. * * This is a relatively heavyweight operation, but it's rare. - * - * called with mdsc->mutex held. */ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) @@ -4024,7 +4022,9 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate != CEPH_MDS_STATE_STARTING) pr_info("mds%d recovery completed\n", s->s_mds); kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); ceph_kick_flushing_caps(mdsc, s); mutex_unlock(&s->s_mutex); wake_up_session_caps(s, RECONNECT); -- cgit v1.2.3 From e64f44a884657358812e6c057957be546db03cbe Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 27 May 2020 09:09:27 -0400 Subject: ceph: skip checking caps when session reconnecting and releasing reqs It makes no sense to check the caps when reconnecting to the mds. As for the async dirop caps, they will be put by their _cb() functions, so there is no point in checking them again when the requests are released. URL: https://tracker.ceph.com/issues/45635 Signed-off-by: Xiubo Li Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 15 +++++++++++++-- fs/ceph/mds_client.c | 16 ++++++++++++++-- fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 41eb999dadf0..972c13aa4225 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3016,7 +3016,8 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, * If we are releasing a WR cap (from a sync write), finalize any affected * cap_snap, and wake up any waiters.
*/ -void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, + bool skip_checking_caps) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; @@ -3072,7 +3073,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); - if (last) + if (last && !skip_checking_caps) ceph_check_caps(ci, 0, NULL); else if (flushsnaps) ceph_flush_snaps(ci, NULL); @@ -3082,6 +3083,16 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) iput(inode); } +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, false); +} + +void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, true); +} + /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0e0ab01694dc..a50497142e59 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -804,7 +804,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); - ceph_mdsc_release_dir_caps(req); + ceph_mdsc_release_dir_caps_no_check(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -3402,6 +3402,18 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) } } +void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +{ + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), + dcaps); + } +} + /* * called under session->mutex. */ @@ -3434,7 +3446,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_session->s_mds != session->s_mds) continue; - ceph_mdsc_release_dir_caps(req); + ceph_mdsc_release_dir_caps_no_check(req); __send_request(mdsc, session, req, true); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 43111e408fa2..5e0c4073a6be 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -507,6 +507,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 226f19c9042f..5a6cdd39bc10 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1095,6 +1095,8 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, + int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void ceph_flush_snaps(struct ceph_inode_info *ci, -- cgit v1.2.3
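A brief usage sketch of the split introduced above, assuming only what the diffs show: ordinary releases still go through ceph_put_cap_refs(), which may end up calling ceph_check_caps(), while the reconnect/replay and request-teardown paths use the no-check variant. The helper below is purely illustrative and not part of the patch.

static void put_dir_caps(struct ceph_inode_info *ci, int caps, bool reconnecting)
{
        if (reconnecting)
                ceph_put_cap_refs_no_check_caps(ci, caps);      /* skip ceph_check_caps() */
        else
                ceph_put_cap_refs(ci, caps);                    /* may check/flush caps */
}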