It's unclear how this ever made any difference because
init_fpstate.xsave.header.xfeatures is always 0, so get_xsave_addr() will
always return a NULL pointer.
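For context, get_xsave_addr() hands out a pointer into an xsave buffer only
when the corresponding feature bit is set in the buffer's header. A minimal
sketch of that behaviour (simplified, not the exact kernel implementation):

	/*
	 * Sketch only: the real get_xsave_addr() performs additional
	 * sanity checks and uses the compacted-format offset helpers.
	 */
	void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
	{
		/* Feature bit clear in the header: no state to hand out */
		if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
			return NULL;

		return __raw_xsave_addr(xsave, xfeature_nr);
	}

Since init_fpstate.xsave.header.xfeatures never had XFEATURE_MASK_PKRU set,
the open-coded stores of init_pkru_value never took effect.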
Fix this by:
- Create a propagation function to replace the several open-coded
instances and invoke it only once after the boot CPU has been initialized
and from the debugfs file write function.
- Set the PKRU feature bit in init_fpstate.xsave.header.xfeatures to make
get_xsave_addr() work, which allows storing the default value for real.
- Having the feature bit set also gets rid of copy_init_pkru_to_fpregs()
because copy_init_fpstate_to_fpregs() now already loads the default PKRU
value.
Fixes: a5eff7259790 ("x86/pkeys: Add PKRU value to init_fpstate")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
---
V2: New patch
---
arch/x86/include/asm/pkeys.h | 3 ++-
arch/x86/kernel/cpu/bugs.c | 3 +++
arch/x86/kernel/cpu/common.c | 5 -----
arch/x86/kernel/fpu/core.c | 3 ---
arch/x86/mm/pkeys.c | 31 ++++++++++++++-----------------
include/linux/pkeys.h | 2 +-
6 files changed, 20 insertions(+), 27 deletions(-)
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -124,7 +124,8 @@ extern int arch_set_user_pkey_access(str
unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
-extern void copy_init_pkru_to_fpregs(void);
+
+extern void pkru_propagate_default(void);
static inline int vma_pkey(struct vm_area_struct *vma)
{
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,6 +16,7 @@
#include <linux/prctl.h>
#include <linux/sched/smt.h>
#include <linux/pgtable.h>
+#include <linux/pkeys.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -120,6 +121,8 @@ void __init check_bugs(void)
arch_smt_update();
+ pkru_propagate_default();
+
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -465,8 +465,6 @@ static bool pku_disabled;
static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
- struct pkru_state *pk;
-
/* check the boot processor, plus compile options for PKU: */
if (!cpu_feature_enabled(X86_FEATURE_PKU))
return;
@@ -477,9 +475,6 @@ static __always_inline void setup_pku(st
return;
cr4_set_bits(X86_CR4_PKE);
- pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
- if (pk)
- pk->pkru = init_pkru_value;
/*
* Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
* cpuid bit to be set. We need to ensure that we
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -348,9 +348,6 @@ static inline void copy_init_fpstate_to_
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
copy_kernel_to_fregs(&init_fpstate.fsave);
-
- if (boot_cpu_has(X86_FEATURE_OSPKE))
- copy_init_pkru_to_fpregs();
}
/*
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -125,20 +125,20 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) |
PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
-/*
- * Called from the FPU code when creating a fresh set of FPU
- * registers. This is called from a very specific context where
- * we know the FPU registers are safe for use and we can use PKRU
- * directly.
- */
-void copy_init_pkru_to_fpregs(void)
+void pkru_propagate_default(void)
{
- u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
+ struct pkru_state *pk;
+
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
/*
- * Override the PKRU state that came from 'init_fpstate'
- * with the baseline from the process.
+ * Force XFEATURE_PKRU to be set in the header otherwise
+ * get_xsave_addr() does not work and it needs to be set
+ * to make XRSTOR(S) load it.
*/
- write_pkru(init_pkru_value_snapshot);
+ init_fpstate.xsave.header.xfeatures |= XFEATURE_MASK_PKRU;
+ pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
+ pk->pkru = READ_ONCE(init_pkru_value);
}
static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
@@ -154,10 +154,9 @@ static ssize_t init_pkru_read_file(struc
static ssize_t init_pkru_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- struct pkru_state *pk;
+ u32 new_init_pkru;
char buf[32];
ssize_t len;
- u32 new_init_pkru;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
@@ -177,10 +176,8 @@ static ssize_t init_pkru_write_file(stru
return -EINVAL;
WRITE_ONCE(init_pkru_value, new_init_pkru);
- pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
- if (!pk)
- return -EINVAL;
- pk->pkru = new_init_pkru;
+
+ pkru_propagate_default();
return count;
}
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -44,7 +44,7 @@ static inline bool arch_pkeys_enabled(vo
return false;
}
-static inline void copy_init_pkru_to_fpregs(void)
+static inline void pkru_propagate_default(void)
{
}
From: Cheng Jian <cj.chengjian(a)huawei.com>
commit 60588bfa223ff675b95f866249f90616613fbe31 upstream.
select_idle_cpu() will scan the LLC domain for idle CPUs; it's always
expensive. So the next commit:
1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
introduces a way to limit how many CPUs we scan.
But it consumes some of the 'nr' attempts on CPUs that are not allowed
for the task and thus wastes our attempts. The function then always
returns nr_cpumask_bits, and we can't find a CPU on which our task is
allowed to run.
The cpumask may be too big; similar to select_idle_core(), use the
per_cpu_ptr 'select_idle_mask' to prevent stack overflow.
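As an illustration of the wasted attempts, this is the pre-patch loop,
taken from the hunk below and trimmed for readability; every CPU in the
LLC span costs one attempt, even when the task may not run on it:

	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
		if (!--nr)
			return -1;	/* attempts exhausted */
		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
			continue;	/* attempt burnt on a disallowed CPU */
		if (available_idle_cpu(cpu))
			break;
	}

After the change, the scan walks only the intersection stored in the
per-CPU 'select_idle_mask' scratch mask, so every decrement of 'nr' tests
a CPU the task is actually allowed to use, and no large cpumask has to
live on the stack.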
Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
Signed-off-by: Cheng Jian <cj.chengjian(a)huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Reviewed-by: Vincent Guittot <vincent.guittot(a)linaro.org>
Reviewed-by: Valentin Schneider <valentin.schneider(a)arm.com>
Link: https://lkml.kernel.org/r/20191213024530.28052-1-cj.chengjian@huawei.com
Signed-off-by: Yang Wei <yang.wei(a)linux.alibaba.com>
Tested-by: Yang Wei <yang.wei(a)linux.alibaba.com>
---
kernel/sched/fair.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 80392cd..f066870 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6154,6 +6154,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
@@ -6184,11 +6185,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
if (available_idle_cpu(cpu))
break;
}
--
1.8.3.1
In the cache missing code path of cached device, if a proper location
from the internal B+ tree is matched for a cache miss range, function
cached_dev_cache_miss() will be called in cache_lookup_fn() in the
following code block,
[code block 1]
526 unsigned int sectors = KEY_INODE(k) == s->iop.inode
527 ? min_t(uint64_t, INT_MAX,
528 KEY_START(k) - bio->bi_iter.bi_sector)
529 : INT_MAX;
530 int ret = s->d->cache_miss(b, s, bio, sectors);
Here s->d->cache_miss() is the callback function pointer initialized to
cached_dev_cache_miss(); the last parameter 'sectors' is an important
hint to calculate the size of the read request to the backing device for
the missing cache data.
The current calculation in the above code block may generate an oversized
value of 'sectors', which consequently may trigger two different potential
kernel panics by BUG() or BUG_ON(), as listed below:
1) BUG_ON() inside bch_btree_insert_key(),
[code block 2]
886 BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
2) BUG() inside biovec_slab(),
[code block 3]
51 default:
52 BUG();
53 return NULL;
All the above panics originate from cached_dev_cache_miss() being called
with the oversized parameter 'sectors'.
Inside cached_dev_cache_miss(), parameter 'sectors' is used to calculate
the size of the data read from the backing device for the cache miss. This
size is stored in s->insert_bio_sectors by the following line of code,
[code block 4]
909 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
Then the actual key to insert into the internal B+ tree is generated and
stored in s->iop.replace_key by the following lines of code,
[code block 5]
911 s->iop.replace_key = KEY(s->iop.inode,
912 bio->bi_iter.bi_sector + s->insert_bio_sectors,
913 s->insert_bio_sectors);
The oversized parameter 'sectors' may trigger panic 1) by BUG_ON() from
the above code block.
And the bio sent to the backing device for the missing data is allocated
with a hint from s->insert_bio_sectors by the following lines of code,
[code block 6]
926 cache_bio = bio_alloc_bioset(GFP_NOWAIT,
927 DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
928 &dc->disk.bio_split);
The oversized parameter 'sectors' may trigger panic 2) by BUG() from the
above code block.
Now let me explain how the panics happen with the oversized 'sectors'.
In code block 5, replace_key is generated by macro KEY(). From the
definition of macro KEY(),
[code block 7]
71 #define KEY(inode, offset, size) \
72 ((struct bkey) { \
73 .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \
74 .low = (offset) \
75 })
Here 'size' is a 16-bit field embedded in the 64-bit member 'high' of
struct bkey. But in code block 1, "KEY_START(k) - bio->bi_iter.bi_sector"
may very well be larger than (1 << 16) - 1, which makes the bkey size
calculation in code block 5 overflow. In one bug report the value of
parameter 'sectors' is 131072 (= 1 << 17); the overflowed 'sectors'
results in an overflowed s->insert_bio_sectors in code block 4, which then
makes the size field of s->iop.replace_key 0 in code block 5. Then the
0-sized s->iop.replace_key is inserted into the internal B+ tree as the
cache-miss check key (a special key to detect and avoid a race between a
normal write request and a cache-miss read request) as,
[code block 8]
915 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
Then the 0-sized s->iop.replace_key, passed as the 3rd parameter, triggers
the bkey size check BUG_ON() in code block 2 and causes kernel panic 1).
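The truncation can be reproduced in a stand-alone program using the KEY()
macro quoted in code block 7; the 16-bit size extraction below is a
simplified stand-in for the kernel's KEY_SIZE() helper:

	#include <stdint.h>
	#include <stdio.h>

	struct bkey { uint64_t high, low; };

	#define KEY(inode, offset, size)				\
	((struct bkey) {						\
		.high = (1ULL << 63) | ((uint64_t)(size) << 20) | (inode), \
		.low = (offset)						\
	})

	/* Simplified: extract the 16-bit size field from 'high' */
	#define KEY_SIZE(k)	(((k).high >> 20) & 0xffff)

	int main(void)
	{
		/* 131072 sectors (= 1 << 17), as seen in one bug report */
		struct bkey k = KEY(0, 0, 131072);

		/* Prints 0: bit 17 of 'size' lands above the 16-bit field */
		printf("KEY_SIZE = %llu\n", (unsigned long long)KEY_SIZE(k));
		return 0;
	}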
The other kernel panic, from code block 6, is caused by an oversized bvecs
number derived from s->insert_bio_sectors in code block 4,
min(sectors, bio_sectors(bio) + reada)
There are two possibilities for an oversized result,
- bio_sectors(bio) is valid, but bio_sectors(bio) + reada is oversized.
- sectors < bio_sectors(bio) + reada, but sectors is oversized.
From a bug report the result of "DIV_ROUND_UP(s->insert_bio_sectors,
PAGE_SECTORS)" from code block 6 can be 344, 282, 946, 342 and many
other values which are larger than BIO_MAX_VECS (a.k.a. 256). When calling
bio_alloc_bioset() with such a larger-than-256 value as the 2nd parameter,
this value will eventually be passed to biovec_slab() as parameter
'nr_vecs' in the following code path,
bio_alloc_bioset() ==> bvec_alloc() ==> biovec_slab()
Because parameter 'nr_vecs' is a larger-than-256 value, the panic by BUG()
in code block 3 is triggered inside biovec_slab().
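As a worked example of the arithmetic (assuming 4 KiB pages, so
PAGE_SECTORS == 8; the sector count below is only an illustration):

	/* e.g. s->insert_bio_sectors == 2752 */
	nr_vecs = DIV_ROUND_UP(2752, 8);  /* = 344, one of the reported values */
	/* 344 > BIO_MAX_VECS (256) -> BUG() in biovec_slab() */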
From the above analysis, we know that the 4th parameter 'sectors' sent
into cached_dev_cache_miss() may cause overflows in code blocks 5 and 6,
and finally cause kernel panics in code blocks 2 and 3. And if the result
of bio_sectors(bio) + reada exceeds the valid bvecs number, it may also
trigger the kernel panic in code block 3 from code block 6.
Now that the almost-useless readahead size for cache-miss requests to the
backing device has been removed, this patch can fix the oversized issue
with a simpler method:
- Add a local variable size_limit and set it to the minimum of the max
bkey size and the max bio bvecs number.
- Set s->insert_bio_sectors to the minimum of size_limit, sectors, and
the sector count of the bio.
- Replace sectors by s->insert_bio_sectors in the bio_next_split() call.
With the above method using size_limit, s->insert_bio_sectors will never
result in an oversized replace_key size or bio bvecs number. And the split
bio 'miss' from bio_next_split() will always match the size of 'cache_bio',
which is the current maximum bio size we can send to the backing device
for fetching the missing cache data.
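For the common 4 KiB page size (PAGE_SECTORS == 8, an assumption for this
illustration), the clamp works out to:

	BIO_MAX_VECS * PAGE_SECTORS = 256 * 8 = 2048 sectors
	(1 << KEY_SIZE_BITS) - 1    = 65535 sectors
	size_limit = min(2048, 65535) = 2048 sectors (1 MiB)

so both the replace_key size field and the cache_bio bvec count stay
within their limits.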
The problematic code can be partially found since Linux v3.13-rc1,
therefore all maintained stable kernels should try to apply this fix.
Reported-by: Alexander Ullrich <ealex1979(a)gmail.com>
Reported-by: Diego Ercolani <diego.ercolani(a)gmail.com>
Reported-by: Jan Szubiak <jan.szubiak(a)linuxpolska.pl>
Reported-by: Marco Rebhan <me(a)dblsaiko.net>
Reported-by: Matthias Ferdinand <bcache(a)mfedv.net>
Reported-by: Victor Westerhuis <victor(a)westerhu.is>
Reported-by: Vojtech Pavlik <vojtech(a)suse.cz>
Reported-and-tested-by: Rolf Fokkens <rolf(a)rolffokkens.nl>
Reported-and-tested-by: Thorsten Knabe <linux(a)thorsten-knabe.de>
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: stable(a)vger.kernel.org
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Kent Overstreet <kent.overstreet(a)gmail.com>
Cc: Nix <nix(a)esperi.org.uk>
Cc: Takashi Iwai <tiwai(a)suse.com>
---
Changelog,
v6, Use BIO_MAX_VECS back and keep 80 chars line limit by request
from Christoph Hellwig.
v5, improvement and fix based on v4 comments from Christoph Hellwig
and Nix.
v4, not directly access BIO_MAX_VECS and reduce reada value to avoid
oversized bvecs number, by hint from Christoph Hellwig.
v3, fix typo in v2.
v2, fix the bypass bio size calculation in v1.
v1, the initial version.
drivers/md/bcache/request.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab8ff18df32a..6d1de889baeb 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -882,6 +882,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
int ret = MAP_CONTINUE;
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
struct bio *miss, *cache_bio;
+ unsigned int size_limit;
s->cache_missed = 1;
@@ -891,7 +892,10 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
goto out_submit;
}
- s->insert_bio_sectors = min(sectors, bio_sectors(bio));
+ /* Limitation for valid replace key size and cache_bio bvecs number */
+ size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS,
+ (1 << KEY_SIZE_BITS) - 1);
+ s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio));
s->iop.replace_key = KEY(s->iop.inode,
bio->bi_iter.bi_sector + s->insert_bio_sectors,
@@ -903,7 +907,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->iop.replace = true;
- miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+ miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO,
+ &s->d->bio_split);
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
--
2.26.2
From: Cheng Jian <cj.chengjian(a)huawei.com>
select_idle_cpu() will scan the LLC domain for idle CPUs; it's always
expensive. So the next commit:
1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
introduces a way to limit how many CPUs we scan.
But it consumes some of the 'nr' attempts on CPUs that are not allowed
for the task and thus wastes our attempts. The function then always
returns nr_cpumask_bits, and we can't find a CPU on which our task is
allowed to run.
The cpumask may be too big; similar to select_idle_core(), use the
per_cpu_ptr 'select_idle_mask' to prevent stack overflow.
Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()")
Signed-off-by: Cheng Jian <cj.chengjian(a)huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Reviewed-by: Vincent Guittot <vincent.guittot(a)linaro.org>
Reviewed-by: Valentin Schneider <valentin.schneider(a)arm.com>
Link: https://lkml.kernel.org/r/20191213024530.28052-1-cj.chengjian@huawei.com
Signed-off-by: Yang Wei <yang.wei(a)linux.alibaba.com>
Tested-by: Yang Wei <yang.wei(a)linux.alibaba.com>
---
kernel/sched/fair.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81096dd..37ac76d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5779,6 +5779,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
@@ -5809,11 +5810,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
if (idle_cpu(cpu))
break;
}
--
1.8.3.1
In the cache missing code path of cached device, if a proper location
from the internal B+ tree is matched for a cache miss range, function
cached_dev_cache_miss() will be called in cache_lookup_fn() in the
following code block,
[code block 1]
526 unsigned int sectors = KEY_INODE(k) == s->iop.inode
527 ? min_t(uint64_t, INT_MAX,
528 KEY_START(k) - bio->bi_iter.bi_sector)
529 : INT_MAX;
530 int ret = s->d->cache_miss(b, s, bio, sectors);
Here s->d->cache_miss() is the callback function pointer initialized to
cached_dev_cache_miss(); the last parameter 'sectors' is an important
hint to calculate the size of the read request to the backing device for
the missing cache data.
The current calculation in the above code block may generate an oversized
value of 'sectors', which consequently may trigger two different potential
kernel panics by BUG() or BUG_ON(), as listed below:
1) BUG_ON() inside bch_btree_insert_key(),
[code block 2]
886 BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
2) BUG() inside biovec_slab(),
[code block 3]
51 default:
52 BUG();
53 return NULL;
All the above panics originate from cached_dev_cache_miss() being called
with the oversized parameter 'sectors'.
Inside cached_dev_cache_miss(), parameter 'sectors' is used to calculate
the size of the data read from the backing device for the cache miss. This
size is stored in s->insert_bio_sectors by the following line of code,
[code block 4]
909 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
Then the actual key to insert into the internal B+ tree is generated and
stored in s->iop.replace_key by the following lines of code,
[code block 5]
911 s->iop.replace_key = KEY(s->iop.inode,
912 bio->bi_iter.bi_sector + s->insert_bio_sectors,
913 s->insert_bio_sectors);
The oversized parameter 'sectors' may trigger panic 1) by BUG_ON() from
the above code block.
And the bio sent to the backing device for the missing data is allocated
with a hint from s->insert_bio_sectors by the following lines of code,
[code block 6]
926 cache_bio = bio_alloc_bioset(GFP_NOWAIT,
927 DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
928 &dc->disk.bio_split);
The oversized parameter 'sectors' may trigger panic 2) by BUG() from the
above code block.
Now let me explain how the panics happen with the oversized 'sectors'.
In code block 5, replace_key is generated by macro KEY(). From the
definition of macro KEY(),
[code block 7]
71 #define KEY(inode, offset, size) \
72 ((struct bkey) { \
73 .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \
74 .low = (offset) \
75 })
Here 'size' is a 16-bit field embedded in the 64-bit member 'high' of
struct bkey. But in code block 1, "KEY_START(k) - bio->bi_iter.bi_sector"
may very well be larger than (1 << 16) - 1, which makes the bkey size
calculation in code block 5 overflow. In one bug report the value of
parameter 'sectors' is 131072 (= 1 << 17); the overflowed 'sectors'
results in an overflowed s->insert_bio_sectors in code block 4, which then
makes the size field of s->iop.replace_key 0 in code block 5. Then the
0-sized s->iop.replace_key is inserted into the internal B+ tree as the
cache-miss check key (a special key to detect and avoid a race between a
normal write request and a cache-miss read request) as,
[code block 8]
915 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
Then the 0-sized s->iop.replace_key, passed as the 3rd parameter, triggers
the bkey size check BUG_ON() in code block 2 and causes kernel panic 1).
The other kernel panic, from code block 6, is caused by an oversized bvecs
number derived from s->insert_bio_sectors in code block 4,
min(sectors, bio_sectors(bio) + reada)
There are two possibilities for an oversized result,
- bio_sectors(bio) is valid, but bio_sectors(bio) + reada is oversized.
- sectors < bio_sectors(bio) + reada, but sectors is oversized.
From a bug report the result of "DIV_ROUND_UP(s->insert_bio_sectors,
PAGE_SECTORS)" from code block 6 can be 344, 282, 946, 342 and many
other values which are larger than BIO_MAX_VECS (a.k.a. 256). When calling
bio_alloc_bioset() with such a larger-than-256 value as the 2nd parameter,
this value will eventually be passed to biovec_slab() as parameter
'nr_vecs' in the following code path,
bio_alloc_bioset() ==> bvec_alloc() ==> biovec_slab()
Because parameter 'nr_vecs' is a larger-than-256 value, the panic by BUG()
in code block 3 is triggered inside biovec_slab().
From the above analysis, we know that the 4th parameter 'sectors' sent
into cached_dev_cache_miss() may cause overflows in code blocks 5 and 6,
and finally cause kernel panics in code blocks 2 and 3. And if the result
of bio_sectors(bio) + reada exceeds the valid bvecs number, it may also
trigger the kernel panic in code block 3 from code block 6.
Now that the almost-useless readahead size for cache-miss requests to the
backing device has been removed, this patch can fix the oversized issue
with a simpler method:
- Add a local variable size_limit and set it to the minimum of the max
bkey size and the max bio bvecs number.
- Set s->insert_bio_sectors to the minimum of size_limit, sectors, and
the sector count of the bio.
- Replace sectors by s->insert_bio_sectors in the bio_next_split() call.
With the above method using size_limit, s->insert_bio_sectors will never
result in an oversized replace_key size or bio bvecs number. And the split
bio 'miss' from bio_next_split() will always match the size of 'cache_bio',
which is the current maximum bio size we can send to the backing device
for fetching the missing cache data.
The problematic code can be partially found since Linux v3.13-rc1,
therefore all maintained stable kernels should try to apply this fix.
Reported-by: Alexander Ullrich <ealex1979(a)gmail.com>
Reported-by: Diego Ercolani <diego.ercolani(a)gmail.com>
Reported-by: Jan Szubiak <jan.szubiak(a)linuxpolska.pl>
Reported-by: Marco Rebhan <me(a)dblsaiko.net>
Reported-by: Matthias Ferdinand <bcache(a)mfedv.net>
Reported-by: Victor Westerhuis <victor(a)westerhu.is>
Reported-by: Vojtech Pavlik <vojtech(a)suse.cz>
Reported-and-tested-by: Rolf Fokkens <rolf(a)rolffokkens.nl>
Reported-and-tested-by: Thorsten Knabe <linux(a)thorsten-knabe.de>
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: stable(a)vger.kernel.org
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Kent Overstreet <kent.overstreet(a)gmail.com>
Cc: Nix <nix(a)esperi.org.uk>
Cc: Takashi Iwai <tiwai(a)suse.com>
---
Changelog,
v5, improvement and fix based on v4 comments from Christoph Hellwig
and Nix.
v4, not directly access BIO_MAX_VECS and reduce reada value to avoid
oversized bvecs number, by hint from Christoph Hellwig.
v3, fix typo in v2.
v2, fix the bypass bio size calculation in v1.
v1, the initial version.
drivers/md/bcache/request.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab8ff18df32a..d855a8882cbc 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -882,6 +882,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
int ret = MAP_CONTINUE;
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
struct bio *miss, *cache_bio;
+ unsigned int size_limit;
s->cache_missed = 1;
@@ -891,7 +892,10 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
goto out_submit;
}
- s->insert_bio_sectors = min(sectors, bio_sectors(bio));
+ /* Limitation for valid replace key size and cache_bio bvecs number */
+ size_limit = min_t(unsigned int, bio_max_segs(UINT_MAX) * PAGE_SECTORS,
+ (1 << KEY_SIZE_BITS) - 1);
+ s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio));
s->iop.replace_key = KEY(s->iop.inode,
bio->bi_iter.bi_sector + s->insert_bio_sectors,
@@ -903,7 +907,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->iop.replace = true;
- miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+ miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO, &s->d->bio_split);
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
--
2.26.2