- Linux-stable-mirror - lists.linaro.org

[PATCH] slub: track number of slabs irrespective of CONFIG_SLUB_DEBUG

by Shakeel Butt

For !CONFIG_SLUB_DEBUG, SLUB does not maintain the number of slabs allocated per node for a kmem_cache. Thus, slabs_node() in __kmem_cache_empty(), __kmem_cache_shrink() and __kmem_cache_destroy() will always return 0 for such config. This is wrong and can cause issues for all users of these functions. Infact in [1] Jason has reported a system crash while using SLUB without CONFIG_SLUB_DEBUG. The reason was the usage of slabs_node() by __kmem_cache_empty(). The right solution is to make slabs_node() work even for !CONFIG_SLUB_DEBUG. The commit 0f389ec63077 ("slub: No need for per node slab counters if !SLUB_DEBUG") had put the per node slab counter under CONFIG_SLUB_DEBUG because it was only read through sysfs API and the sysfs API was disabled on !CONFIG_SLUB_DEBUG. However the users of the per node slab counter assumed that it will work in the absence of CONFIG_SLUB_DEBUG. So, make the counter work for !CONFIG_SLUB_DEBUG. Please note that commit f9e13c0a5a33 ("slab, slub: skip unnecessary kasan_cache_shutdown()") exposed this issue but it is present even before. [1] http://lkml.kernel.org/r/CAHmME9rtoPwxUSnktxzKso14iuVCWT7BE_-_8PAC=pGw1iJnQ… Fixes: f9e13c0a5a33 ("slab, slub: skip unnecessary kasan_cache_shutdown()") Signed-off-by: Shakeel Butt <shakeelb(a)google.com> Suggested-by: David Rientjes <rientjes(a)google.com> Reported-by: Jason A . Donenfeld <Jason(a)zx2c4.com> Cc: Christoph Lameter <cl(a)linux.com> Cc: Pekka Enberg <penberg(a)kernel.org> Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com> Cc: Andrew Morton <akpm(a)linux-foundation.org> Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com> Cc: <stable(a)vger.kernel.org> Cc: <linux-mm(a)kvack.org> Cc: <linux-kernel(a)vger.kernel.org> --- mm/slab.h | 2 +- mm/slub.c | 80 +++++++++++++++++++++++++------------------------------ 2 files changed, 38 insertions(+), 44 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 68bdf498da3b..a6545332cc86 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -473,8 +473,8 @@ struct kmem_cache_node { #ifdef CONFIG_SLUB unsigned long nr_partial; struct list_head partial; -#ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; +#ifdef CONFIG_SLUB_DEBUG atomic_long_t total_objects; struct list_head full; #endif diff --git a/mm/slub.c b/mm/slub.c index a3b8467c14af..c9c190d54687 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1030,42 +1030,6 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct list_del(&page->lru); } -/* Tracking of the number of slabs for debugging purposes */ -static inline unsigned long slabs_node(struct kmem_cache *s, int node) -{ - struct kmem_cache_node *n = get_node(s, node); - - return atomic_long_read(&n->nr_slabs); -} - -static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) -{ - return atomic_long_read(&n->nr_slabs); -} - -static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) -{ - struct kmem_cache_node *n = get_node(s, node); - - /* - * May be called early in order to allocate a slab for the - * kmem_cache_node structure. Solve the chicken-egg - * dilemma by deferring the increment of the count during - * bootstrap (see early_kmem_cache_node_alloc). - */ - if (likely(n)) { - atomic_long_inc(&n->nr_slabs); - atomic_long_add(objects, &n->total_objects); - } -} -static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) -{ - struct kmem_cache_node *n = get_node(s, node); - - atomic_long_dec(&n->nr_slabs); - atomic_long_sub(objects, &n->total_objects); -} - /* Object debug checks for alloc/free paths */ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) @@ -1321,16 +1285,46 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #define disable_higher_order_debug 0 +#endif /* CONFIG_SLUB_DEBUG */ + static inline unsigned long slabs_node(struct kmem_cache *s, int node) - { return 0; } +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) - { return 0; } -static inline void inc_slabs_node(struct kmem_cache *s, int node, - int objects) {} -static inline void dec_slabs_node(struct kmem_cache *s, int node, - int objects) {} +{ + return atomic_long_read(&n->nr_slabs); +} -#endif /* CONFIG_SLUB_DEBUG */ +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). + */ + if (likely(n)) { + atomic_long_inc(&n->nr_slabs); +#ifdef CONFIG_SLUB_DEBUG + atomic_long_add(objects, &n->total_objects); +#endif + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); +#ifdef CONFIG_SLUB_DEBUG + atomic_long_sub(objects, &n->total_objects); +#endif +} /* * Hooks for other subsystems that check memory allocations. In a typical -- 2.18.0.rc1.244.gcf134e6275-goog

7 years

4
7
0 0

[PATCH v3] time: Make sure jiffies_to_msecs() preserves non-zero time periods

by Geert Uytterhoeven

For the common cases where 1000 is a multiple of HZ, or HZ is a multiple of 1000, jiffies_to_msecs() never returns zero when passed a non-zero time period. However, if HZ > 1000 and not an integer multiple of 1000 (e.g. 1024 or 1200, as used on alpha and DECstation), jiffies_to_msecs() may return zero for small non-zero time periods. This may break code that relies on receiving back a non-zero value. jiffies_to_usecs() does not need such a fix: one jiffy can only be less than one µs if HZ > 1000000, and such large values of HZ are already rejected at build time, twice: - include/linux/jiffies.h does #error if HZ >= 12288, - kernel/time/time.c has BUILD_BUG_ON(HZ > USEC_PER_SEC). Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org> Reviewed-by: Arnd Bergmann <arnd(a)arndb.de> Cc: stable(a)vger.kernel.org --- Broken since forever. v3: - Add Reviewed-by, - Cc stable, - Explain better why jiffies_to_usecs() does not need a fix, v2: - Add examples of affected systems, - Use DIV_ROUND_UP(). --- kernel/time/time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/time/time.c b/kernel/time/time.c index 6fa99213fc720e4b..2b41e8e2d31db26f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -28,6 +28,7 @@ */ #include <linux/export.h> +#include <linux/kernel.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/timekeeper_internal.h> @@ -314,9 +315,10 @@ unsigned int jiffies_to_msecs(const unsigned long j) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; + return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> + HZ_TO_MSEC_SHR32; # else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; + return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } -- 2.17.1

7 years

1
0
0 0

[PATCH v3 1/3] mtd: rawnand: denali_dt: add more clocks based on IP datasheet

by Masahiro Yamada

According to the Denali User's Guide, this IP needs three clocks: - clk: controller core clock - clk_x: bus interface clock - ecc_clk: clock at which ECC circuitry is run Currently, denali_dt.c requires a single anonymous clock and its frequency. However, the driver needs to get the frequency of "clk_x" not "clk". This is confusing because people tend to assume the anonymous clock means the core clock. In fact, I got a report of SOCFPGA breakage because the timing parameters are calculated based on a wrong frequency. Instead of the cheesy implementation, the clocks in the real hardware should be represented in the driver and the DT-binding. However, adding new clocks would break the existing platforms. For the backward compatibility, the driver still accepts a single clock just as before. If clk_x is missing, clk_x_rate is set to a hardcoded value. This is fine for existing DT of Socionext UniPhier, and also fixes the issue of Altera (Intel) SOCFPGA because both platforms use 200 MHz for the bus interface clock. Fixes: 1bb88666775e ("mtd: nand: denali: handle timing parameters by setup_data_interface()") Cc: linux-stable <stable(a)vger.kernel.org> #4.14+ Reported-by: Richard Weinberger <richard(a)nod.at> Signed-off-by: Masahiro Yamada <yamada.masahiro(a)socionext.com> --- Changes in v3: - Change the patch order so that the bug-fix one comes the first Changes in v2: - Split patches into sensible chunks .../devicetree/bindings/mtd/denali-nand.txt | 5 +++ drivers/mtd/nand/raw/denali_dt.c | 49 ++++++++++++++++++++-- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/mtd/denali-nand.txt b/Documentation/devicetree/bindings/mtd/denali-nand.txt index 0ee8edb..f33da87 100644 --- a/Documentation/devicetree/bindings/mtd/denali-nand.txt +++ b/Documentation/devicetree/bindings/mtd/denali-nand.txt @@ -8,6 +8,9 @@ Required properties: - reg : should contain registers location and length for data and reg. - reg-names: Should contain the reg names "nand_data" and "denali_reg" - interrupts : The interrupt number. + - clocks: should contain phandle of the controller core clock, the bus + interface clock, and the ECC circuit clock. + - clock-names: should contain "nand", "nand_x", "ecc" Optional properties: - nand-ecc-step-size: see nand.txt for details. If present, the value must be @@ -31,5 +34,7 @@ nand: nand@ff900000 { compatible = "altr,socfpga-denali-nand"; reg = <0xff900000 0x20>, <0xffb80000 0x1000>; reg-names = "nand_data", "denali_reg"; + clocks = <&nand_clk>, <&nand_x_clk>, <&nand_ecc_clk>; + clock-names = "nand", "nand_x", "ecc"; interrupts = <0 144 4>; }; diff --git a/drivers/mtd/nand/raw/denali_dt.c b/drivers/mtd/nand/raw/denali_dt.c index cfd33e6..ce6239d 100644 --- a/drivers/mtd/nand/raw/denali_dt.c +++ b/drivers/mtd/nand/raw/denali_dt.c @@ -27,7 +27,9 @@ struct denali_dt { struct denali_nand_info denali; - struct clk *clk; + struct clk *clk; /* core clock */ + struct clk *clk_x; /* bus interface clock */ + struct clk *clk_ecc; /* ECC circuit clock */ }; struct denali_dt_data { @@ -114,24 +116,61 @@ static int denali_dt_probe(struct platform_device *pdev) if (IS_ERR(denali->host)) return PTR_ERR(denali->host); - dt->clk = devm_clk_get(&pdev->dev, NULL); + /* + * A single anonymous clock is supported for the backward compatibility. + * New platforms should support all the named clocks. + */ + dt->clk = devm_clk_get(&pdev->dev, "nand"); + if (IS_ERR(dt->clk)) + dt->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(dt->clk)) { dev_err(&pdev->dev, "no clk available\n"); return PTR_ERR(dt->clk); } + + dt->clk_x = devm_clk_get(&pdev->dev, "nand_x"); + if (IS_ERR(dt->clk_x)) + dt->clk_x = NULL; + + dt->clk_ecc = devm_clk_get(&pdev->dev, "ecc"); + if (IS_ERR(dt->clk_ecc)) + dt->clk_ecc = NULL; + ret = clk_prepare_enable(dt->clk); if (ret) return ret; - denali->clk_x_rate = clk_get_rate(dt->clk); + ret = clk_prepare_enable(dt->clk_x); + if (ret) + goto out_disable_clk; + + ret = clk_prepare_enable(dt->clk_ecc); + if (ret) + goto out_disable_clk_x; + + if (dt->clk_x) { + denali->clk_x_rate = clk_get_rate(dt->clk_x); + } else { + /* + * Hardcode the clock rates for the backward compatibility. + * This works for both SOCFPGA and UniPhier. + */ + dev_notice(&pdev->dev, + "necessary clock is missing. default clock rates are used.\n"); + denali->clk_x_rate = 200000000; + } ret = denali_init(denali); if (ret) - goto out_disable_clk; + goto out_disable_clk_ecc; platform_set_drvdata(pdev, dt); return 0; +out_disable_clk_ecc: + clk_disable_unprepare(dt->clk_ecc); +out_disable_clk_x: + clk_disable_unprepare(dt->clk_x); out_disable_clk: clk_disable_unprepare(dt->clk); @@ -143,6 +182,8 @@ static int denali_dt_remove(struct platform_device *pdev) struct denali_dt *dt = platform_get_drvdata(pdev); denali_remove(&dt->denali); + clk_disable_unprepare(dt->clk_ecc); + clk_disable_unprepare(dt->clk_x); clk_disable_unprepare(dt->clk); return 0; -- 2.7.4

7 years

5
10
0 0

[tip:ras/core] x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out()

by tip-bot for Borislav Petkov

Commit-ID: 1f74c8a64798e2c488f86efc97e308b85fb7d7aa Gitweb: https://git.kernel.org/tip/1f74c8a64798e2c488f86efc97e308b85fb7d7aa Author: Borislav Petkov <bp(a)suse.de> AuthorDate: Fri, 22 Jun 2018 11:54:28 +0200 Committer: Thomas Gleixner <tglx(a)linutronix.de> CommitDate: Fri, 22 Jun 2018 14:35:50 +0200 x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out() mce_no_way_out() does a quick check during #MC to see whether some of the MCEs logged would require the kernel to panic immediately. And it passes a struct mce where MCi_STATUS gets written. However, after having saved a valid status value, the next iteration of the loop which goes over the MCA banks on the CPU, overwrites the valid status value because we're using struct mce as storage instead of a temporary variable. Which leads to MCE records with an empty status value: mce: [Hardware Error]: CPU 0: Machine Check Exception: 6 Bank 0: 0000000000000000 mce: [Hardware Error]: RIP 10:<ffffffffbd42fbd7> {trigger_mce+0x7/0x10} In order to prevent the loss of the status register value, return immediately when severity is a panic one so that we can panic immediately with the first fatal MCE logged. This is also the intention of this function and not to noodle over the banks while a fatal MCE is already logged. Tony: read the rest of the MCA bank to populate the struct mce fully. Suggested-by: Tony Luck <tony.luck(a)intel.com> Signed-off-by: Borislav Petkov <bp(a)suse.de> Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de> Cc: <stable(a)vger.kernel.org> Link: https://lkml.kernel.org/r/20180622095428.626-8-bp@alien8.de --- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cd76380af79f..7e6f51a9d917 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll); static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { - int i, ret = 0; char *tmp; + int i; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(msr_ops.status(i)); - if (m->status & MCI_STATUS_VAL) { - __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); - } + if (!(m->status & MCI_STATUS_VAL)) + continue; + + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + mce_read_aux(m, i); *msg = tmp; - ret = 1; + return 1; } } - return ret; + return 0; } /*

7 years

1
0
0 0

[PATCH v2 1/4] mtd: cfi_cmdset_0002: Use right chip in do_ppb_xxlock()

by Joakim Tjernlund

do_ppb_xxlock() fails to add chip->start when quering for lock status(and chip_ready test), which caused false status reports. Fix by adding adr += chip->start and adjust call sites accordingly. Fixes: 1648eaaa1575 ("mtd: cfi_cmdset_0002: Support Persistent Protection Bits (PPB) locking") Cc: stable(a)vger.kernel.org Signed-off-by: Joakim Tjernlund <joakim.tjernlund(a)infinera.com> --- v2 - Spilt into several patches drivers/mtd/chips/cfi_cmdset_0002.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index 53a976a8e614..8648b1adccd5 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -2554,8 +2554,9 @@ static int __maybe_unused do_ppb_xxlock(struct map_info *map, unsigned long timeo; int ret; + adr += chip->start; mutex_lock(&chip->mutex); - ret = get_chip(map, chip, adr + chip->start, FL_LOCKING); + ret = get_chip(map, chip, adr, FL_LOCKING); if (ret) { mutex_unlock(&chip->mutex); return ret; @@ -2573,8 +2574,8 @@ static int __maybe_unused do_ppb_xxlock(struct map_info *map, if (thunk == DO_XXLOCK_ONEBLOCK_LOCK) { chip->state = FL_LOCKING; - map_write(map, CMD(0xA0), chip->start + adr); - map_write(map, CMD(0x00), chip->start + adr); + map_write(map, CMD(0xA0), adr); + map_write(map, CMD(0x00), adr); } else if (thunk == DO_XXLOCK_ONEBLOCK_UNLOCK) { /* * Unlocking of one specific sector is not supported, so we @@ -2612,7 +2613,7 @@ static int __maybe_unused do_ppb_xxlock(struct map_info *map, map_write(map, CMD(0x00), chip->start); chip->state = FL_READY; - put_chip(map, chip, adr + chip->start); + put_chip(map, chip, adr); mutex_unlock(&chip->mutex); return ret; -- 2.13.6

7 years

3
14
0 0

[PATCH v2] mtd: rawnand: All AC chips have a broken GET_FEATURES(TIMINGS).

by Boris Brezillon

From: Mason Yang <masonccyang(a)mxic.com.tw> Make sure we flag all broken chips as not supporting this feature. Also move this logic to a new function to keep things readable. Fixes: 34c5c01e0c8c ("mtd: rawnand: macronix: nack the support of changing timings for one chip") Cc: <stable(a)vger.kernel.org> Signed-off-by: Mason Yang <masonccyang(a)mxic.com.tw> Signed-off-by: Boris Brezillon <boris.brezillon(a)bootlin.com> --- Changes in v2: - Add Fixes and Cc-stable tags --- drivers/mtd/nand/raw/nand_macronix.c | 48 +++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/mtd/nand/raw/nand_macronix.c b/drivers/mtd/nand/raw/nand_macronix.c index 7ed1f87e742a..49c546c97c6f 100644 --- a/drivers/mtd/nand/raw/nand_macronix.c +++ b/drivers/mtd/nand/raw/nand_macronix.c @@ -17,23 +17,47 @@ #include <linux/mtd/rawnand.h> +/* + * Macronix AC series does not support using SET/GET_FEATURES to change + * the timings unlike what is declared in the parameter page. Unflag + * this feature to avoid unnecessary downturns. + */ +static void macronix_nand_fix_broken_get_timings(struct nand_chip *chip) +{ + unsigned int i; + static const char * const broken_get_timings[] = { + "MX30LF1G18AC", + "MX30LF1G28AC", + "MX30LF2G18AC", + "MX30LF2G28AC", + "MX30LF4G18AC", + "MX30LF4G28AC", + "MX60LF8G18AC", + }; + + if (!chip->parameters.supports_set_get_features) + return; + + for (i = 0; i < ARRAY_SIZE(broken_get_timings); i++) { + if (!strcmp(broken_get_timings[i], chip->parameters.model)) + break; + } + + if (i == ARRAY_SIZE(broken_get_timings)) + return; + + bitmap_clear(chip->parameters.get_feature_list, + ONFI_FEATURE_ADDR_TIMING_MODE, 1); + bitmap_clear(chip->parameters.set_feature_list, + ONFI_FEATURE_ADDR_TIMING_MODE, 1); +} + static int macronix_nand_init(struct nand_chip *chip) { if (nand_is_slc(chip)) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; - /* - * MX30LF2G18AC chip does not support using SET/GET_FEATURES to change - * the timings unlike what is declared in the parameter page. Unflag - * this feature to avoid unnecessary downturns. - */ - if (chip->parameters.supports_set_get_features && - !strcmp("MX30LF2G18AC", chip->parameters.model)) { - bitmap_clear(chip->parameters.get_feature_list, - ONFI_FEATURE_ADDR_TIMING_MODE, 1); - bitmap_clear(chip->parameters.set_feature_list, - ONFI_FEATURE_ADDR_TIMING_MODE, 1); - } + macronix_nand_fix_broken_get_timings(chip); return 0; } -- 2.14.1

7 years

2
2
0 0

[PATCH] mtd: rawnand: fix return value check for bad block status

by Abhishek Sahu

Positive return value from read_oob() is making false BAD blocks. For some of the NAND controllers, OOB bytes will be protected with ECC and read_oob() will return number of bitflips. If there is any bitflip in ECC protected OOB bytes for BAD block status page, then that block is getting treated as BAD. Fixes: c120e75e0e7d ("mtd: nand: use read_oob() instead of cmdfunc() for bad block check") Cc: <stable(a)vger.kernel.org> Signed-off-by: Abhishek Sahu <absahu(a)codeaurora.org> --- drivers/mtd/nand/raw/nand_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index f28c3a5..4a73f73 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -440,7 +440,7 @@ static int nand_block_bad(struct mtd_info *mtd, loff_t ofs) for (; page < page_end; page++) { res = chip->ecc.read_oob(mtd, chip, page); - if (res) + if (res < 0) return res; bad = chip->oob_poi[chip->badblockpos]; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation

7 years

3
2
0 0

[PATCH] cpuidle: powernv: Fix promotion from snooze if next state disabled

by Gautham R. Shenoy

From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com> [ Upstream commit 0a4ec6aa035a52c422eceb2ed51ed88392a3d6c2 ] The commit 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") introduced a timeout for the snooze idle state so that it could be eventually be promoted to a deeper idle state. The snooze timeout value is static and set to the target residency of the next idle state, which would train the cpuidle governor to pick the next idle state eventually. The unfortunate side-effect of this is that if the next idle state(s) is disabled, the CPU will forever remain in snooze, despite the fact that the system is completely idle, and other deeper idle states are available. This patch fixes the issue by dynamically setting the snooze timeout to the target residency of the next enabled state on the device. Before Patch: POWER8 : Only nap disabled. $ cpupower monitor sleep 30 sleep took 30.01297 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | Nap | Fast 0| 8| 0| 96.41| 0.00| 0.00 0| 8| 1| 96.43| 0.00| 0.00 0| 8| 2| 96.47| 0.00| 0.00 0| 8| 3| 96.35| 0.00| 0.00 0| 8| 4| 96.37| 0.00| 0.00 0| 8| 5| 96.37| 0.00| 0.00 0| 8| 6| 96.47| 0.00| 0.00 0| 8| 7| 96.47| 0.00| 0.00 POWER9: Shallow states (stop0lite, stop1lite, stop2lite, stop0, stop1, stop2) disabled: $ cpupower monitor sleep 30 sleep took 30.05033 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop | stop | stop 0| 16| 0| 89.79| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 1| 90.12| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 2| 90.21| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 3| 90.29| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 After Patch: POWER8 : Only nap disabled. $ cpupower monitor sleep 30 sleep took 30.01200 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | Nap | Fast 0| 8| 0| 16.58| 0.00| 77.21 0| 8| 1| 18.42| 0.00| 75.38 0| 8| 2| 4.70| 0.00| 94.09 0| 8| 3| 17.06| 0.00| 81.73 0| 8| 4| 3.06| 0.00| 95.73 0| 8| 5| 7.00| 0.00| 96.80 0| 8| 6| 1.00| 0.00| 98.79 0| 8| 7| 5.62| 0.00| 94.17 POWER9: Shallow states (stop0lite, stop1lite, stop2lite, stop0, stop1, stop2) disabled: $ cpupower monitor sleep 30 sleep took 30.02110 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop | stop | stop 0| 0| 0| 0.69| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 9.39| 89.70 0| 0| 1| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.05| 93.21 0| 0| 2| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 89.93 0| 0| 3| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 93.26 Fixes: 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") Cc: stable(a)vger.kernel.org # v4.14+ Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com> Reviewed-by: Balbir Singh <bsingharora(a)gmail.com> Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au> --- drivers/cpuidle/cpuidle-powernv.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1a8234e..d29e4f0 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -43,9 +43,31 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; -static u64 snooze_timeout __read_mostly; +static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 get_snooze_timeout(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int i; + + if (unlikely(!snooze_timeout_en)) + return default_snooze_timeout; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return s->target_residency * tb_ticks_per_usec; + } + + return default_snooze_timeout; +} + static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -56,7 +78,7 @@ static int snooze_loop(struct cpuidle_device *dev, local_irq_enable(); - snooze_exit_time = get_tb() + snooze_timeout; + snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { @@ -465,11 +487,9 @@ static int powernv_idle_probe(void) cpuidle_state_table = powernv_states; /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); - if (max_idle_state > 1) { + default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; + if (max_idle_state > 1) snooze_timeout_en = true; - snooze_timeout = powernv_states[1].target_residency * - tb_ticks_per_usec; - } } else return -ENODEV; -- 1.9.4

7 years

1
0
0 0

[PATCH] cpuidle: powernv: Fix promotion from snooze if next state disabled

by Gautham R. Shenoy

From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com> [ Upstream commit 0a4ec6aa035a52c422eceb2ed51ed88392a3d6c2 ] The commit 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") introduced a timeout for the snooze idle state so that it could be eventually be promoted to a deeper idle state. The snooze timeout value is static and set to the target residency of the next idle state, which would train the cpuidle governor to pick the next idle state eventually. The unfortunate side-effect of this is that if the next idle state(s) is disabled, the CPU will forever remain in snooze, despite the fact that the system is completely idle, and other deeper idle states are available. This patch fixes the issue by dynamically setting the snooze timeout to the target residency of the next enabled state on the device. Before Patch: POWER8 : Only nap disabled. $ cpupower monitor sleep 30 sleep took 30.01297 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | Nap | Fast 0| 8| 0| 96.41| 0.00| 0.00 0| 8| 1| 96.43| 0.00| 0.00 0| 8| 2| 96.47| 0.00| 0.00 0| 8| 3| 96.35| 0.00| 0.00 0| 8| 4| 96.37| 0.00| 0.00 0| 8| 5| 96.37| 0.00| 0.00 0| 8| 6| 96.47| 0.00| 0.00 0| 8| 7| 96.47| 0.00| 0.00 POWER9: Shallow states (stop0lite, stop1lite, stop2lite, stop0, stop1, stop2) disabled: $ cpupower monitor sleep 30 sleep took 30.05033 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop | stop | stop 0| 16| 0| 89.79| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 1| 90.12| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 2| 90.21| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 3| 90.29| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 After Patch: POWER8 : Only nap disabled. $ cpupower monitor sleep 30 sleep took 30.01200 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | Nap | Fast 0| 8| 0| 16.58| 0.00| 77.21 0| 8| 1| 18.42| 0.00| 75.38 0| 8| 2| 4.70| 0.00| 94.09 0| 8| 3| 17.06| 0.00| 81.73 0| 8| 4| 3.06| 0.00| 95.73 0| 8| 5| 7.00| 0.00| 96.80 0| 8| 6| 1.00| 0.00| 98.79 0| 8| 7| 5.62| 0.00| 94.17 POWER9: Shallow states (stop0lite, stop1lite, stop2lite, stop0, stop1, stop2) disabled: $ cpupower monitor sleep 30 sleep took 30.02110 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop | stop | stop 0| 0| 0| 0.69| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 9.39| 89.70 0| 0| 1| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.05| 93.21 0| 0| 2| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 89.93 0| 0| 3| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 93.26 Fixes: 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") Cc: stable(a)vger.kernel.org # v4.9 Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com> Reviewed-by: Balbir Singh <bsingharora(a)gmail.com> Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au> --- drivers/cpuidle/cpuidle-powernv.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index cda8f62..069a689 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -42,9 +42,31 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX]; -static u64 snooze_timeout; +static u64 default_snooze_timeout; static bool snooze_timeout_en; +static u64 get_snooze_timeout(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int i; + + if (unlikely(!snooze_timeout_en)) + return default_snooze_timeout; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return s->target_residency * tb_ticks_per_usec; + } + + return default_snooze_timeout; +} + static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -54,7 +76,7 @@ static int snooze_loop(struct cpuidle_device *dev, local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); - snooze_exit_time = get_tb() + snooze_timeout; + snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); ppc64_runlatch_off(); while (!need_resched()) { HMT_low(); @@ -384,11 +406,9 @@ static int powernv_idle_probe(void) cpuidle_state_table = powernv_states; /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); - if (max_idle_state > 1) { + default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; + if (max_idle_state > 1) snooze_timeout_en = true; - snooze_timeout = powernv_states[1].target_residency * - tb_ticks_per_usec; - } } else return -ENODEV; -- 1.9.4

7 years

1
0
0 0

[PATCH] cpuidle: powernv: Fix promotion from snooze if next state disabled

by Gautham R. Shenoy

From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com> [ Upstream commit 0a4ec6aa035a52c422eceb2ed51ed88392a3d6c2 ] The commit 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") introduced a timeout for the snooze idle state so that it could be eventually be promoted to a deeper idle state. The snooze timeout value is static and set to the target residency of the next idle state, which would train the cpuidle governor to pick the next idle state eventually. The unfortunate side-effect of this is that if the next idle state(s) is disabled, the CPU will forever remain in snooze, despite the fact that the system is completely idle, and other deeper idle states are available. This patch fixes the issue by dynamically setting the snooze timeout to the target residency of the next enabled state on the device. Before Patch: POWER8 : Only nap disabled. $ cpupower monitor sleep 30 sleep took 30.01297 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | Nap | Fast 0| 8| 0| 96.41| 0.00| 0.00 0| 8| 1| 96.43| 0.00| 0.00 0| 8| 2| 96.47| 0.00| 0.00 0| 8| 3| 96.35| 0.00| 0.00 0| 8| 4| 96.37| 0.00| 0.00 0| 8| 5| 96.37| 0.00| 0.00 0| 8| 6| 96.47| 0.00| 0.00 0| 8| 7| 96.47| 0.00| 0.00 POWER9: Shallow states (stop0lite, stop1lite, stop2lite, stop0, stop1, stop2) disabled: $ cpupower monitor sleep 30 sleep took 30.05033 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop | stop | stop 0| 16| 0| 89.79| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 1| 90.12| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 2| 90.21| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 16| 3| 90.29| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 After Patch: POWER8 : Only nap disabled. $ cpupower monitor sleep 30 sleep took 30.01200 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | Nap | Fast 0| 8| 0| 16.58| 0.00| 77.21 0| 8| 1| 18.42| 0.00| 75.38 0| 8| 2| 4.70| 0.00| 94.09 0| 8| 3| 17.06| 0.00| 81.73 0| 8| 4| 3.06| 0.00| 95.73 0| 8| 5| 7.00| 0.00| 96.80 0| 8| 6| 1.00| 0.00| 98.79 0| 8| 7| 5.62| 0.00| 94.17 POWER9: Shallow states (stop0lite, stop1lite, stop2lite, stop0, stop1, stop2) disabled: $ cpupower monitor sleep 30 sleep took 30.02110 seconds and exited with status 0 |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop | stop | stop 0| 0| 0| 0.69| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 9.39| 89.70 0| 0| 1| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.05| 93.21 0| 0| 2| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 89.93 0| 0| 3| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 93.26 Fixes: 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") Cc: stable(a)vger.kernel.org # v4.4 Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com> Reviewed-by: Balbir Singh <bsingharora(a)gmail.com> Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au> --- drivers/cpuidle/cpuidle-powernv.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index e12dc30..d075071 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -29,9 +29,31 @@ struct cpuidle_driver powernv_idle_driver = { static int max_idle_state; static struct cpuidle_state *cpuidle_state_table; -static u64 snooze_timeout; +static u64 default_snooze_timeout; static bool snooze_timeout_en; +static u64 get_snooze_timeout(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int i; + + if (unlikely(!snooze_timeout_en)) + return default_snooze_timeout; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return s->target_residency * tb_ticks_per_usec; + } + + return default_snooze_timeout; +} + static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -41,7 +63,7 @@ static int snooze_loop(struct cpuidle_device *dev, local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); - snooze_exit_time = get_tb() + snooze_timeout; + snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); ppc64_runlatch_off(); while (!need_resched()) { HMT_low(); @@ -268,11 +290,9 @@ static int powernv_idle_probe(void) cpuidle_state_table = powernv_states; /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); - if (max_idle_state > 1) { + default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; + if (max_idle_state > 1) snooze_timeout_en = true; - snooze_timeout = powernv_states[1].target_residency * - tb_ticks_per_usec; - } } else return -ENODEV; -- 1.9.4

7 years

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror