If kthread_is_per_cpu runs concurrently with free_kthread_struct the
kthread_struct that was just freed may be read from.
This bug was introduced by commit 40966e316f86 ("kthread: Ensure
struct kthread is present for all kthreads"), when kthread_struct
started to be allocated for all tasks that have PF_KTHREAD set. This
in turn required the kthread_struct to be freed in kernel_execve and
violated the assumption that kthread_struct will have the same
lifetime as the task.
Looking a bit deeper this only applies to callers of kernel_execve
which is just the init process and the user mode helper processes.
These processes really don't want to be kernel threads but are for
historical reasons, mostly because copy_thread does not know how to
take a kernel mode function for processes without PF_KTHREAD or
PF_IO_WORKER set.
Solve this by not allocating kthread_struct for the init process and
the user mode helper processes.
This is done by adding a kthread member to struct kernel_clone_args.
Setting kthread in fork_idle and kernel_thread. Adding
user_mode_thread that works like kernel_thread except it does not set
kthread. In fork only allocating the kthread_struct if .kthread is set.
I have looked at kernel/kthread.c and since commit 40966e316f86
("kthread: Ensure struct kthread is present for all kthreads") there
have been no assumptions added that to_kthread or __to_kthread will
not return NULL.
There are a few callers of to_kthread or __to_kthread that assume a
non-NULL struct kthread pointer will be returned. These functions are
kthread_data(), kthread_parkme(), kthread_exit(), kthread(),
kthread_park(), kthread_unpark(), kthread_stop(). All of those functions
can reasonably be expected to be called only when it is known that a task
is a kthread, so that assumption seems reasonable.
Cc: stable(a)vger.kernel.org
Fixes: 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads")
Reported-by: Максим Кутявин <maximkabox13(a)gmail.com>
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
---
fs/exec.c | 6 ++++--
include/linux/sched/task.h | 2 ++
init/main.c | 2 +-
kernel/fork.c | 22 ++++++++++++++++++++--
kernel/umh.c | 6 +++---
5 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index e3e55d5e0be1..75eb6e0ee7b2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1308,8 +1308,6 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- if (me->flags & PF_KTHREAD)
- free_kthread_struct(me);
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
@@ -1955,6 +1953,10 @@ int kernel_execve(const char *kernel_filename,
int fd = AT_FDCWD;
int retval;
+ if (WARN_ON_ONCE((current->flags & PF_KTHREAD) &&
+ (current->worker_private)))
+ return -EINVAL;
+
filename = getname_kernel(kernel_filename);
if (IS_ERR(filename))
return PTR_ERR(filename);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 719c9a6cac8d..4492266935dd 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -32,6 +32,7 @@ struct kernel_clone_args {
size_t set_tid_size;
int cgroup;
int io_thread;
+ int kthread;
struct cgroup *cgrp;
struct css_set *cset;
};
@@ -89,6 +90,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);
diff --git a/init/main.c b/init/main.c
index 98182c3c2c4b..39baac0211c6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -688,7 +688,7 @@ noinline void __ref rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+ pid = user_mode_thread(kernel_init, NULL, CLONE_FS);
/*
* Pin init on the boot CPU. Task migration is not properly working
* until sched_init_smp() has been run. It will set the allowed
diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..27c5203750b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2157,7 +2157,7 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
- if (p->flags & PF_KTHREAD) {
+ if (args->kthread) {
if (!set_kthread_struct(p))
goto bad_fork_cleanup_delayacct;
}
@@ -2548,7 +2548,8 @@ struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
- .flags = CLONE_VM,
+ .flags = CLONE_VM,
+ .kthread = 1,
};
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@ -2679,6 +2680,23 @@ pid_t kernel_clone(struct kernel_clone_args *args)
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ .kthread = 1,
+ };
+
+ return kernel_clone(&args);
+}
+
+/*
+ * Create a user mode thread.
+ */
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
diff --git a/kernel/umh.c b/kernel/umh.c
index 36c123360ab8..b989736e8707 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -132,7 +132,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
/* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0)
sub_info->retval = pid;
else
@@ -171,8 +171,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
* want to pollute current->children, and we need a parent
* that always ignores SIGCHLD to ensure auto-reaping.
*/
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
+ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
--
2.35.3
This reverts commit a4efc174b382fcdb62e2d90d39e78a274a975e38, which
introduced a regression: when multiple processes allocate DMA memory
in parallel by calling dma_alloc_coherent(), the allocation may
sometimes fail as follows:
Error log:
cma: cma_alloc: linux,cma: alloc failed, req-size: 148 pages, ret: -16
cma: number of available pages:
3@125+20@172+12@236+4@380+32@736+17@2287+23@2473+20@36076+99@40477+108@40852+44@41108+20@41196+108@41364+108@41620+
108@42900+108@43156+483@44061+1763@45341+1440@47712+20@49324+20@49388+5076@49452+2304@55040+35@58141+20@58220+20@58284+
7188@58348+84@66220+7276@66452+227@74525+6371@75549=> 33161 free of 81920 total pages
When the issue happened, we saw there were still 33161 pages (129M) of free
CMA memory and a lot of available free slots for the 148 pages in the CMA
bitmap that we wanted to allocate.
When dumping memory info, we found that there was also ~342M normal memory,
but only 1352K CMA memory left in buddy system while a lot of pageblocks
were isolated.
Memory info log:
Normal free:351096kB min:30000kB low:37500kB high:45000kB reserved_highatomic:0KB
active_anon:98060kB inactive_anon:98948kB active_file:60864kB inactive_file:31776kB
unevictable:0kB writepending:0kB present:1048576kB managed:1018328kB mlocked:0kB
bounce:0kB free_pcp:220kB local_pcp:192kB free_cma:1352kB lowmem_reserve[]: 0 0 0
Normal: 78*4kB (UECI) 1772*8kB (UMECI) 1335*16kB (UMECI) 360*32kB (UMECI) 65*64kB (UMCI)
36*128kB (UMECI) 16*256kB (UMCI) 6*512kB (EI) 8*1024kB (UEI) 4*2048kB (MI) 8*4096kB (EI)
8*8192kB (UI) 3*16384kB (EI) 8*32768kB (M) = 489288kB
The root cause of this issue is that since commit a4efc174b382
("mm/cma.c: remove redundant cma_mutex lock"), CMA supports concurrent
memory allocation. It's possible that the memory range process A is trying
to allocate has already been isolated by the allocation of process B during
memory migration.
The problem here is that the memory range isolated during one allocation
by start_isolate_page_range() could be much bigger than the real size we
want to allocate, because the range is aligned to MAX_ORDER_NR_PAGES.
Taking an ARMv7 platform with 1G memory as an example, when MAX_ORDER_NR_PAGES
is big (e.g. 32M with max_order 14) and CMA memory is relatively small
(e.g. 128M), there are only 4 MAX_ORDER slots, so it is very easy for
all CMA memory to have already been isolated by other processes when
one tries to allocate memory using dma_alloc_coherent().
Since the current CMA code only scans the whole available CMA memory
once, dma_alloc_coherent() may easily fail due to contention with
other processes.
This patch simply falls back to the original method of using cma_mutex
to make alloc_contig_range() run sequentially to avoid the issue.
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Marek Szyprowski <m.szyprowski(a)samsung.com>
Cc: Lecopzer Chen <lecopzer.chen(a)mediatek.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Minchan Kim <minchan(a)kernel.org>
CC: stable(a)vger.kernel.org # 5.11+
Fixes: a4efc174b382 ("mm/cma.c: remove redundant cma_mutex lock")
Signed-off-by: Dong Aisheng <aisheng.dong(a)nxp.com>
---
Patch is based on
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-stable
---
mm/cma.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/cma.c b/mm/cma.c
index eaa4b5c920a2..4a978e09547a 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -37,6 +37,7 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned cma_area_count;
+static DEFINE_MUTEX(cma_mutex);
phys_addr_t cma_get_base(const struct cma *cma)
{
@@ -468,9 +469,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
spin_unlock_irq(&cma->lock);
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
+ mutex_lock(&cma_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-
+ mutex_unlock(&cma_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
break;
--
2.25.1
Good Day!
Your swift responds will be greatly appreciated. I viewed your profile
on Linkedin regarding a proposal that has something in common with
you, reply for more details on my private
email: nikkifenton79(a)gmail.com
Nikki Fenton
nikkifenton79(a)gmail.com
This is the start of the stable review cycle for the 5.4.193 release.
There are 52 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu, 12 May 2022 13:07:16 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.4.193-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.4.193-rc1
Ricky WU <ricky_wu(a)realtek.com>
mmc: rtsx: add 74 Clocks in power on flow
Pali Rohár <pali(a)kernel.org>
PCI: aardvark: Fix reading MSI interrupt number
Pali Rohár <pali(a)kernel.org>
PCI: aardvark: Clear all MSIs at setup
Mike Snitzer <snitzer(a)redhat.com>
dm: interlock pending dm_io and dm_wait_for_bios_completion
Jiazi Li <jqqlijiazi(a)gmail.com>
dm: fix mempool NULL pointer race when completing IO
Eric Dumazet <edumazet(a)google.com>
tcp: make sure treq->af_specific is initialized
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix potential AB/BA lock with buffer_mutex and mmap_lock
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent prealloc proc writes
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent prepare and hw_params/hw_free calls
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent read/write and buffer changes
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent hw_params and hw_free calls
Minchan Kim <minchan(a)kernel.org>
mm: fix unexpected zeroed page mapping with zram swap
Haimin Zhang <tcs.kernel(a)gmail.com>
block-map: add __GFP_ZERO flag for alloc_page in function bio_copy_kern
j.nixdorf(a)avm.de <j.nixdorf(a)avm.de>
net: ipv6: ensure we call ipv6_mc_down() at most once
Wanpeng Li <wanpengli(a)tencent.com>
KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised
Wanpeng Li <wanpengli(a)tencent.com>
x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
Sandipan Das <sandipan.das(a)amd.com>
kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural PMU
Trond Myklebust <trond.myklebust(a)hammerspace.com>
NFSv4: Don't invalidate inode attributes on delegation return
Felix Kuehling <Felix.Kuehling(a)amd.com>
drm/amdkfd: Use drm_priv to pass VM from KFD to amdgpu
Eric Dumazet <edumazet(a)google.com>
net: igmp: respect RCU rules in ip_mc_source() and ip_mc_msfilter()
Filipe Manana <fdmanana(a)suse.com>
btrfs: always log symlinks in full mode
Sergey Shtylyov <s.shtylyov(a)omp.ru>
smsc911x: allow using IRQ0
Somnath Kotur <somnath.kotur(a)broadcom.com>
bnxt_en: Fix possible bnxt_open() failure caused by wrong RFS flag
Ido Schimmel <idosch(a)nvidia.com>
selftests: mirror_gre_bridge_1q: Avoid changing PVID while interface is operational
Shravya Kumbham <shravya.kumbham(a)xilinx.com>
net: emaclite: Add error handling for of_address_to_resource()
Yang Yingliang <yangyingliang(a)huawei.com>
net: stmmac: dwmac-sun8i: add missing of_node_put() in sun8i_dwmac_register_mdio_mux()
Yang Yingliang <yangyingliang(a)huawei.com>
net: ethernet: mediatek: add missing of_node_put() in mtk_sgmii_init()
Cheng Xu <chengyou(a)linux.alibaba.com>
RDMA/siw: Fix a condition race issue in MPA request processing
Codrin Ciubotariu <codrin.ciubotariu(a)microchip.com>
ASoC: dmaengine: Restore NULL prepare_slave_config() callback
Armin Wolf <W_Armin(a)gmx.de>
hwmon: (adt7470) Fix warning on module removal
Duoming Zhou <duoming(a)zju.edu.cn>
NFC: netlink: fix sleep in atomic bug when firmware download timeout
Duoming Zhou <duoming(a)zju.edu.cn>
nfc: nfcmrvl: main: reorder destructive operations in nfcmrvl_nci_unregister_dev to avoid bugs
Duoming Zhou <duoming(a)zju.edu.cn>
nfc: replace improper check device_is_registered() in netlink related functions
Daniel Hellstrom <daniel(a)gaisler.com>
can: grcan: use ofdev->dev when allocating DMA memory
Duoming Zhou <duoming(a)zju.edu.cn>
can: grcan: grcan_close(): fix deadlock
Jan Höppner <hoeppner(a)linux.ibm.com>
s390/dasd: Fix read inconsistency for ESE DASD devices
Jan Höppner <hoeppner(a)linux.ibm.com>
s390/dasd: Fix read for ESE with blksize < 4k
Stefan Haberland <sth(a)linux.ibm.com>
s390/dasd: prevent double format of tracks for ESE devices
Stefan Haberland <sth(a)linux.ibm.com>
s390/dasd: fix data corruption for ESE devices
Mark Brown <broonie(a)kernel.org>
ASoC: meson: Fix event generation for G12A tohdmi mux
Mark Brown <broonie(a)kernel.org>
ASoC: wm8958: Fix change notifications for DSP controls
Mark Brown <broonie(a)kernel.org>
ASoC: da7219: Fix change notifications for tone generator frequency
Thomas Pfaff <tpfaff(a)pcs.com>
genirq: Synchronize interrupt thread startup
Vegard Nossum <vegard.nossum(a)oracle.com>
ACPICA: Always create namespace nodes using acpi_ns_create_node()
Niels Dossche <dossche.niels(a)gmail.com>
firewire: core: extend card->lock in fw_core_handle_bus_reset
Jakob Koschel <jakobkoschel(a)gmail.com>
firewire: remove check of list iterator against head past the loop body
Chengfeng Ye <cyeaa(a)connect.ust.hk>
firewire: fix potential uaf in outbound_phy_packet_callback()
Trond Myklebust <trond.myklebust(a)hammerspace.com>
Revert "SUNRPC: attempt AF_LOCAL connect on setup"
Andrei Lalaev <andrei.lalaev(a)emlid.com>
gpiolib: of: fix bounds check for 'gpio-reserved-ranges'
Takashi Sakamoto <o-takashi(a)sakamocchi.jp>
ALSA: fireworks: fix wrong return count shorter than expected by 4 bytes
Helge Deller <deller(a)gmx.de>
parisc: Merge model and model name into one line in /proc/cpuinfo
Maciej W. Rozycki <macro(a)orcam.me.uk>
MIPS: Fix CP0 counter erratum detection for R4k CPUs
-------------
Diffstat:
Makefile | 4 +-
arch/mips/include/asm/timex.h | 8 +-
arch/mips/kernel/time.c | 11 +--
arch/parisc/kernel/processor.c | 3 +-
arch/x86/kernel/kvm.c | 13 +++
arch/x86/kvm/cpuid.c | 5 +
arch/x86/kvm/lapic.c | 3 +-
block/bio.c | 2 +-
drivers/acpi/acpica/nsaccess.c | 3 +-
drivers/firewire/core-card.c | 3 +
drivers/firewire/core-cdev.c | 4 +-
drivers/firewire/core-topology.c | 9 +-
drivers/firewire/core-transaction.c | 30 +++---
drivers/firewire/sbp2.c | 13 +--
drivers/gpio/gpiolib-of.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 +-
drivers/hwmon/adt7470.c | 4 +-
drivers/infiniband/sw/siw/siw_cm.c | 7 +-
drivers/md/dm.c | 25 +++--
drivers/mmc/host/rtsx_pci_sdmmc.c | 31 ++++--
drivers/net/can/grcan.c | 8 +-
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 +-
drivers/net/ethernet/mediatek/mtk_sgmii.c | 1 +
drivers/net/ethernet/smsc/smsc911x.c | 2 +-
drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 1 +
drivers/net/ethernet/xilinx/xilinx_emaclite.c | 15 ++-
drivers/nfc/nfcmrvl/main.c | 2 +-
drivers/pci/controller/pci-aardvark.c | 16 ++-
drivers/s390/block/dasd.c | 18 +++-
drivers/s390/block/dasd_eckd.c | 28 ++++--
drivers/s390/block/dasd_int.h | 14 +++
fs/btrfs/tree-log.c | 14 ++-
fs/nfs/nfs4proc.c | 12 ++-
include/net/tcp.h | 5 +
include/sound/pcm.h | 2 +
kernel/irq/internals.h | 2 +
kernel/irq/irqdesc.c | 2 +
kernel/irq/manage.c | 39 ++++++--
mm/page_io.c | 54 ----------
net/ipv4/igmp.c | 9 +-
net/ipv4/syncookies.c | 1 +
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv6/addrconf.c | 8 +-
net/ipv6/syncookies.c | 1 +
net/ipv6/tcp_ipv6.c | 2 +-
net/nfc/core.c | 29 +++---
net/nfc/netlink.c | 4 +-
net/sunrpc/xprtsock.c | 3 -
sound/core/pcm.c | 3 +
sound/core/pcm_lib.c | 5 +
sound/core/pcm_memory.c | 11 ++-
sound/core/pcm_native.c | 110 +++++++++++++++------
sound/firewire/fireworks/fireworks_hwdep.c | 1 +
sound/soc/codecs/da7219.c | 14 ++-
sound/soc/codecs/wm8958-dsp2.c | 8 +-
sound/soc/meson/g12a-tohdmitx.c | 2 +-
sound/soc/soc-generic-dmaengine-pcm.c | 6 +-
.../net/forwarding/mirror_gre_bridge_1q.sh | 3 +
58 files changed, 409 insertions(+), 247 deletions(-)
Our Ref: BG/WA0151/2022
Dear Beneficiary
Subject: An Estate of US$15.8 Million
Blount and Griffin Genealogical Investigators specializes in probate
research to locate missing heirs and beneficiaries to estates in the
United Kingdom and Europe.
We can also help you find wills, obtain copies of certificates, help
you to administer an estate, as well as calculating how an estate,
intestacy or trust should be distributed.
You may be entitled to a large pay out for an inheritance in Europe
worth US$15.8 million. We have discovered an estate belonging to the
late Depositor has remained unclaimed since he died in 2011 and we
have strong reasons to believe you are the closest living relative to
the deceased we can find.
You may unknowingly be the heir of this person who died without
leaving a will (intestate). We will conduct a probate research to
prove your entitlement, and can submit a claim on your behalf all at
no risk to yourselves.
Our service fee of 10% will be paid to us after you have received the estate.
The estate transfer process should take just a matter of days as we
have the mechanism and expertise to get this done very quickly. This
message may come to you as a shock, however we hope to work with you
to transfer the estate to you as quickly as possible.
Feel free to email our senior case worker Mr. Malcolm Casey on email:
malcolmcasey68(a)yahoo.com for further discussions.
With warm regards,
Mr. Blount W. Gort, CEO.
Blount and Griffin Associates Inc
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: 792ea6a074ae7ea5ab6f1b8b31f76bb0297de66c
Gitweb: https://git.kernel.org/tip/792ea6a074ae7ea5ab6f1b8b31f76bb0297de66c
Author: Lukas Wunner <lukas(a)wunner.de>
AuthorDate: Tue, 10 May 2022 09:56:05 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Wed, 11 May 2022 02:22:52 +02:00
genirq: Remove WARN_ON_ONCE() in generic_handle_domain_irq()
Since commit 0953fb263714 ("irq: remove handle_domain_{irq,nmi}()"),
generic_handle_domain_irq() warns if called outside hardirq context, even
though the function calls down to handle_irq_desc(), which warns about the
same, but conditionally on handle_enforce_irqctx().
The newly added warning is a false positive if the interrupt originates
from any other irqchip than x86 APIC or ARM GIC/GICv3. Those are the only
ones for which handle_enforce_irqctx() returns true. Per commit
c16816acd086 ("genirq: Add protection against unsafe usage of
generic_handle_irq()"):
"In general calling generic_handle_irq() with interrupts disabled from non
interrupt context is harmless. For some interrupt controllers like the
x86 trainwrecks this is outright dangerous as it might corrupt state if
an interrupt affinity change is pending."
Examples for interrupt chips where the warning is a false positive are
USB-attached GPIO controllers such as drivers/gpio/gpio-dln2.c:
USB gadgets are incapable of directly signaling an interrupt because they
cannot initiate a bus transaction by themselves. All communication on
the bus is initiated by the host controller, which polls a gadget's
Interrupt Endpoint in regular intervals. If an interrupt is pending,
that information is passed up the stack in softirq context, from which a
hardirq is synthesized via generic_handle_domain_irq().
Remove the warning to eliminate such false positives.
Fixes: 0953fb263714 ("irq: remove handle_domain_{irq,nmi}()")
Signed-off-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
CC: Linus Walleij <linus.walleij(a)linaro.org>
Cc: Bartosz Golaszewski <brgl(a)bgdev.pl>
Cc: Octavian Purdila <octavian.purdila(a)nxp.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220505113207.487861b2@kernel.org
Link: https://lore.kernel.org/r/20220506203242.GA1855@wunner.de
Link: https://lore.kernel.org/r/c3caf60bfa78e5fdbdf483096b7174da65d1813a.16521688…
---
kernel/irq/irqdesc.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0099b87..d323b18 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -701,7 +701,6 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- WARN_ON_ONCE(!in_hardirq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
My name is Paul Isreal a broker from Republic of Togo, My client is
interested to invest in Transportation business, Tourism, Real Estate
business, Agriculture, Health sector or any other business you have
or may propose in your country.
Please reply for more information only if you are interested.
Mr Paul Isreal
With the new OS handshake introduced by commit c7ff29763989
("thermal: int340x: Update OS policy capability handshake"),
thermal zone mode "enabled" doesn't work in the same way as with the legacy
handshake. The mode "enabled" fails with -EINVAL when using the new handshake.
To address this issue, when the new OS UUID mask is set:
- When mode is "enabled", return 0 as the firmware already has the
latest policy mask.
- When mode is "disabled", update the firmware with a UUID mask of zero.
In this way the firmware can take over thermal control. Also
reset the OS UUID mask. This allows user space to update with a new
set of policies.
Fixes: c7ff29763989 ("thermal: int340x: Update OS policy capability handshake")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
update:
Added Fixes tag
.../intel/int340x_thermal/int3400_thermal.c | 48 ++++++++++++-------
1 file changed, 32 insertions(+), 16 deletions(-)
diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
index d97f496bab9b..1061728ad5a9 100644
--- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
+++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
@@ -194,12 +194,31 @@ static int int3400_thermal_run_osc(acpi_handle handle, char *uuid_str, int *enab
return result;
}
+static int set_os_uuid_mask(struct int3400_thermal_priv *priv, u32 mask)
+{
+ int cap = 0;
+
+ /*
+ * Capability bits:
+ * Bit 0: set to 1 to indicate DPTF is active
+ * Bi1 1: set to 1 to active cooling is supported by user space daemon
+ * Bit 2: set to 1 to passive cooling is supported by user space daemon
+ * Bit 3: set to 1 to critical trip is handled by user space daemon
+ */
+ if (mask)
+ cap = ((priv->os_uuid_mask << 1) | 0x01);
+
+ return int3400_thermal_run_osc(priv->adev->handle,
+ "b23ba85d-c8b7-3542-88de-8de2ffcfd698",
+ &cap);
+}
+
static ssize_t current_uuid_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct int3400_thermal_priv *priv = dev_get_drvdata(dev);
- int i;
+ int ret, i;
for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; ++i) {
if (!strncmp(buf, int3400_thermal_uuids[i],
@@ -231,19 +250,7 @@ static ssize_t current_uuid_store(struct device *dev,
}
if (priv->os_uuid_mask) {
- int cap, ret;
-
- /*
- * Capability bits:
- * Bit 0: set to 1 to indicate DPTF is active
- * Bi1 1: set to 1 to active cooling is supported by user space daemon
- * Bit 2: set to 1 to passive cooling is supported by user space daemon
- * Bit 3: set to 1 to critical trip is handled by user space daemon
- */
- cap = ((priv->os_uuid_mask << 1) | 0x01);
- ret = int3400_thermal_run_osc(priv->adev->handle,
- "b23ba85d-c8b7-3542-88de-8de2ffcfd698",
- &cap);
+ ret = set_os_uuid_mask(priv, priv->os_uuid_mask);
if (ret)
return ret;
}
@@ -469,17 +476,26 @@ static int int3400_thermal_change_mode(struct thermal_zone_device *thermal,
if (mode != thermal->mode) {
int enabled;
+ enabled = (mode == THERMAL_DEVICE_ENABLED);
+
+ if (priv->os_uuid_mask) {
+ if (!enabled) {
+ priv->os_uuid_mask = 0;
+ result = set_os_uuid_mask(priv, priv->os_uuid_mask);
+ }
+ goto eval_odvp;
+ }
+
if (priv->current_uuid_index < 0 ||
priv->current_uuid_index >= INT3400_THERMAL_MAXIMUM_UUID)
return -EINVAL;
- enabled = (mode == THERMAL_DEVICE_ENABLED);
result = int3400_thermal_run_osc(priv->adev->handle,
int3400_thermal_uuids[priv->current_uuid_index],
&enabled);
}
-
+eval_odvp:
evaluate_odvp(priv);
return result;
--
2.31.1
From: Mike Rapoport <rppt(a)linux.ibm.com>
commit 5e545df3292fbd3d5963c68980f1527ead2a2b3f upstream.
ARM is the only architecture that defines CONFIG_ARCH_HAS_HOLES_MEMORYMODEL,
which in turn enables the memmap_valid_within() function that is intended to
verify the existence of a struct page associated with a pfn when there are
holes in the memory map.
However, the ARCH_HAS_HOLES_MEMORYMODEL also enables HAVE_ARCH_PFN_VALID
and arch-specific pfn_valid() implementation that also deals with the holes
in the memory map.
The only two users of memmap_valid_within() call this function after
a call to pfn_valid() so the memmap_valid_within() check becomes redundant.
Remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL and memmap_valid_within() and rely
entirely on ARM's implementation of pfn_valid() that is now enabled
unconditionally.
Link: https://lkml.kernel.org/r/20201101170454.9567-9-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt(a)linux.ibm.com>
Cc: Alexey Dobriyan <adobriyan(a)gmail.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Geert Uytterhoeven <geert(a)linux-m68k.org>
Cc: Greg Ungerer <gerg(a)linux-m68k.org>
Cc: John Paul Adrian Glaubitz <glaubitz(a)physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Matt Turner <mattst88(a)gmail.com>
Cc: Meelis Roos <mroos(a)linux.ee>
Cc: Michael Schmitz <schmitzmic(a)gmail.com>
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: Vineet Gupta <vgupta(a)synopsys.com>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Reported-by: kernel test robot <lkp(a)intel.com>
Fixes: 8dd559d53b3b ("arm: ioremap: don't abuse pfn_valid() to check if pfn is in RAM")
Signed-off-by: Mike Rapoport <rppt(a)linux.ibm.com>
---
Documentation/vm/memory-model.rst | 3 +--
arch/arm/Kconfig | 8 ++------
arch/arm/mach-bcm/Kconfig | 1 -
arch/arm/mach-davinci/Kconfig | 1 -
arch/arm/mach-exynos/Kconfig | 1 -
arch/arm/mach-highbank/Kconfig | 1 -
arch/arm/mach-omap2/Kconfig | 2 +-
arch/arm/mach-s5pv210/Kconfig | 1 -
arch/arm/mach-tango/Kconfig | 1 -
fs/proc/kcore.c | 2 --
include/linux/mmzone.h | 31 -------------------------------
mm/mmzone.c | 14 --------------
mm/vmstat.c | 4 ----
13 files changed, 4 insertions(+), 66 deletions(-)
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
index 58a12376b7df..94db75ba7fbe 100644
--- a/Documentation/vm/memory-model.rst
+++ b/Documentation/vm/memory-model.rst
@@ -52,8 +52,7 @@ wrapper :c:func:`free_area_init`. Yet, the mappings array is not
usable until the call to :c:func:`memblock_free_all` that hands all
the memory to the page allocator.
-If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option,
-it may free parts of the `mem_map` array that do not cover the
+An architecture may free parts of the `mem_map` array that do not cover the
actual physical pages. In such case, the architecture specific
:c:func:`pfn_valid` implementation should take the holes in the
`mem_map` into account.
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4b36bbcf5a5b..a1622b9290fd 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -26,7 +26,7 @@ config ARM
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC
+ select ARCH_KEEP_MEMBLOCK
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
@@ -521,7 +521,6 @@ config ARCH_S3C24XX
config ARCH_OMAP1
bool "TI OMAP1"
depends on MMU
- select ARCH_HAS_HOLES_MEMORYMODEL
select ARCH_OMAP
select CLKDEV_LOOKUP
select CLKSRC_MMIO
@@ -1518,9 +1517,6 @@ config OABI_COMPAT
UNPREDICTABLE (in fact it can be predicted that it won't work
at all). If in doubt say N.
-config ARCH_HAS_HOLES_MEMORYMODEL
- bool
-
config ARCH_SPARSEMEM_ENABLE
bool
@@ -1528,7 +1524,7 @@ config ARCH_SPARSEMEM_DEFAULT
def_bool ARCH_SPARSEMEM_ENABLE
config HAVE_ARCH_PFN_VALID
- def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
+ def_bool y
config HIGHMEM
bool "High Memory Support"
diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig
index 5e5f1fabc3d4..634d1bc3c011 100644
--- a/arch/arm/mach-bcm/Kconfig
+++ b/arch/arm/mach-bcm/Kconfig
@@ -214,7 +214,6 @@ config ARCH_BRCMSTB
select HAVE_ARM_ARCH_TIMER
select BRCMSTB_L2_IRQ
select BCM7120_L2_IRQ
- select ARCH_HAS_HOLES_MEMORYMODEL
select ZONE_DMA if ARM_LPAE
select SOC_BRCMSTB
select SOC_BUS
diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig
index 02b180ad7245..4d3b7d0418c4 100644
--- a/arch/arm/mach-davinci/Kconfig
+++ b/arch/arm/mach-davinci/Kconfig
@@ -5,7 +5,6 @@ menuconfig ARCH_DAVINCI
depends on ARCH_MULTI_V5
select DAVINCI_TIMER
select ZONE_DMA
- select ARCH_HAS_HOLES_MEMORYMODEL
select PM_GENERIC_DOMAINS if PM
select PM_GENERIC_DOMAINS_OF if PM && OF
select REGMAP_MMIO
diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig
index 9dab1f50a02f..fc01137628e4 100644
--- a/arch/arm/mach-exynos/Kconfig
+++ b/arch/arm/mach-exynos/Kconfig
@@ -8,7 +8,6 @@
menuconfig ARCH_EXYNOS
bool "Samsung EXYNOS"
depends on ARCH_MULTI_V7
- select ARCH_HAS_HOLES_MEMORYMODEL
select ARCH_SUPPORTS_BIG_ENDIAN
select ARM_AMBA
select ARM_GIC
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index 1bc68913d62c..9de38ce8124f 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -2,7 +2,6 @@
config ARCH_HIGHBANK
bool "Calxeda ECX-1000/2000 (Highbank/Midway)"
depends on ARCH_MULTI_V7
- select ARCH_HAS_HOLES_MEMORYMODEL
select ARCH_SUPPORTS_BIG_ENDIAN
select ARM_AMBA
select ARM_ERRATA_764369 if SMP
diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig
index fdb6743760a2..0211f4aa8cc7 100644
--- a/arch/arm/mach-omap2/Kconfig
+++ b/arch/arm/mach-omap2/Kconfig
@@ -94,7 +94,7 @@ config SOC_DRA7XX
config ARCH_OMAP2PLUS
bool
select ARCH_HAS_BANDGAP
- select ARCH_HAS_HOLES_MEMORYMODEL
+ select ARCH_HAS_RESET_CONTROLLER
select ARCH_OMAP
select CLKSRC_MMIO
select GENERIC_IRQ_CHIP
diff --git a/arch/arm/mach-s5pv210/Kconfig b/arch/arm/mach-s5pv210/Kconfig
index 03984a791879..69ff1bb89f38 100644
--- a/arch/arm/mach-s5pv210/Kconfig
+++ b/arch/arm/mach-s5pv210/Kconfig
@@ -8,7 +8,6 @@
config ARCH_S5PV210
bool "Samsung S5PV210/S5PC110"
depends on ARCH_MULTI_V7
- select ARCH_HAS_HOLES_MEMORYMODEL
select ARM_VIC
select CLKSRC_SAMSUNG_PWM
select COMMON_CLK_SAMSUNG
diff --git a/arch/arm/mach-tango/Kconfig b/arch/arm/mach-tango/Kconfig
index 25b2fd434861..a9eeda36aeb1 100644
--- a/arch/arm/mach-tango/Kconfig
+++ b/arch/arm/mach-tango/Kconfig
@@ -3,7 +3,6 @@ config ARCH_TANGO
bool "Sigma Designs Tango4 (SMP87xx)"
depends on ARCH_MULTI_V7
# Cortex-A9 MPCore r3p0, PL310 r3p2
- select ARCH_HAS_HOLES_MEMORYMODEL
select ARM_ERRATA_754322
select ARM_ERRATA_764369 if SMP
select ARM_ERRATA_775420
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e2ed8e08cc7a..d1cabccc02b7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -193,8 +193,6 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
return 1;
p = pfn_to_page(pfn);
- if (!memmap_valid_within(pfn, p, page_zone(p)))
- return 1;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e4f9df955040..b138ddad619e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1438,37 +1438,6 @@ void memory_present(int nid, unsigned long start, unsigned long end);
#define pfn_valid_within(pfn) (1)
#endif
-#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-/*
- * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
- * associated with it or not. This means that a struct page exists for this
- * pfn. The caller cannot assume the page is fully initialized in general.
- * Hotplugable pages might not have been onlined yet. pfn_to_online_page()
- * will ensure the struct page is fully online and initialized. Special pages
- * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly.
- *
- * In FLATMEM, it is expected that holes always have valid memmap as long as
- * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed
- * that a valid section has a memmap for the entire section.
- *
- * However, an ARM, and maybe other embedded architectures in the future
- * free memmap backing holes to save memory on the assumption the memmap is
- * never used. The page_zone linkages are then broken even though pfn_valid()
- * returns true. A walker of the full memmap must then do this additional
- * check to ensure the memmap they are looking at is sane by making sure
- * the zone and PFN linkages are still valid. This is expensive, but walkers
- * of the full memmap are extremely rare.
- */
-bool memmap_valid_within(unsigned long pfn,
- struct page *page, struct zone *zone);
-#else
-static inline bool memmap_valid_within(unsigned long pfn,
- struct page *page, struct zone *zone)
-{
- return true;
-}
-#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..f337831affc2 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,20 +72,6 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
return z;
}
-#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-bool memmap_valid_within(unsigned long pfn,
- struct page *page, struct zone *zone)
-{
- if (page_to_pfn(page) != pfn)
- return false;
-
- if (page_zone(page) != zone)
- return false;
-
- return true;
-}
-#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a8222041bd44..240fe2153ca9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1444,10 +1444,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!page)
continue;
- /* Watch for unexpected holes punched in the memmap */
- if (!memmap_valid_within(pfn, page, zone))
- continue;
-
if (page_zone(page) != zone)
continue;
--
2.34.1
--
My Name Is Rev Sister Grace, Contact me for a Charity donation of $2.2
MILLION Dollars to you for charity work. Send Your Full Name & Your
Phone No At: biiioqq(a)gmail.com
With the new OS handshake introduced with commit c7ff29763989
("thermal: int340x: Update OS policy capability handshake"),
thermal zone mode "enabled" doesn't work in the same way as the legacy
handshake. The mode "enabled" fails with -EINVAL using new handshake.
To address this issue, when the new OS UUID mask is set:
- When mode is "enabled", return 0 as the firmware already has the
latest policy mask.
- When mode is "disabled", update the firmware with UUID mask of zero.
In this way firmware can take control of the thermal control. Also
reset the OS UUID mask. This allows user space to update with new
set of policies.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
.../intel/int340x_thermal/int3400_thermal.c | 48 ++++++++++++-------
1 file changed, 32 insertions(+), 16 deletions(-)
diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
index d97f496bab9b..1061728ad5a9 100644
--- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
+++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
@@ -194,12 +194,31 @@ static int int3400_thermal_run_osc(acpi_handle handle, char *uuid_str, int *enab
return result;
}
+static int set_os_uuid_mask(struct int3400_thermal_priv *priv, u32 mask)
+{
+ int cap = 0;
+
+ /*
+ * Capability bits:
+ * Bit 0: set to 1 to indicate DPTF is active
+ * Bi1 1: set to 1 to active cooling is supported by user space daemon
+ * Bit 2: set to 1 to passive cooling is supported by user space daemon
+ * Bit 3: set to 1 to critical trip is handled by user space daemon
+ */
+ if (mask)
+ cap = ((priv->os_uuid_mask << 1) | 0x01);
+
+ return int3400_thermal_run_osc(priv->adev->handle,
+ "b23ba85d-c8b7-3542-88de-8de2ffcfd698",
+ &cap);
+}
+
static ssize_t current_uuid_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct int3400_thermal_priv *priv = dev_get_drvdata(dev);
- int i;
+ int ret, i;
for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; ++i) {
if (!strncmp(buf, int3400_thermal_uuids[i],
@@ -231,19 +250,7 @@ static ssize_t current_uuid_store(struct device *dev,
}
if (priv->os_uuid_mask) {
- int cap, ret;
-
- /*
- * Capability bits:
- * Bit 0: set to 1 to indicate DPTF is active
- * Bi1 1: set to 1 to active cooling is supported by user space daemon
- * Bit 2: set to 1 to passive cooling is supported by user space daemon
- * Bit 3: set to 1 to critical trip is handled by user space daemon
- */
- cap = ((priv->os_uuid_mask << 1) | 0x01);
- ret = int3400_thermal_run_osc(priv->adev->handle,
- "b23ba85d-c8b7-3542-88de-8de2ffcfd698",
- &cap);
+ ret = set_os_uuid_mask(priv, priv->os_uuid_mask);
if (ret)
return ret;
}
@@ -469,17 +476,26 @@ static int int3400_thermal_change_mode(struct thermal_zone_device *thermal,
if (mode != thermal->mode) {
int enabled;
+ enabled = (mode == THERMAL_DEVICE_ENABLED);
+
+ if (priv->os_uuid_mask) {
+ if (!enabled) {
+ priv->os_uuid_mask = 0;
+ result = set_os_uuid_mask(priv, priv->os_uuid_mask);
+ }
+ goto eval_odvp;
+ }
+
if (priv->current_uuid_index < 0 ||
priv->current_uuid_index >= INT3400_THERMAL_MAXIMUM_UUID)
return -EINVAL;
- enabled = (mode == THERMAL_DEVICE_ENABLED);
result = int3400_thermal_run_osc(priv->adev->handle,
int3400_thermal_uuids[priv->current_uuid_index],
&enabled);
}
-
+eval_odvp:
evaluate_odvp(priv);
return result;
--
2.31.1
From: Sven Schnelle <svens(a)linux.ibm.com>
clock_delta is declared as unsigned long in various places. However,
the clock sync delta can be negative. This would add a huge positive
offset in clock_sync_global where clock_delta is added to clk.eitod
which is a 72 bit integer. Declare it as signed long to fix this.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sven Schnelle <svens(a)linux.ibm.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
---
arch/s390/include/asm/cio.h | 2 +-
arch/s390/kernel/time.c | 8 ++++----
drivers/s390/cio/chsc.c | 4 ++--
3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index 1effac6a0152..1c4f585dd39b 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -369,7 +369,7 @@ void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev);
struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages);
/* Function from drivers/s390/cio/chsc.c */
-int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
+int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta);
int chsc_sstpi(void *page, void *result, size_t size);
int chsc_stzi(void *page, void *result, size_t size);
int chsc_sgib(u32 origin);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 2506bfdc91c7..6b7b6d5e3632 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -364,7 +364,7 @@ static inline int check_sync_clock(void)
* Apply clock delta to the global data structures.
* This is called once on the CPU that performed the clock sync.
*/
-static void clock_sync_global(unsigned long delta)
+static void clock_sync_global(long delta)
{
unsigned long now, adj;
struct ptff_qto qto;
@@ -400,7 +400,7 @@ static void clock_sync_global(unsigned long delta)
* Apply clock delta to the per-CPU data structures of this CPU.
* This is called for each online CPU after the call to clock_sync_global.
*/
-static void clock_sync_local(unsigned long delta)
+static void clock_sync_local(long delta)
{
/* Add the delta to the clock comparator. */
if (S390_lowcore.clock_comparator != clock_comparator_max) {
@@ -424,7 +424,7 @@ static void __init time_init_wq(void)
struct clock_sync_data {
atomic_t cpus;
int in_sync;
- unsigned long clock_delta;
+ long clock_delta;
};
/*
@@ -544,7 +544,7 @@ static int stpinfo_valid(void)
static int stp_sync_clock(void *data)
{
struct clock_sync_data *sync = data;
- u64 clock_delta, flags;
+ long clock_delta, flags;
static int first;
int rc;
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index 297fb399363c..620a917cd3a1 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -1255,7 +1255,7 @@ chsc_determine_css_characteristics(void)
EXPORT_SYMBOL_GPL(css_general_characteristics);
EXPORT_SYMBOL_GPL(css_chsc_characteristics);
-int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta)
+int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta)
{
struct {
struct chsc_header request;
@@ -1266,7 +1266,7 @@ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta)
unsigned int rsvd2[5];
struct chsc_header response;
unsigned int rsvd3[3];
- u64 clock_delta;
+ s64 clock_delta;
unsigned int rsvd4[2];
} *rr;
int rc;
--
2.35.2
From: Masami Hiramatsu <mhiramat(a)kernel.org>
commit 004e8dce9c5595697951f7cd0e9f66b35c92265e upstream
Prohibit probing on instruction which has XEN_EMULATE_PREFIX
or KVM's emulate prefix. Since that prefix is a marker for Xen
and KVM, if we modify the marker by kprobe's int3, that doesn't
work as expected.
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: x86(a)kernel.org
Cc: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Stefano Stabellini <sstabellini(a)kernel.org>
Cc: Andrew Cooper <andrew.cooper3(a)citrix.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: xen-devel(a)lists.xenproject.org
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Link: https://lkml.kernel.org/r/156777566048.25081.6296162369492175325.stgit@devn…
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Cc: stable(a)vger.kernel.org # 5.4.x
---
arch/x86/kernel/kprobes/core.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index c205d77d57da..3700dc94847c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -358,6 +358,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
kernel_insn_init(insn, dest, MAX_INSN_SIZE);
insn_get_length(insn);
+ /* We can not probe force emulate prefixed instruction */
+ if (insn_has_emulate_prefix(insn))
+ return 0;
+
/* Another subsystem puts a breakpoint, failed to recover */
if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
--
2.32.0
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
Motivation of backport:
-----------------------
1. The cfcdef5e30469 ("rcu: Allow rcu_do_batch() to dynamically adjust batch sizes")
broke the default behaviour of "offloading rcu callbacks" setup. In that scenario
after each callback the caller context was used to check if it has to be rescheduled
giving a CPU time for others. After that change an "offloaded" setup can switch to
time-based RCU callback processing, which can take too long for latency-sensitive workloads
and SCHED_FIFO processes, i.e. callbacks are invoked for a long time with
preemption kept off and without checking cond_resched().
2. Our devices which run Android and 5.10 kernel have some critical areas which
are sensitive to latency. It is a low latency audio, 8k video, UI stack and so on.
For example below is a trace that illustrates a delay of "irq/396-5-0072" RT task
to complete IRQ processing:
<snip>
rcuop/6-54 [000] d.h2 183.752989: irq_handler_entry: irq=85 name=i2c_geni
rcuop/6-54 [000] d.h5 183.753007: sched_waking: comm=irq/396-5-0072 pid=12675 prio=49 target_cpu=000
rcuop/6-54 [000] dNh6 183.753014: sched_wakeup: irq/396-5-0072:12675 [49] success=1 CPU:000
rcuop/6-54 [000] dNh2 183.753015: irq_handler_exit: irq=85 ret=handled
rcuop/6-54 [000] .N.. 183.753018: rcu_invoke_callback: rcu_preempt rhp=0xffffff88ffd440b0 func=__d_free.cfi_jt
rcuop/6-54 [000] .N.. 183.753020: rcu_invoke_callback: rcu_preempt rhp=0xffffff892ffd8400 func=inode_free_by_rcu.cfi_jt
rcuop/6-54 [000] .N.. 183.753021: rcu_invoke_callback: rcu_preempt rhp=0xffffff89327cd708 func=i_callback.cfi_jt
...
rcuop/6-54 [000] .N.. 183.755941: rcu_invoke_callback: rcu_preempt rhp=0xffffff8993c5a968 func=i_callback.cfi_jt
rcuop/6-54 [000] .N.. 183.755942: rcu_invoke_callback: rcu_preempt rhp=0xffffff8993c4bd20 func=__d_free.cfi_jt
rcuop/6-54 [000] dN.. 183.755944: rcu_batch_end: rcu_preempt CBs-invoked=2112 idle=>c<>c<>c<>c<
rcuop/6-54 [000] dN.. 183.755946: rcu_utilization: Start context switch
rcuop/6-54 [000] dN.. 183.755946: rcu_utilization: End context switch
rcuop/6-54 [000] d..2 183.755959: sched_switch: rcuop/6:54 [120] R ==> migration/0:16 [0]
...
migratio-16 [000] d..2 183.756021: sched_switch: migration/0:16 [0] S ==> irq/396-5-0072:12675 [49]
<snip>
The "irq/396-5-0072:12675" was delayed for ~3 milliseconds due to introduced side effect.
Please note, on our Android devices we get ~70 000 callbacks registered to be invoked by
the "rcuop/x" workers. This is during a 1-second time interval and regular handset usage.
Latencies bigger than 3 milliseconds affect our high-resolution audio streaming over the
LDAC/Bluetooth stack.
Two patches depend on each other.
Frederic Weisbecker (2):
rcu: [for 5.15 stable] Fix callbacks processing time limit retaining cond_resched()
rcu: [for 5.15 stable] Apply callbacks processing time limit only on softirq
kernel/rcu/tree.c | 31 ++++++++++++++++++-------------
1 file changed, 18 insertions(+), 13 deletions(-)
--
2.30.2
Commit 863771a28e27 ("powerpc/32s: Convert switch_mmu_context() to C")
moved the switch_mmu_context() to C. While in principle a good idea, it
meant that the function now uses the stack. The stack is not accessible
from real mode though.
So to keep calling the function, let's turn on MSR_DR while we call it.
That way, all pointer references to the stack are handled virtually.
In addition, make sure to save/restore r12 in an SPRG, as it may get
clobbered by the C function.
Reported-by: Matt Evans <matt(a)ozlabs.org>
Fixes: 863771a28e27 ("powerpc/32s: Convert switch_mmu_context() to C")
Signed-off-by: Alexander Graf <graf(a)amazon.com>
Cc: stable(a)vger.kernel.org # v5.14+
---
v1 -> v2:
- Save and restore R12, so that we don't touch volatile registers
while calling into C.
---
arch/powerpc/kvm/book3s_32_sr.S | 26 +++++++++++++++++++++-----
1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
index e3ab9df6cf19..1ce13e3ab072 100644
--- a/arch/powerpc/kvm/book3s_32_sr.S
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -122,11 +122,27 @@
/* 0x0 - 0xb */
- /* 'current->mm' needs to be in r4 */
- tophys(r4, r2)
- lwz r4, MM(r4)
- tophys(r4, r4)
- /* This only clobbers r0, r3, r4 and r5 */
+ /* switch_mmu_context() clobbers r12, rescue it */
+ SET_SCRATCH0(r12)
+
+ /* switch_mmu_context() needs paging, let's enable it */
+ mfmsr r9
+ ori r11, r9, MSR_DR
+ mtmsr r11
+ sync
+
+ /* Calling switch_mmu_context(<inv>, current->mm, <inv>); */
+ lwz r4, MM(r2)
bl switch_mmu_context
+ /* Disable paging again */
+ mfmsr r9
+ li r6, MSR_DR
+ andc r9, r9, r6
+ mtmsr r9
+ sync
+
+ /* restore r12 */
+ GET_SCRATCH0(r12)
+
.endm
--
2.28.0.394.ge197136389
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
Hello Greg, Sasha,
this is a backport of all the recent changes for the Aardvark PCIe controller
for 5.15.
These include
- memory leak fix in driver unbind
- more complete driver unbind
- fixes for MSI support
- add MSI-X support, which fixes support for some cards
- add ERR interrupt support (which we really missed when debugging)
I also included some small cosmetic changes - this will make it easier
to backport next fixes for the driver (there is another batch pending
on linux-pci).
Marek
Marek Behún (5):
PCI: aardvark: Make MSI irq_chip structures static driver structures
PCI: aardvark: Make msi_domain_info structure a static driver
structure
PCI: aardvark: Use dev_fwnode() instead of
of_node_to_fwnode(dev->of_node)
PCI: aardvark: Drop __maybe_unused from advk_pcie_disable_phy()
PCI: aardvark: Update comment about link going down after link-up
Pali Rohár (25):
PCI: pci-bridge-emul: Add description for class_revision field
PCI: pci-bridge-emul: Add definitions for missing capabilities
registers
PCI: aardvark: Add support for DEVCAP2, DEVCTL2, LNKCAP2 and LNKCTL2
registers on emulated bridge
PCI: aardvark: Clear all MSIs at setup
PCI: aardvark: Comment actions in driver remove method
PCI: aardvark: Disable bus mastering when unbinding driver
PCI: aardvark: Mask all interrupts when unbinding driver
PCI: aardvark: Fix memory leak in driver unbind
PCI: aardvark: Assert PERST# when unbinding driver
PCI: aardvark: Disable link training when unbinding driver
PCI: aardvark: Disable common PHY when unbinding driver
PCI: aardvark: Replace custom PCIE_CORE_INT_* macros with
PCI_INTERRUPT_*
PCI: aardvark: Rewrite IRQ code to chained IRQ handler
PCI: aardvark: Check return value of generic_handle_domain_irq() when
processing INTx IRQ
PCI: aardvark: Refactor unmasking summary MSI interrupt
PCI: aardvark: Add support for masking MSI interrupts
PCI: aardvark: Fix setting MSI address
PCI: aardvark: Enable MSI-X support
PCI: aardvark: Add support for ERR interrupt on emulated bridge
PCI: aardvark: Optimize writing PCI_EXP_RTCTL_PMEIE and
PCI_EXP_RTSTA_PME on emulated bridge
PCI: aardvark: Add support for PME interrupts
PCI: aardvark: Fix support for PME requester on emulated bridge
PCI: aardvark: Use separate INTA interrupt for emulated root bridge
PCI: aardvark: Remove irq_mask_ack() callback for INTx interrupts
PCI: aardvark: Don't mask irq when mapping
drivers/pci/controller/pci-aardvark.c | 428 +++++++++++++++++++-------
drivers/pci/pci-bridge-emul.c | 49 ++-
2 files changed, 371 insertions(+), 106 deletions(-)
--
2.35.1
Hello Greg, Sasha,
this is a backport of all the recent changes for the Aardvark PCIe controller
for 5.17.
These include
- fixes for MSI support
- add MSI-X support, which fixes support for some cards
- add ERR interrupt support (which we really missed when debugging)
As in series for 5.15, I included some small cosmetic changes - this
will make it easier to backport next fixes for the driver (there is
another batch pending on linux-pci).
Marek
Marek Behún (5):
PCI: aardvark: Make MSI irq_chip structures static driver structures
PCI: aardvark: Make msi_domain_info structure a static driver
structure
PCI: aardvark: Use dev_fwnode() instead of
of_node_to_fwnode(dev->of_node)
PCI: aardvark: Drop __maybe_unused from advk_pcie_disable_phy()
PCI: aardvark: Update comment about link going down after link-up
Pali Rohár (14):
PCI: aardvark: Replace custom PCIE_CORE_INT_* macros with
PCI_INTERRUPT_*
PCI: aardvark: Rewrite IRQ code to chained IRQ handler
PCI: aardvark: Check return value of generic_handle_domain_irq() when
processing INTx IRQ
PCI: aardvark: Refactor unmasking summary MSI interrupt
PCI: aardvark: Add support for masking MSI interrupts
PCI: aardvark: Fix setting MSI address
PCI: aardvark: Enable MSI-X support
PCI: aardvark: Add support for ERR interrupt on emulated bridge
PCI: aardvark: Optimize writing PCI_EXP_RTCTL_PMEIE and
PCI_EXP_RTSTA_PME on emulated bridge
PCI: aardvark: Add support for PME interrupts
PCI: aardvark: Fix support for PME requester on emulated bridge
PCI: aardvark: Use separate INTA interrupt for emulated root bridge
PCI: aardvark: Remove irq_mask_ack() callback for INTx interrupts
PCI: aardvark: Don't mask irq when mapping
drivers/pci/controller/pci-aardvark.c | 367 +++++++++++++++++++-------
1 file changed, 266 insertions(+), 101 deletions(-)
--
2.35.1
This is a backport of the patch 9f6dc6337610 ("dm: interlock pending dm_io
and dm_wait_for_bios_completion") for the kernel 4.9.
The bugs fixed by this patch can cause random crashing when reloading dm
table, so it is eligible for stable backport.
Note that the kernel 4.9 uses md->pending to count the number of
in-progress I/Os and md->pending is decremented after dm_stats_account_io,
so the race condition doesn't really exist there (except for missing
smp_rmb()).
The percpu variable md->pending_io is not needed in the stable kernels,
because md->pending counts the same value, so it is not backported.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Reviewed-by: Mike Snitzer <snitzer(a)kernel.org>
---
drivers/md/dm.c | 2 ++
1 file changed, 2 insertions(+)
Index: linux-stable/drivers/md/dm.c
===================================================================
--- linux-stable.orig/drivers/md/dm.c 2022-04-30 19:03:08.000000000 +0200
+++ linux-stable/drivers/md/dm.c 2022-04-30 19:03:46.000000000 +0200
@@ -2027,6 +2027,8 @@ static int dm_wait_for_completion(struct
}
finish_wait(&md->wait, &wait);
+ smp_rmb(); /* paired with atomic_dec_return in end_io_acct */
+
return r;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e914d8f00391520ecc4495dd0ca0124538ab7119 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan(a)kernel.org>
Date: Thu, 14 Apr 2022 19:13:46 -0700
Subject: [PATCH] mm: fix unexpected zeroed page mapping with zram swap
Two processes under CLONE_VM cloning, user process can be corrupted by
seeing zeroed page unexpectedly.
CPU A CPU B
do_swap_page do_swap_page
SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path
swap_readpage valid data
swap_slot_free_notify
delete zram entry
swap_readpage zeroed(invalid) data
pte_lock
map the *zero data* to userspace
pte_unlock
pte_lock
if (!pte_same)
goto out_nomap;
pte_unlock
return and next refault will
read zeroed data
The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't
increase the refcount of swap slot at copy_mm so it couldn't catch up
whether it's safe or not to discard data from backing device. In the
case, only the lock it could rely on to synchronize swap slot freeing is
page table lock. Thus, this patch gets rid of the swap_slot_free_notify
function. With this patch, CPU A will see correct data.
CPU A CPU B
do_swap_page do_swap_page
SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path
swap_readpage original data
pte_lock
map the original data
swap_free
swap_range_free
bd_disk->fops->swap_slot_free_notify
swap_readpage read zeroed data
pte_unlock
pte_lock
if (!pte_same)
goto out_nomap;
pte_unlock
return
on next refault will see mapped data by CPU B
The concern of the patch would increase memory consumption since it
could keep wasted memory with compressed form in zram as well as
uncompressed form in address space. However, most of cases of zram uses
no readahead and do_swap_page is followed by swap_free so it will free
the compressed form from in zram quickly.
Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: Ivan Babrou <ivan(a)cloudflare.com>
Tested-by: Ivan Babrou <ivan(a)cloudflare.com>
Signed-off-by: Minchan Kim <minchan(a)kernel.org>
Cc: Nitin Gupta <ngupta(a)vflare.org>
Cc: Sergey Senozhatsky <senozhatsky(a)chromium.org>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [4.14+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/page_io.c b/mm/page_io.c
index b417f000b49e..89fbf3cae30f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -51,54 +51,6 @@ void end_swap_bio_write(struct bio *bio)
bio_put(bio);
}
-static void swap_slot_free_notify(struct page *page)
-{
- struct swap_info_struct *sis;
- struct gendisk *disk;
- swp_entry_t entry;
-
- /*
- * There is no guarantee that the page is in swap cache - the software
- * suspend code (at least) uses end_swap_bio_read() against a non-
- * swapcache page. So we must check PG_swapcache before proceeding with
- * this optimization.
- */
- if (unlikely(!PageSwapCache(page)))
- return;
-
- sis = page_swap_info(page);
- if (data_race(!(sis->flags & SWP_BLKDEV)))
- return;
-
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- disk = sis->bdev->bd_disk;
- entry.val = page_private(page);
- if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
- unsigned long offset;
-
- offset = swp_offset(entry);
-
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
-}
-
static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -114,7 +66,6 @@ static void end_swap_bio_read(struct bio *bio)
}
SetPageUptodate(page);
- swap_slot_free_notify(page);
out:
unlock_page(page);
WRITE_ONCE(bio->bi_private, NULL);
@@ -394,11 +345,6 @@ int swap_readpage(struct page *page, bool synchronous)
if (sis->flags & SWP_SYNCHRONOUS_IO) {
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
- if (trylock_page(page)) {
- swap_slot_free_notify(page);
- unlock_page(page);
- }
-
count_vm_event(PSWPIN);
goto out;
}
[ Upstream commit cc8f7fe1f5eab010191aa4570f27641876fa1267 ]
Add __GFP_ZERO flag for alloc_page in function bio_copy_kern to initialize
the buffer of a bio.
Signed-off-by: Haimin Zhang <tcs.kernel(a)gmail.com>
Reviewed-by: Chaitanya Kulkarni <kch(a)nvidia.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Link: https://lore.kernel.org/r/20220216084038.15635-1-tcs.kernel@gmail.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
[nobelbarakat: Backported to 5.10: Manually added flag]
Signed-off-by: Nobel Barakat <nobelbarakat(a)google.com>
---
This change fixes a kernel info leak since it's possible for bio_copy_kern to
copy uninitialized memory into userspace.
For the backport, I had to manually add the __GFP_ZERO
flag since alloc_page on 5.10 uses a different parameter
than on 5.15. On 5.10, alloc_page is called with q->bounce_gfp
whereas on 5.15 it's called with GFP_NOIO.
Version 5.4 is also affected, and I intend to submit a backport
there as well.
block/blk-map.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk-map.c b/block/blk-map.c
index 21630dccac62..ede73f4f7014 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -488,7 +488,7 @@
if (bytes > len)
bytes = len;
- page = alloc_page(q->bounce_gfp | gfp_mask);
+ page = alloc_page(q->bounce_gfp | __GFP_ZERO | gfp_mask);
if (!page)
goto cleanup;
--
2.36.0.464.gb9c8b46e94-goog
commit 9995b408f17ff8c7f11bc725c8aa225ba3a63b1c upstream.
There are two reasons for addrconf_notify() to be called with NETDEV_DOWN:
either the network device is actually going down, or IPv6 was disabled
on the interface.
If either of them stays down while the other is toggled, we repeatedly
call the code for NETDEV_DOWN, including ipv6_mc_down(), while never
calling the corresponding ipv6_mc_up() in between. This will cause a
new entry in idev->mc_tomb to be allocated for each multicast group
the interface is subscribed to, which in turn leaks one struct ifmcaddr6
per nontrivial multicast group the interface is subscribed to.
The following reproducer will leak at least $n objects:
ip addr add ff2e::4242/32 dev eth0 autojoin
sysctl -w net.ipv6.conf.eth0.disable_ipv6=1
for i in $(seq 1 $n); do
ip link set up eth0; ip link set down eth0
done
Joining groups with IPV6_ADD_MEMBERSHIP (unprivileged) or setting the
sysctl net.ipv6.conf.eth0.forwarding to 1 (=> subscribing to ff02::2)
can also be used to create a nontrivial idev->mc_list, which will then
leak objects with the right up-down-sequence.
Based on both sources for NETDEV_DOWN events the interface IPv6 state
should be considered:
- not ready if the network interface is not ready OR IPv6 is disabled
for it
- ready if the network interface is ready AND IPv6 is enabled for it
The functions ipv6_mc_up() and ipv6_mc_down() should only be run when this
state changes.
Implement this by remembering when the IPv6 state is ready, and only
run ipv6_mc_down() if it actually changed from ready to not ready.
The other direction (not ready -> ready) already works correctly, as:
- the interface notification triggered codepath for NETDEV_UP /
NETDEV_CHANGE returns early if ipv6 is disabled, and
- the disable_ipv6=0 triggered codepath skips fully initializing the
interface as long as addrconf_link_ready(dev) returns false
- calling ipv6_mc_up() repeatedly does not leak anything
Fixes: 3ce62a84d53c ("ipv6: exit early in addrconf_notify() if IPv6 is disabled")
Signed-off-by: Johannes Nixdorf <j.nixdorf(a)avm.de>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
[jnixdorf: context updated for bpo to v4.9/v4.14]
Signed-off-by: Johannes Nixdorf <j.nixdorf(a)avm.de>
---
net/ipv6/addrconf.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6119ab33a56e..30ca73c78125 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3539,6 +3539,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct list_head del_list;
int _keep_addr;
bool keep_addr;
+ bool was_ready;
int state, i;
ASSERT_RTNL();
@@ -3602,7 +3603,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
addrconf_del_rs_timer(idev);
- /* Step 2: clear flags for stateless addrconf */
+ /* Step 2: clear flags for stateless addrconf, repeated down
+ * detection
+ */
+ was_ready = idev->if_flags & IF_READY;
if (!how)
idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
@@ -3689,7 +3693,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
if (how) {
ipv6_ac_destroy_dev(idev);
ipv6_mc_destroy_dev(idev);
- } else {
+ } else if (was_ready) {
ipv6_mc_down(idev);
}
--
2.36.0
Commit 863771a28e27 ("powerpc/32s: Convert switch_mmu_context() to C")
moved the switch_mmu_context() to C. While in principle a good idea, it
meant that the function now uses the stack. The stack is not accessible
from real mode though.
So to keep calling the function, let's turn on MSR_DR while we call it.
That way, all pointer references to the stack are handled virtually.
Reported-by: Matt Evans <matt(a)ozlabs.org>
Fixes: 863771a28e27 ("powerpc/32s: Convert switch_mmu_context() to C")
Signed-off-by: Alexander Graf <graf(a)amazon.com>
Cc: stable(a)vger.kernel.org
---
arch/powerpc/kvm/book3s_32_sr.S | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
index e3ab9df6cf19..bd4f798f7a46 100644
--- a/arch/powerpc/kvm/book3s_32_sr.S
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -122,11 +122,21 @@
/* 0x0 - 0xb */
- /* 'current->mm' needs to be in r4 */
- tophys(r4, r2)
- lwz r4, MM(r4)
- tophys(r4, r4)
- /* This only clobbers r0, r3, r4 and r5 */
+ /* switch_mmu_context() needs paging, let's enable it */
+ mfmsr r9
+ ori r11, r9, MSR_DR
+ mtmsr r11
+ sync
+
+ /* Calling switch_mmu_context(<inv>, current->mm, <inv>); */
+ lwz r4, MM(r2)
bl switch_mmu_context
+ /* Disable paging again */
+ mfmsr r9
+ li r6, MSR_DR
+ andc r9, r9, r6
+ mtmsr r9
+ sync
+
.endm
--
2.28.0.394.ge197136389
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7962c0896429af2a0e00ec6bc15d992536453b2d Mon Sep 17 00:00:00 2001
From: Helge Deller <deller(a)gmx.de>
Date: Sat, 7 May 2022 15:32:38 +0200
Subject: [PATCH] Revert "parisc: Mark sched_clock unstable only if clocks are
not syncronized"
This reverts commit d97180ad68bdb7ee10f327205a649bc2f558741d.
It triggers RCU stalls at boot with a 32-bit kernel.
Signed-off-by: Helge Deller <deller(a)gmx.de>
Noticed-by: John David Anglin <dave.anglin(a)bell.net>
Cc: stable(a)vger.kernel.org # v5.15+
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index b91cb45ffd4e..f005ddedb50e 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -161,6 +161,8 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_PA11
dma_ops_init();
#endif
+
+ clear_sched_clock_stable();
}
/*
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 95ee9e1a364b..19c31a72fe76 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -267,9 +267,6 @@ static int __init init_cr16_clocksource(void)
(cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
continue;
- /* mark sched_clock unstable */
- clear_sched_clock_stable();
-
clocksource_cr16.name = "cr16_unstable";
clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
clocksource_cr16.rating = 0;
@@ -277,6 +274,10 @@ static int __init init_cr16_clocksource(void)
}
}
+ /* XXX: We may want to mark sched_clock stable here if cr16 clocks are
+ * in sync:
+ * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */
+
/* register at clocksource framework */
clocksource_register_hz(&clocksource_cr16,
100 * PAGE0->mem_10msec);
Hello all,
Commit f00432063db1a0db484e85193eccc6845435b80e upstream ("SUNRPC:
Ensure we flush any closed sockets before xs_xprt_free()") fixes
CVE-2022-28893. Looking to cherry-pick this fix to versions 5.10+.
Thanks,
Dexter
Can we get 1f311c94aabdb419c28e3147bcc8ab89269f1a7e merged into the stable tree?
I have some compatibility issues on Realtek chips because of the missing initialization clocks.
Thanks!
Regards,
Christian
From: Ulf Hansson <ulf.hansson(a)linaro.org>
Sent: Monday, February 28, 2022 5:12 PM
To: Ricky WU
Cc: gregkh(a)linuxfoundation.org; kai.heng.feng(a)canonical.com; tommyhebb(a)gmail.com; linux-mmc(a)vger.kernel.org; linux-kernel(a)vger.kernel.org
Subject: Re: [PATCH] mmc: rtsx: add 74 Clocks in power on flow
On Tue, 22 Feb 2022 at 08:28, Ricky WU <ricky_wu(a)realtek.com> wrote:
>
> After 1ms stabilizing the voltage time
> add "Host provides at least 74 Clocks
> before issuing first command" that is
> spec definition
>
> Signed-off-by: Ricky Wu <ricky_wu(a)realtek.com>
> ---
> drivers/mmc/host/rtsx_pci_sdmmc.c | 7 +++++++
> 1 file changed, 7 insertions(+)
>
> diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c
> index 2a3f14afe9f8..e016d720e453 100644
> --- a/drivers/mmc/host/rtsx_pci_sdmmc.c
> +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
> @@ -940,10 +940,17 @@ static int sd_power_on(struct realtek_pci_sdmmc *host)
> if (err < 0)
> return err;
>
> + mdelay(1);
> +
> err = rtsx_pci_write_register(pcr, CARD_OE, SD_OUTPUT_EN, SD_OUTPUT_EN);
> if (err < 0)
> return err;
>
> + /* send init 74 clocks */
> + rtsx_pci_write_register(pcr, SD_BUS_STAT, SD_CLK_TOGGLE_EN, SD_CLK_TOGGLE_EN);
> + mdelay(5);
> + rtsx_pci_write_register(pcr, SD_BUS_STAT, SD_CLK_TOGGLE_EN, 0);
> +
> if (PCI_PID(pcr) == PID_5261) {
> /*
> * If test mode is set switch to SD Express mandatorily,
As you are probably aware, the mmc core uses three power states
(MMC_POWER_ON, MMC_POWER_UP and MMC_POWER_OFF) to manage the
initialization, while it invokes the ->set_ios() callback for the mmc
host driver. During these steps the core also tries to manage the
different delays that are needed according to the eMMC/SD specs. You
may have a look at mmc_power_up() in drivers/mmc/core/core.c. In the
rtsx case, MMC_POWER_ON and MMC_POWER_UP are treated as one single
step.
Moreover, it has turned out that some mmc HWs are actually controlling
these delays during the initialization themselves, which makes the
delays in the core superfluous. Therefore we have made the delays
configurable for host drivers. For DT based platforms, we have the DT
property "post-power-on-delay-ms" and for others, it's perfectly fine
to update host->power_delay_ms before calling mmc_add_host().
Would it be possible to take advantage of the above "features" from
the core, to avoid hard coded and superfluous delays?
Kind regards
Uffe
=
Hyperstone GmbH | Reichenaustr. 39a | 78467 Konstanz
Managing Director: Dr. Jan Peter Berns.
Commercial register of local courts: Freiburg HRB381782
We have run into an issue where a task gets stuck in
balance_dirty_pages_ratelimited() when performing I/O stress testing.
The reason we observed is that an I_DIRTY_PAGES inode with lots
of dirty pages is on the b_dirty_time list and standard background
writeback cannot write back the inode.
After studying the relevant code, the following scenario may lead
to the issue:
task1 task2
----- -----
fuse_flush
write_inode_now //in b_dirty_time
writeback_single_inode
__writeback_single_inode
fuse_write_end
filemap_dirty_folio
__xa_set_mark:PAGECACHE_TAG_DIRTY
lock inode->i_lock
if mapping tagged PAGECACHE_TAG_DIRTY
inode->i_state |= I_DIRTY_PAGES
unlock inode->i_lock
__mark_inode_dirty:I_DIRTY_PAGES
lock inode->i_lock
-was dirty,inode stays in
-b_dirty_time
unlock inode->i_lock
if(!(inode->i_state & I_DIRTY_ALL))
-not true,so nothing done
This patch moves the dirty inode to b_dirty list when the inode
currently is not queued in b_io or b_more_io list at the end of
writeback_single_inode.
Reviewed-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
CC: stable(a)vger.kernel.org
Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option")
Signed-off-by: Jing Xia <jing.xia(a)unisoc.com>
---
fs/fs-writeback.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591fe9cf1659..1fae0196292a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode,
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED) &&
+ (inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
--
2.17.1
Set "HCD_FLAG_DEFER_RH_REGISTER" to hcd->flags in xhci_run() to defer
registering primary roothub in usb_add_hcd() if xhci has two roothubs.
This will make sure both primary roothub and secondary roothub will be
registered along with the second HCD.
This is required for cold plugged USB devices to be detected in certain
PCIe USB cards (like Inateck USB card connected to AM64 EVM or J7200 EVM).
This patch has been added and reverted earlier as it triggered a race
in usb device enumeration.
That race is now fixed in 5.16-rc3, and in stable back to 5.4
commit 6cca13de26ee ("usb: hub: Fix locking issues with address0_mutex")
commit 6ae6dc22d2d1 ("usb: hub: Fix usb enumeration issue due to address0
race")
[minor rebase change, and commit message update -Mathias]
CC: stable(a)vger.kernel.org # 5.4+
Suggested-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Tested-by: Chris Chiu <chris.chiu(a)canonical.com>
Signed-off-by: Kishon Vijay Abraham I <kishon(a)ti.com>
---
drivers/usb/host/xhci.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 25b87e99b4dd..2be38d9de8df 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -696,6 +696,8 @@ int xhci_run(struct usb_hcd *hcd)
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
"Finished xhci_run for USB2 roothub");
+ set_bit(HCD_FLAG_DEFER_RH_REGISTER, &hcd->flags);
+
xhci_create_dbc_dev(xhci);
xhci_debugfs_init(xhci);
--
2.17.1
It has been observed with certain PCIe USB cards (like Inateck connected
to AM64 EVM or J7200 EVM) that as soon as the primary roothub is
registered, port status change is handled even before xHC is running
leading to cold plug USB devices not detected. For such cases, registering
both the root hubs along with the second HCD is required. Add support for
deferring roothub registration in usb_add_hcd(), so that both primary and
secondary roothubs are registered along with the second HCD.
This patch has been added and reverted earlier as it triggered a race
in usb device enumeration.
That race is now fixed in 5.16-rc3, and in stable back to 5.4
commit 6cca13de26ee ("usb: hub: Fix locking issues with address0_mutex")
commit 6ae6dc22d2d1 ("usb: hub: Fix usb enumeration issue due to address0
race")
CC: stable(a)vger.kernel.org # 5.4+
Suggested-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Tested-by: Chris Chiu <chris.chiu(a)canonical.com>
Acked-by: Alan Stern <stern(a)rowland.harvard.edu>
Signed-off-by: Kishon Vijay Abraham I <kishon(a)ti.com>
---
drivers/usb/core/hcd.c | 29 +++++++++++++++++++++++------
include/linux/usb/hcd.h | 2 ++
2 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index d9712c2602af..06eea8848ccc 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -2816,6 +2816,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
{
int retval;
struct usb_device *rhdev;
+ struct usb_hcd *shared_hcd;
if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) {
hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev);
@@ -2976,13 +2977,26 @@ int usb_add_hcd(struct usb_hcd *hcd,
goto err_hcd_driver_start;
}
+ /* starting here, usbcore will pay attention to the shared HCD roothub */
+ shared_hcd = hcd->shared_hcd;
+ if (!usb_hcd_is_primary_hcd(hcd) && shared_hcd && HCD_DEFER_RH_REGISTER(shared_hcd)) {
+ retval = register_root_hub(shared_hcd);
+ if (retval != 0)
+ goto err_register_root_hub;
+
+ if (shared_hcd->uses_new_polling && HCD_POLL_RH(shared_hcd))
+ usb_hcd_poll_rh_status(shared_hcd);
+ }
+
/* starting here, usbcore will pay attention to this root hub */
- retval = register_root_hub(hcd);
- if (retval != 0)
- goto err_register_root_hub;
+ if (!HCD_DEFER_RH_REGISTER(hcd)) {
+ retval = register_root_hub(hcd);
+ if (retval != 0)
+ goto err_register_root_hub;
- if (hcd->uses_new_polling && HCD_POLL_RH(hcd))
- usb_hcd_poll_rh_status(hcd);
+ if (hcd->uses_new_polling && HCD_POLL_RH(hcd))
+ usb_hcd_poll_rh_status(hcd);
+ }
return retval;
@@ -3020,6 +3034,7 @@ EXPORT_SYMBOL_GPL(usb_add_hcd);
void usb_remove_hcd(struct usb_hcd *hcd)
{
struct usb_device *rhdev = hcd->self.root_hub;
+ bool rh_registered;
dev_info(hcd->self.controller, "remove, state %x\n", hcd->state);
@@ -3030,6 +3045,7 @@ void usb_remove_hcd(struct usb_hcd *hcd)
dev_dbg(hcd->self.controller, "roothub graceful disconnect\n");
spin_lock_irq (&hcd_root_hub_lock);
+ rh_registered = hcd->rh_registered;
hcd->rh_registered = 0;
spin_unlock_irq (&hcd_root_hub_lock);
@@ -3039,7 +3055,8 @@ void usb_remove_hcd(struct usb_hcd *hcd)
cancel_work_sync(&hcd->died_work);
mutex_lock(&usb_bus_idr_lock);
- usb_disconnect(&rhdev); /* Sets rhdev to NULL */
+ if (rh_registered)
+ usb_disconnect(&rhdev); /* Sets rhdev to NULL */
mutex_unlock(&usb_bus_idr_lock);
/*
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 548a028f2dab..2c1fc9212cf2 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -124,6 +124,7 @@ struct usb_hcd {
#define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */
#define HCD_FLAG_DEAD 6 /* controller has died? */
#define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */
+#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */
/* The flags can be tested using these macros; they are likely to
* be slightly faster than test_bit().
@@ -134,6 +135,7 @@ struct usb_hcd {
#define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING))
#define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING))
#define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD))
+#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER))
/*
* Specifies if interfaces are authorized by default
--
2.17.1